Class: SClust::KMean::DocumentClusterer

Inherits:
Clusterer
  • Object
show all
Defined in:
lib/sclust/kmean/doccluster.rb

Overview

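A document clusterer that defines the << operator so that Document objects (or any object, which is wrapped in a Document built from its string form) can be added to its document collection.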

Instance Attribute Summary collapse

Attributes inherited from Clusterer

#cluster_count, #clusters, #iterations, #logger, #points

Instance Method Summary collapse

Methods inherited from Clusterer

#+, #assign_all_points, #cluster, #each_cluster, #get_max_terms, #rebuild_document_collection

Constructor Details

#initialize ⇒ DocumentClusterer

Returns a new instance of DocumentClusterer.



38
39
40
41
# File 'lib/sclust/kmean/doccluster.rb', line 38

# Builds a new clusterer: first creates an empty
# SClust::Util::DocumentCollection to hold added documents, then invokes
# the parent initializer with no arguments.
def initialize
    @document_collection = SClust::Util::DocumentCollection.new
    # Explicit empty parentheses: forward no arguments to the parent.
    super()
end

Instance Attribute Details

#document_collection ⇒ Object (readonly)

Returns the value of attribute document_collection.



36
37
38
# File 'lib/sclust/kmean/doccluster.rb', line 36

# Read-only accessor: returns the object stored in @document_collection.
def document_collection; @document_collection; end

Instance Method Details

#<<(d) ⇒ Object



43
44
45
46
47
48
49
# File 'lib/sclust/kmean/doccluster.rb', line 43

# Appends +d+ to the document collection.
#
# A SClust::Util::Document is appended as-is; any other object is first
# wrapped in a new SClust::Util::Document built from +d.to_s+.
#
# Returns whatever the collection's own << returns.
def <<(d)
    document = d.is_a?(SClust::Util::Document) ? d : SClust::Util::Document.new(d.to_s)
    @document_collection << document
end

#initialize_points ⇒ Object

This must be run to convert the document collection into the points in a cluster.



53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# File 'lib/sclust/kmean/doccluster.rb', line 53

# Converts every document in the collection into a ClusterPoint and
# stores the resulting list through the +points=+ writer.
#
# For each document a sparse vector (default value 0) is filled with one
# entry per term of the document: doc.tf(term) minus the collection's
# idf(term).
# NOTE(review): the weight is a difference, not a product -- presumably
# tf/idf are log-scaled; confirm against Document#tf and
# DocumentCollection#idf.
def initialize_points
    collection = @document_collection
    built_points = []

    collection.doclist.each do |document|
        weights = SClust::Util::SparseVector.new(0)

        # One weight per term that occurs in this document.
        document.terms.each_key { |term| weights[term] = document.tf(term) - collection.idf(term) }

        # The point keeps a reference to the document it was built from.
        built_points.push(ClusterPoint.new(weights, document))
    end

    self.points = built_points
end

#topics=(n) ⇒ Object



74
75
76
77
78
79
# File 'lib/sclust/kmean/doccluster.rb', line 74

# Sets the topic count by delegating to the parent's +topics=+, after
# making sure points exist: if +points+ is nil/false or has no elements,
# +initialize_points+ is run first to build them from the documents.
def topics=(n)
    current = self.points
    needs_points = !current || current.size <= 0
    initialize_points if needs_points
    super(n)
end