Class: SClust::KMean::Clusterer

Inherits:
Object
  • Object
show all
Defined in:
lib/sclust/kmean/cluster.rb

Direct Known Subclasses

DocumentClusterer

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(points = []) ⇒ Clusterer

Optionally takes the initial collection of points to cluster (defaults to an empty array).



159
160
161
162
163
164
165
166
167
168
169
# File 'lib/sclust/kmean/cluster.rb', line 159

# Set up a clusterer over +points+ with no clusters built yet.
#
# points - the collection of points to cluster (defaults to a new empty Array).
def initialize(points=[])
    # Clustering state: the input points, the (initially empty) cluster list,
    # the recorded cluster count and the number of assignment passes.
    @points        = points
    @clusters      = []
    @cluster_count = 0
    @iterations    = 3

    # Logger named 'Clusterer' with the 'default' outputter added to it.
    @logger = Log4r::Logger.new('Clusterer')
    @logger.add('default')

    # Clusters are not built eagerly here; #cluster builds them when none exist.
    #build_empty_clusters('crp')
end

Instance Attribute Details

#cluster_countObject

Returns the value of attribute cluster_count.



155
156
157
# File 'lib/sclust/kmean/cluster.rb', line 155

# Reader for @cluster_count: the cluster count as last recorded by
# #initialize (which sets it to 0) or by #topics=.
def cluster_count
  @cluster_count
end

#clustersObject

Returns the value of attribute clusters.



155
156
157
# File 'lib/sclust/kmean/cluster.rb', line 155

# Reader for @clusters: the list of clusters, empty after #initialize and
# rebuilt from scratch by #topics=.
def clusters
  @clusters
end

#iterationsObject

Returns the value of attribute iterations.



155
156
157
# File 'lib/sclust/kmean/cluster.rb', line 155

# Reader for @iterations: how many assignment passes #cluster performs
# (#initialize sets it to 3).
def iterations
  @iterations
end

#loggerObject

Returns the value of attribute logger.



155
156
157
# File 'lib/sclust/kmean/cluster.rb', line 155

# Reader for @logger: the Log4r::Logger named 'Clusterer' created in
# #initialize.
def logger
  @logger
end

#pointsObject

Returns the value of attribute points.



155
156
157
# File 'lib/sclust/kmean/cluster.rb', line 155

# Reader for @points: the collection given to #initialize, plus anything
# appended through #+.
def points
  @points
end

Instance Method Details

#+(point) ⇒ Object



207
208
209
# File 'lib/sclust/kmean/cluster.rb', line 207

# Append +point+ to the collection of points to be clustered.
#
# The point is only stored; it is not assigned to any cluster here.
# Returns whatever @points#<< returns (the points collection itself when
# @points is an Array), not the clusterer.
def +(point)
    @points << point
end

#assign_all_pointsObject



215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
# File 'lib/sclust/kmean/cluster.rb', line 215

# Assign every point in @points to the cluster whose center is closest to it.
#
# For each point the distance to every cluster center is obtained through
# center.distance(pt). Clusters whose distance is nil are skipped. A point
# that already belongs to a different cluster is removed from it (cluster - pt)
# before being added to the new one (cluster + pt); a point already in the
# closest cluster is left untouched.
#
# When there are no clusters this is a no-op. Previously rand(0) produced a
# Float, @clusters[float] produced nil, and the call died with NoMethodError.
#
# Returns @points.
def assign_all_points
    return @points if @clusters.empty?

    @points.each do |pt|

        #@logger.debug("Assigning point #{pt}.")

        # Randomize the first selection to ensure that in the case where there are
        # many centers that are close, each has a (statistically) equal chance of
        # getting the document, thus moving the center, changing the center,
        # and perhaps matching other documents better because of more terms.
        nearest      = @clusters[rand(@clusters.length)]
        nearest_dist = nearest.center.distance(pt)

        @clusters.each do |candidate|
            dist = candidate.center.distance(pt)

            # No usable distance for this candidate: it can never win.
            next if dist.nil?

            # Take the candidate when the current choice has no distance at
            # all, or when the candidate is strictly closer.
            if nearest_dist.nil? || dist < nearest_dist
                nearest      = candidate
                nearest_dist = dist
            end
        end

        current = pt.cluster

        # Already a member of the closest cluster: nothing to move.
        next if current && current.equal?(nearest)

        # Detach from the previous cluster (if any), then attach to the new one.
        current - pt if current
        nearest + pt
    end
end

#clusterObject



264
265
266
267
268
269
270
271
272
273
# File 'lib/sclust/kmean/cluster.rb', line 264

# Run the clustering: make sure clusters exist, then perform +iterations+
# assignment passes over all points.
#
# If @clusters is unset or empty, clusters are first built via
# build_empty_clusters('crp'). Each pass logs its progress at info level
# and then calls #assign_all_points.
#
# Returns the result of iterations.times (the iteration count).
def cluster
    # Lazily initialize the clusters when there are none yet.
    if !@clusters || @clusters.size <= 0
        self.build_empty_clusters('crp')
    end

    iterations.times do |pass|
        @logger.info("Starting iteration #{pass + 1} of #{iterations}.")
        assign_all_points
    end
end

#each_cluster(&c) ⇒ Object



211
212
213
# File 'lib/sclust/kmean/cluster.rb', line 211

# Yield each cluster in @clusters, in order, to the given block.
#
# The block is forwarded straight to @clusters.each. Called without a block
# this now returns an Enumerator over the clusters (as Array#each does)
# instead of raising LocalJumpError on a non-empty cluster list.
#
# Returns @clusters when a block is given.
def each_cluster(&c)
    @clusters.each(&c)
end

#get_max_terms(n = 3) ⇒ Object



275
276
277
278
279
280
281
282
283
# File 'lib/sclust/kmean/cluster.rb', line 275

# Collect the result of get_max_terms(n) from every cluster.
#
# n - passed through unchanged to each cluster's get_max_terms (default 3).
#
# Returns an Array with one entry per cluster, in #each_cluster order.
def get_max_terms(n=3)
    [].tap do |terms_per_cluster|
        each_cluster { |cluster| terms_per_cluster << cluster.get_max_terms(n) }
    end
end

#rebuild_document_collectionObject

If you edit the document collection behind the scenes in an LDA clusterer, you need to run this to avoid terms with a count of 0 showing up. However, K-Mean has so little document-related state that this method does nothing and is only here for API compatibility. We would like LDA and KMean implementations that are drop-in replacements.



289
290
# File 'lib/sclust/kmean/cluster.rb', line 289

# Intentional no-op: takes no arguments, touches no state, returns nil.
def rebuild_document_collection
    # Nothing to rebuild.
end

#topics=(process) ⇒ Object

Drop all existing clusters and recreate them using the given method. If the given method is an integer, then that many clusters are created and the centers are randomly chosen from the documents contained in the @points attribute. If it is the string "crp", then the Chinese Restaurant Process is used: each document is considered in turn and, stochastically, a new cluster is created with that document as its center, with a probability inversely proportional to the number of documents considered so far.



177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
# File 'lib/sclust/kmean/cluster.rb', line 177

# Drop all existing clusters and rebuild them using the given process.
#
# process - selects how the new clusters are seeded:
#           * Integer: that many clusters are created, each centered on a
#             point picked at random from @points (the same point may be
#             picked more than once). @cluster_count is set to the integer.
#           * the String "crp": the points are visited in order and the
#             i-th point (counting from 1) becomes the center of a new
#             cluster when rand(i) == 0, so the first point always seeds a
#             cluster. @cluster_count is set to the number of clusters
#             built.
#           Any other value only empties @clusters.
#
# Cluster counting for "crp" used to reset @cluster_count inside the loop,
# so the stored and logged count was 0 or 1 regardless of how many clusters
# were built; the count is now taken once the loop has finished.
def topics=(process)

    @clusters = []

    if process.is_a?(Integer)
        @logger.info("Building cluster of constant cluster count #{process}.")
        @cluster_count = process
        @cluster_count.times { @clusters << Cluster.new(@points[rand(@points.length)]) }

    elsif process.is_a?(String) && process == "crp"
        @logger.info("Building clusters using CRP.")

        @points.each_with_index do |pt, seen|
            # seen points precede this one, so rand(seen + 1) is rand(i)
            # for the i-th point (counting from 1).
            @clusters << Cluster.new(pt) if rand(seen + 1) == 0
        end

        @cluster_count = @clusters.length

        @logger.info("Built #{@cluster_count} clusters.")
    end
end