Class: KMeansClusterer

Inherits:

Object

Object
KMeansClusterer

show all

Defined in: lib/kmeans-clusterer.rb

Defined Under Namespace

Modules: Distance, Scaler, Utils

Classes: Cluster, Point

Constant Summary collapse

# Maps the :float_precision option (:double or :single) to the NArray type
# constant that is looked up in .run and #initialize.
TYPECODE =

{ double: NArray::DFLOAT, single: NArray::SFLOAT }

# Default options for .run; caller-supplied options are merged over these.
DEFAULT_OPTS =

{ scale_data: false, runs: 10, log: false, init: :kmpp, float_precision: :double, max_iter: 300 }

Instance Attribute Summary collapse

#centroids ⇒ Object readonly

Returns the value of attribute centroids.
#clusters ⇒ Object readonly

Returns the value of attribute clusters.
#data ⇒ Object readonly

Returns the value of attribute data.
#distances ⇒ Object readonly

Returns the value of attribute distances.
#error ⇒ Object readonly

Returns the value of attribute error.
#iterations ⇒ Object readonly

Returns the value of attribute iterations.
#k ⇒ Object readonly

Returns the value of attribute k.
#mean ⇒ Object readonly

Returns the value of attribute mean.
#points ⇒ Object readonly

Returns the value of attribute points.
#runtime ⇒ Object readonly

Returns the value of attribute runtime.
#std ⇒ Object readonly

Returns the value of attribute std.

Class Method Summary collapse

.run(k, data, opts = {}) ⇒ Object

Instance Method Summary collapse

#finish ⇒ Object
#initialize(opts = {}) ⇒ KMeansClusterer constructor

A new instance of KMeansClusterer.
#inspect ⇒ Object
#predict(data) ⇒ Object
#run ⇒ Object
#silhouette ⇒ Object
#sorted_clusters(point = origin) ⇒ Object

Constructor Details

#initialize(opts = {}) ⇒ `KMeansClusterer`

Returns a new instance of KMeansClusterer.

# File 'lib/kmeans-clusterer.rb', line 162

# Builds a clusterer from an options hash and initialises its centroids.
#
# Options read here: :k, :init, :labels, :row_norms, :data, :mean, :std,
# :scale_data, :float_precision and :max_iter. No defaults are merged in,
# except that :labels falls back to an empty array and :float_precision
# falls back to :double.
#
# @param opts [Hash] clustering options (see above)
def initialize(opts = {})
  # Clustering parameters.
  @k = opts[:k]
  @init = opts[:init]

  # Per-point metadata supplied by the caller.
  @labels = opts[:labels] || []
  @row_norms = opts[:row_norms]

  # Data matrix; the number of points is taken from its second dimension.
  @data = opts[:data]
  if @data
    @points_count = @data.shape[1]
  else
    @points_count = 0
  end

  # Scaling statistics are only set when they were provided.
  mean = opts[:mean]
  std = opts[:std]
  @mean = Utils.ensure_narray(mean) if mean
  @std = Utils.ensure_narray(std) if std
  @scale_data = opts[:scale_data]

  precision = opts[:float_precision] || :double
  @typecode = TYPECODE[precision]
  @max_iter = opts[:max_iter]

  init_centroids
end

Instance Attribute Details

#centroids ⇒ `Object` (readonly)

Returns the value of attribute centroids.



159
160
161

# File 'lib/kmeans-clusterer.rb', line 159

# Returns the centroid matrix held in @centroids. Elsewhere in this class
# cluster i's centroid is read as @centroids[true, i].
def centroids
  @centroids
end

#clusters ⇒ `Object` (readonly)

Returns the value of attribute clusters.



159
160
161

# File 'lib/kmeans-clusterer.rb', line 159

# Returns @clusters, the array of Cluster objects that #finish builds
# (one per cluster id).
def clusters
  @clusters
end

#data ⇒ `Object` (readonly)

Returns the value of attribute data.



159
160
161

# File 'lib/kmeans-clusterer.rb', line 159

# Returns @data, the data matrix given to the constructor as opts[:data].
def data
  @data
end

#distances ⇒ `Object` (readonly)

Returns the value of attribute distances.



159
160
161

# File 'lib/kmeans-clusterer.rb', line 159

# Returns @distances, the result of Distance.euclidean between the centroids
# and the data as computed in the most recent iteration of #run.
def distances
  @distances
end

#error ⇒ `Object` (readonly)

Returns the value of attribute error.



159
160
161

# File 'lib/kmeans-clusterer.rb', line 159

# Returns @error, which #run sets to the sum of squared distances from each
# point to its closest centroid.
def error
  @error
end

#iterations ⇒ `Object` (readonly)

Returns the value of attribute iterations.



159
160
161

# File 'lib/kmeans-clusterer.rb', line 159

# Returns @iterations, the number of loop iterations performed by #run.
def iterations
  @iterations
end

#k ⇒ `Object` (readonly)

Returns the value of attribute k.



159
160
161

# File 'lib/kmeans-clusterer.rb', line 159

# Returns @k, the number of clusters given as opts[:k].
def k
  @k
end

#mean ⇒ `Object` (readonly)

Returns the value of attribute mean.



159
160
161

# File 'lib/kmeans-clusterer.rb', line 159

# Returns @mean, the opts[:mean] value converted by Utils.ensure_narray.
# It is only set when opts[:mean] was given (.run supplies it when
# :scale_data is enabled).
def mean
  @mean
end

#points ⇒ `Object` (readonly)

Returns the value of attribute points.



159
160
161

# File 'lib/kmeans-clusterer.rb', line 159

# Returns @points, the array of Point objects that #finish builds
# (one per data point).
def points
  @points
end

#runtime ⇒ `Object` (readonly)

Returns the value of attribute runtime.



159
160
161

# File 'lib/kmeans-clusterer.rb', line 159

# Returns @runtime, the elapsed time in seconds measured by #run.
def runtime
  @runtime
end

#std ⇒ `Object` (readonly)

Returns the value of attribute std.



159
160
161

# File 'lib/kmeans-clusterer.rb', line 159

# Returns @std, the opts[:std] value converted by Utils.ensure_narray.
# It is only set when opts[:std] was given (.run supplies it when
# :scale_data is enabled).
def std
  @std
end

Class Method Details

.run(k, data, opts = {}) ⇒ `Object`

# File 'lib/kmeans-clusterer.rb', line 124

# Clusters +data+ into +k+ groups several times and returns the run with the
# lowest error, after calling #finish on it.
#
# @param k [Integer] number of clusters
# @param data [Object] input data; converted with Utils.ensure_matrix
# @param opts [Hash] options merged over DEFAULT_OPTS (:scale_data, :runs,
#   :log, :init, :float_precision, :max_iter, ...)
# @return [KMeansClusterer] the best run, finished
# @raise [ArgumentError] if :runs is not a positive Integer
def self.run(k, data, opts = {})
  opts = DEFAULT_OPTS.merge(opts)

  # Without at least one run there is no result to return; fail early with a
  # clear message instead of a NoMethodError on nil further down.
  runs = opts[:runs]
  unless runs.is_a?(Integer) && runs > 0
    raise ArgumentError, ":runs must be a positive Integer (got #{runs.inspect})"
  end

  opts[:k] = k
  typecode = TYPECODE[opts[:float_precision]]

  data = Utils.ensure_matrix(data, typecode)

  if opts[:scale_data]
    data, mean, std = Scaler.scale(data, nil, nil, typecode)
    # Keep the scaling statistics so #predict can scale new data the same way.
    opts[:mean] = mean
    opts[:std] = std
  end

  opts[:data] = data
  # Computed once here and shared by every run.
  opts[:row_norms] = Scaler.row_norms(data)

  bestrun = nil

  runs.times do |i|
    km = new(opts).run

    if opts[:log]
      puts "[#{i + 1}] #{km.iterations} iter\t#{km.runtime.round(2)}s\t#{km.error.round(2)} err"
    end

    # Keep the run with the smallest error; the first run wins ties.
    bestrun = km if bestrun.nil? || km.error < bestrun.error
  end

  bestrun.finish
end

Instance Method Details

#finish ⇒ `Object`

# File 'lib/kmeans-clusterer.rb', line 222

# Materialises the result of #run into Cluster and Point objects.
#
# Builds one Cluster per centroid, one Point per data column (attached to
# the cluster it was assigned to), then sorts every cluster's points by
# their centroid_distance.
#
# @return [KMeansClusterer] self
def finish
  @clusters = @k.times.map { |cluster_id|
    center = NArray.ref(@centroids[true, cluster_id].flatten)
    # The centroid is wrapped in a Point with id -1 and no distances/label.
    Cluster.new(cluster_id, Point.new(-1, center, nil, nil))
  }

  @points = @points_count.times.map { |point_id|
    coords = NArray.ref(@data[true, point_id].flatten)
    member = Point.new(point_id, coords, @distances[point_id, true], @labels[point_id])
    owner = @clusters[@cluster_assigns[point_id]]
    owner << member
    member
  }

  @clusters.each { |cluster| cluster.points.sort_by!(&:centroid_distance) }

  self
end

#inspect ⇒ `Object`



281
282
283

# File 'lib/kmeans-clusterer.rb', line 281

# Short human-readable summary: class name, k, iteration count, error and
# runtime.
#
# @return [String]
def inspect
  format(
    "#<%s k:%s iterations:%s error:%s runtime:%s>",
    self.class.name, @k, @iterations, @error, @runtime
  )
end

#predict(data) ⇒ `Object`

# File 'lib/kmeans-clusterer.rb', line 243

# Assigns each point of +data+ to its closest centroid.
#
# The data is converted with Utils.ensure_matrix and, when scaling is
# enabled, scaled with the stored mean and std before distances are taken.
#
# @param data [Object] points to classify
# @return [Array] for each point, the index of the closest cluster
def predict(data)
  matrix = Utils.ensure_matrix(data, @typecode)
  if @scale_data
    # Only the scaled data is needed; the returned mean/std are discarded.
    matrix, = Scaler.scale(matrix, @mean, @std, @typecode)
  end

  to_centroids = Distance.euclidean(@centroids, matrix)

  # The first entry of the sort index is the position of the smallest distance.
  (0...matrix.shape[1]).map { |column| to_centroids[column, true].sort_index[0] }
end

#run ⇒ `Object`

# File 'lib/kmeans-clusterer.rb', line 179

# Performs one clustering run starting from the current centroids.
#
# Each iteration computes the distances between centroids and data, assigns
# every point to its closest centroid, then moves each non-empty cluster's
# centroid to the mean of its points. The loop stops once no centroid moved
# by 0.001 or more, or after @max_iter iterations.
#
# Sets @iterations, @distances, @cluster_assigns, @error (sum of squared
# distances to the closest centroid) and @runtime (elapsed seconds).
#
# @return [KMeansClusterer] self
def run
  # A monotonic clock is used so the measured runtime is not affected by
  # system clock adjustments made while the run is in progress.
  started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  @iterations, @runtime = 0, 0
  @cluster_assigns = NArray.int(@points_count)
  min_distances = NArray.new(@typecode, @points_count)

  loop do
    @iterations += 1

    min_distances.fill! Float::INFINITY
    @distances = Distance.euclidean(@centroids, @data, @row_norms)

    # Assignment step: for every cluster, claim the points that are closer
    # to it than to any cluster examined so far.
    @k.times do |cluster_id|
      dist = NArray.ref @distances[true, cluster_id].flatten
      mask = dist < min_distances
      @cluster_assigns[mask] = cluster_id
      min_distances[mask] = dist[mask]
    end

    max_move = 0

    # Update step: move each centroid to the mean of its assigned points and
    # remember the largest displacement.
    @k.times do |cluster_id|
      centroid = NArray.ref(@centroids[true, cluster_id].flatten)
      point_ids = @cluster_assigns.eq(cluster_id).where

      # A cluster with no points keeps its previous centroid.
      unless point_ids.empty?
        points = @data[true, point_ids]
        newcenter = points.mean(1)
        move = Distance.euclidean(centroid, newcenter)
        max_move = move if move > max_move
        @centroids[true, cluster_id] = newcenter
      end
    end

    break if max_move < 0.001 # i.e., no movement
    break if @iterations >= @max_iter
  end

  @error = (min_distances**2).sum
  @runtime = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at
  self
end

#silhouette ⇒ `Object`

# File 'lib/kmeans-clusterer.rb', line 259

# Mean silhouette score over all points.
#
# For each point, +a+ is its dissimilarity to its own cluster and +b+ the
# smallest dissimilarity to any other cluster; the point's score is
# (b - a) / max(a, b). A point for which both are zero scores 0.0 rather
# than producing NaN (0.0 / 0.0), which would otherwise poison the mean.
#
# Reads @points and @clusters, which are built by #finish.
#
# @return [Float] the mean score, or 1.0 when there are fewer than 2 clusters
def silhouette
  return 1.0 if @k < 2

  # calculate all point-to-point distances at once
  # uses more memory, but much faster
  point_distances = Distance.euclidean(@data, @data)

  scores = @points.map do |point|
    own_id = point.cluster.id
    dissimilarities = @clusters.map do |cluster|
      dissimilarity(point.id, cluster.id, point_distances)
    end

    a = dissimilarities[own_id]
    # set to Infinity so we can pick next closest via min()
    dissimilarities[own_id] = Float::INFINITY
    b = dissimilarities.min

    widest = [a, b].max
    widest.zero? ? 0.0 : (b - a) / widest
  end

  scores.reduce(:+) / scores.length # mean score for all points
end

#sorted_clusters(point = origin) ⇒ `Object`

# File 'lib/kmeans-clusterer.rb', line 252

# Returns the clusters ordered by the distance between their centroid and
# +point+, closest first.
#
# @param point [Point, NArray, Object] reference location; a Point
#   contributes its data, and anything that is not an NArray is cast with
#   NArray.cast. Defaults to the value of +origin+.
# @return [Array] @clusters sorted by centroid distance to +point+
def sorted_clusters(point = origin)
  target = point.is_a?(Point) ? point.data : point
  target = NArray.cast(target, @typecode) unless target.is_a?(NArray)

  centroid_distances = Distance.euclidean(NArray.ref(@centroids), target)

  @clusters.sort_by.with_index { |_cluster, position| centroid_distances[position] }
end

Class: KMeansClusterer

Defined Under Namespace

Constant Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(opts = {}) ⇒ KMeansClusterer

Instance Attribute Details

#centroids ⇒ Object (readonly)

#clusters ⇒ Object (readonly)

#data ⇒ Object (readonly)

#distances ⇒ Object (readonly)

#error ⇒ Object (readonly)

#iterations ⇒ Object (readonly)

#k ⇒ Object (readonly)

#mean ⇒ Object (readonly)

#points ⇒ Object (readonly)

#runtime ⇒ Object (readonly)

#std ⇒ Object (readonly)