Class: KMeansClusterer

Inherits: Object

Inheritance hierarchy: Object → KMeansClusterer

Defined in: lib/kmeans-clusterer.rb

Defined Under Namespace

Modules: Scaler

Classes: Cluster, Point

Constant Summary collapse

# Maps the :float_precision option to the NArray typecode that
# KMeansClusterer.run passes to NMatrix.cast. Frozen so the shared lookup
# table cannot be modified at runtime.
TYPECODE = { double: NArray::DFLOAT, single: NArray::SFLOAT }.freeze

# Option defaults that KMeansClusterer.run merges caller options over.
# Frozen so one caller cannot change the defaults seen by every later call;
# run only ever calls #merge on it, which returns a new hash.
DEFAULT_OPTS = { scale_data: false, runs: 10, log: false, init: :kmpp, float_precision: :double }.freeze

Instance Attribute Summary collapse

#clusters ⇒ Object readonly

Returns the value of attribute clusters.
#error ⇒ Object readonly

Returns the value of attribute error.
#iterations ⇒ Object readonly

Returns the value of attribute iterations.
#k ⇒ Object readonly

Returns the value of attribute k.
#mean ⇒ Object readonly

Returns the value of attribute mean.
#points ⇒ Object readonly

Returns the value of attribute points.
#runtime ⇒ Object readonly

Returns the value of attribute runtime.
#std ⇒ Object readonly

Returns the value of attribute std.

Class Method Summary collapse

.run(k, data, opts = {}) ⇒ Object

Instance Method Summary collapse

#finish ⇒ Object
#initialize(opts = {}) ⇒ KMeansClusterer constructor

A new instance of KMeansClusterer.
#inspect ⇒ Object
#origin ⇒ Object
#predict(data) ⇒ Object
#run ⇒ Object
#silhouette ⇒ Object (also: #silhouette_score)
#sorted_clusters(point = origin) ⇒ Object

Constructor Details

#initialize(opts = {}) ⇒ `KMeansClusterer`

Returns a new instance of KMeansClusterer.

# File 'lib/kmeans-clusterer.rb', line 112

# Sets up a clusterer for a single run from an options hash, then calls
# init_centroids.
#
# Reads :k, :init, :labels (an empty array when absent), :row_norms,
# :points_matrix, :mean, :std, :scale_data and :typecode from +opts+.
#
# @param opts [Hash] run settings; KMeansClusterer.run passes its merged options here
def initialize(opts = {})
  @k, @init, @row_norms = opts.values_at(:k, :init, :row_norms)
  @labels = opts[:labels] || []

  @points_matrix = opts[:points_matrix]
  # Only computed when a matrix was supplied; shape[1] is used elsewhere in
  # this class as the number of points.
  @points_count = @points_matrix.shape[1] if @points_matrix

  @mean, @std = opts.values_at(:mean, :std)
  @scale_data, @typecode = opts.values_at(:scale_data, :typecode)

  init_centroids
end

Instance Attribute Details

#clusters ⇒ `Object` (readonly)

Returns the value of attribute clusters.



109
110
111

# File 'lib/kmeans-clusterer.rb', line 109

# Read-only accessor for @clusters.
# NOTE(review): presumably populated by set_clusters, which #finish calls —
# confirm, since set_clusters is not shown here.
def clusters
  @clusters
end

#error ⇒ `Object` (readonly)

Returns the value of attribute error.



109
110
111

# File 'lib/kmeans-clusterer.rb', line 109

# Read-only accessor for @error, which #run assigns from calculate_error
# once its loop has finished. KMeansClusterer.run keeps the run with the
# smallest value.
def error
  @error
end

#iterations ⇒ `Object` (readonly)

Returns the value of attribute iterations.



109
110
111

# File 'lib/kmeans-clusterer.rb', line 109

# Read-only accessor for @iterations; #run resets it to 0 and increments it
# once per pass through its loop.
def iterations
  @iterations
end

#k ⇒ `Object` (readonly)

Returns the value of attribute k.



109
110
111

# File 'lib/kmeans-clusterer.rb', line 109

# Read-only accessor for @k, taken from the :k option given to the
# constructor.
def k
  @k
end

#mean ⇒ `Object` (readonly)

Returns the value of attribute mean.



109
110
111

# File 'lib/kmeans-clusterer.rb', line 109

# Read-only accessor for @mean, taken from the :mean option given to the
# constructor. KMeansClusterer.run only sets that option (from Scaler.scale)
# when :scale_data is truthy, so this may be nil.
def mean
  @mean
end

#points ⇒ `Object` (readonly)

Returns the value of attribute points.



109
110
111

# File 'lib/kmeans-clusterer.rb', line 109

# Read-only accessor for @points.
# NOTE(review): presumably populated by set_points, which #finish calls —
# confirm, since set_points is not shown here.
def points
  @points
end

#runtime ⇒ `Object` (readonly)

Returns the value of attribute runtime.



109
110
111

# File 'lib/kmeans-clusterer.rb', line 109

# Read-only accessor for @runtime: the elapsed time, in seconds, that #run
# measured for its most recent call (#run sets it to 0 when it starts).
def runtime
  @runtime
end

#std ⇒ `Object` (readonly)

Returns the value of attribute std.



109
110
111

# File 'lib/kmeans-clusterer.rb', line 109

# Read-only accessor for @std, taken from the :std option given to the
# constructor. KMeansClusterer.run only sets that option (from Scaler.scale)
# when :scale_data is truthy, so this may be nil.
def std
  @std
end

Class Method Details

.run(k, data, opts = {}) ⇒ `Object`

# File 'lib/kmeans-clusterer.rb', line 74

# Clusters +data+ into +k+ groups. The algorithm is repeated opts[:runs]
# times and the run with the lowest error is kept, finished and returned.
#
# @param k [Integer] number of clusters, stored as opts[:k]
# @param data [Object] the points; passed to NMatrix.cast
#   (NOTE(review): presumably an array of rows, one per point — confirm)
# @param opts [Hash] overrides merged over DEFAULT_OPTS
#   (:scale_data, :runs, :log, :init, :float_precision)
# @return [KMeansClusterer] the best run, after #finish has been called on it
# @raise [ArgumentError] if :runs is not an Integer >= 1, or :float_precision
#   is not a key of TYPECODE
def self.run(k, data, opts = {})
  opts = DEFAULT_OPTS.merge(opts)

  # Without at least one run there is no best run to return; fail with a
  # clear message instead of a NoMethodError on nil further down.
  run_count = opts[:runs]
  unless run_count.is_a?(Integer) && run_count >= 1
    raise ArgumentError, "runs must be an Integer >= 1, got #{run_count.inspect}"
  end

  opts[:k] = k
  # Reject unknown precisions here rather than passing a nil typecode on.
  opts[:typecode] = TYPECODE.fetch(opts[:float_precision]) do |precision|
    raise ArgumentError,
          "unknown float_precision #{precision.inspect}, expected one of #{TYPECODE.keys.inspect}"
  end

  data = NMatrix.cast data, opts[:typecode]

  if opts[:scale_data]
    data, mean, std = Scaler.scale(data, nil, nil, opts[:typecode])
    # Kept on the instance so #predict can scale new data the same way.
    opts[:mean] = mean
    opts[:std] = std
  end

  opts[:points_matrix] = data
  # Computed once here and handed to every run through opts.
  opts[:row_norms] = opts[:points_matrix].map { |v| v**2 }.sum(0)

  bestrun = nil

  run_count.times do |i|
    km = new(opts).run

    if opts[:log]
      puts "[#{i + 1}] #{km.iterations} iter\t#{km.runtime.round(2)}s\t#{km.error.round(2)} err"
    end

    # Strict comparison: on a tie the earlier run is kept.
    bestrun = km if bestrun.nil? || km.error < bestrun.error
  end

  bestrun.finish
end

Instance Method Details

#finish ⇒ `Object`

# File 'lib/kmeans-clusterer.rb', line 177

# Calls set_points and then set_clusters, and returns the receiver so the
# call can be chained (KMeansClusterer.run returns the result of this call).
#
# @return [KMeansClusterer] self
def finish
  tap do
    set_points
    set_clusters
  end
end

#inspect ⇒ `Object`



225
226
227

# File 'lib/kmeans-clusterer.rb', line 225

# Short one-line summary showing the class name, k, and the iteration count,
# error and runtime of the last run.
#
# @return [String]
def inspect
  format('#<%s k:%s iterations:%s error:%s runtime:%s>',
         self.class.name, @k, @iterations, @error, @runtime)
end

#origin ⇒ `Object`



199
200
201

# File 'lib/kmeans-clusterer.rb', line 199

# Builds an all-zero coordinate list with as many entries as the first
# point's #dimension and passes it through wrap_point.
#
# @return [Object] whatever wrap_point returns for the zero vector
def origin
  dimensions = @points[0].dimension
  wrap_point(Array.new(dimensions) { 0 })
end

#predict(data) ⇒ `Object`

# File 'lib/kmeans-clusterer.rb', line 183

# Finds the closest centroid for each point in +data+.
#
# The data is cast with the instance's typecode and, when @scale_data is
# set, scaled with the @mean and @std stored on this instance before
# distances to @centroids are computed.
#
# @param data [Object] points to classify; passed to NMatrix.cast
# @return [Array] one entry per point (data.shape[1] entries): the index of
#   the closest cluster
def predict(data)
  matrix = NMatrix.cast(data, @typecode)
  if @scale_data
    # Scaler.scale also returns the mean and std; only the scaled data is needed.
    matrix, = Scaler.scale(matrix, @mean, @std, @typecode)
  end

  to_centroids = distance(@centroids, matrix, nil)
  (0...matrix.shape[1]).map do |point_index|
    to_centroids[point_index, true].sort_index[0] # index of closest cluster
  end
end

#run ⇒ `Object`

# File 'lib/kmeans-clusterer.rb', line 128

# Performs one clustering run, starting from the current @centroids.
#
# Every pass assigns each point to the centroid with the smallest distance,
# then replaces each centroid with the mean of the points assigned to it (a
# centroid with no points is left where it is). The loop ends once the
# largest centroid move is below 0.001, or after 300 passes.
#
# Sets @iterations, @cluster_point_ids, @centroids, @error and @runtime.
#
# @return [KMeansClusterer] self
def run
  # Use the monotonic clock for the elapsed time: unlike Time.now it is not
  # affected by system clock adjustments while the run is in progress.
  started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  @iterations, @runtime = 0, 0

  loop do
    @iterations += 1

    # Start each pass with empty assignment lists. The lists built during the
    # final pass are left in place after the loop exits.
    @cluster_point_ids = Array.new(@k) { [] }

    distances = distance(@centroids, @points_matrix)

    # assign point ids to @cluster_point_ids
    @points_count.times do |i|
      min_distance_index = distances[i, true].sort_index[0]
      @cluster_point_ids[min_distance_index] << i
    end

    moves = []
    updated_centroids = []

    @k.times do |i|
      centroid = NArray.ref(@centroids[true, i].flatten)
      point_ids = @cluster_point_ids[i]

      if point_ids.empty?
        # Nothing was assigned to this cluster: keep its centroid unchanged.
        newcenter = centroid
        moves << 0
      else
        points = @points_matrix[true, point_ids]
        newcenter = points.mean(1)
        moves << distance(centroid, newcenter)
      end

      updated_centroids << newcenter
    end

    @centroids = NMatrix.cast updated_centroids, @typecode

    break if moves.max < 0.001 # i.e., no movement
    break if @iterations >= 300 # hard cap on the number of passes
  end

  @error = calculate_error
  @runtime = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at
  self
end

#silhouette ⇒ `Object` Also known as: silhouette_score

# File 'lib/kmeans-clusterer.rb', line 203

# Mean silhouette-style score over all points.
#
# For each point the two clusters whose centroids are closest are looked up;
# +a+ is the point's dissimilarity to the points of the closest one and +b+
# its dissimilarity to the points of the second closest. The point's score is
# (b - a) / max(a, b), and the result is the mean of those scores.
#
# Returns 1.0 outright when there are fewer than two clusters.
#
# @return [Float] mean score for all points
def silhouette
  return 1.0 if @k < 2

  distances = distance(@centroids, @points_matrix)

  scores = @points_count.times.map do |i|
    point = get_point i
    cluster_indexes = distances[i, true].sort_index

    c1_points = get_points_for_centroid cluster_indexes[0]
    c2_points = get_points_for_centroid cluster_indexes[1]

    a = dissimilarity(c1_points, point)
    b = dissimilarity(c2_points, point)
    denominator = [a, b].max

    # When both dissimilarities are zero the ratio is 0/0, which would turn
    # the whole mean into NaN; score such a point as 0 instead.
    denominator == 0 ? 0.0 : (b - a) / denominator
  end

  scores.reduce(:+) / scores.length # mean score for all points
end

#sorted_clusters(point = origin) ⇒ `Object`

# File 'lib/kmeans-clusterer.rb', line 192

# Returns the clusters ordered by the distance between their centroids and
# +point+, smallest distance first.
#
# @param point [Object] reference point, passed through wrap_point;
#   defaults to #origin
# @return [Array] the elements of @clusters in sorted order
def sorted_clusters(point = origin)
  reference = wrap_point(point)
  centroid_distances = distance(get_cluster_centroids, reference.data)
  @clusters.sort_by.with_index { |_cluster, position| centroid_distances[position] }
end

Class: KMeansClusterer

Defined Under Namespace

Constant Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(opts = {}) ⇒ KMeansClusterer

Instance Attribute Details

#clusters ⇒ Object (readonly)

#error ⇒ Object (readonly)

#iterations ⇒ Object (readonly)

#k ⇒ Object (readonly)

#mean ⇒ Object (readonly)

#points ⇒ Object (readonly)

#runtime ⇒ Object (readonly)

#std ⇒ Object (readonly)