Class: KMeansClusterer

Inherits:
Object
  • Object
show all
Defined in:
lib/kmeans-clusterer.rb

Defined Under Namespace

Modules: Scaler Classes: Cluster, Point

Constant Summary collapse

# Maps the :float_precision option (:double or :single) to the NArray
# typecode that KMeansClusterer.run stores in opts[:typecode].
TYPECODE =
{ double: NArray::DFLOAT, single: NArray::SFLOAT }
# Option defaults; KMeansClusterer.run merges the caller's opts over these.
DEFAULT_OPTS =
{ scale_data: false, runs: 10, log: false, init: :kmpp, float_precision: :double }

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(opts = {}) ⇒ KMeansClusterer

Returns a new instance of KMeansClusterer.



112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# File 'lib/kmeans-clusterer.rb', line 112

# Builds a single clustering attempt from an options hash and seeds its
# centroids via init_centroids.
#
# Keys read: :k, :init, :labels, :row_norms, :points_matrix, :mean, :std,
# :scale_data and :typecode. A missing key leaves the matching instance
# variable nil, except :labels, which falls back to an empty array.
#
# @param opts [Hash] settings for this attempt
def initialize(opts = {})
  @k, @init, @row_norms = opts.values_at(:k, :init, :row_norms)
  @mean, @std = opts.values_at(:mean, :std)
  @scale_data, @typecode = opts.values_at(:scale_data, :typecode)
  @labels = opts[:labels] || []

  # The point count comes from the second entry of the matrix shape and is
  # only assigned when a matrix was actually supplied.
  @points_matrix = opts[:points_matrix]
  @points_count = @points_matrix.shape[1] if @points_matrix

  init_centroids
end

Instance Attribute Details

#clusters ⇒ Object (readonly)

Returns the value of attribute clusters.



109
110
111
# File 'lib/kmeans-clusterer.rb', line 109

# Reader for @clusters, the collection that #sorted_clusters orders.
# NOTE(review): presumably filled in by set_clusters during #finish — confirm.
def clusters
  @clusters
end

#error ⇒ Object (readonly)

Returns the value of attribute error.



109
110
111
# File 'lib/kmeans-clusterer.rb', line 109

# Reader for @error, which #run assigns from calculate_error once its loop
# has finished. KMeansClusterer.run keeps the attempt with the lowest value.
def error
  @error
end

#iterations ⇒ Object (readonly)

Returns the value of attribute iterations.



109
110
111
# File 'lib/kmeans-clusterer.rb', line 109

# Reader for @iterations, the number of loop passes made by the last #run.
def iterations
  @iterations
end

#k ⇒ Object (readonly)

Returns the value of attribute k.



109
110
111
# File 'lib/kmeans-clusterer.rb', line 109

# Reader for @k, the cluster count taken from opts[:k] in the constructor.
def k
  @k
end

#mean ⇒ Object (readonly)

Returns the value of attribute mean.



109
110
111
# File 'lib/kmeans-clusterer.rb', line 109

# Reader for @mean, taken from opts[:mean] in the constructor.
# KMeansClusterer.run only sets that option (from Scaler.scale) when
# :scale_data is enabled, so this can be nil.
def mean
  @mean
end

#points ⇒ Object (readonly)

Returns the value of attribute points.



109
110
111
# File 'lib/kmeans-clusterer.rb', line 109

# Reader for @points; #origin reads the dimension of its first entry.
# NOTE(review): presumably filled in by set_points during #finish — confirm.
def points
  @points
end

#runtime ⇒ Object (readonly)

Returns the value of attribute runtime.



109
110
111
# File 'lib/kmeans-clusterer.rb', line 109

# Reader for @runtime, the elapsed seconds measured across the last #run
# (reset to 0 when a run starts).
def runtime
  @runtime
end

#std ⇒ Object (readonly)

Returns the value of attribute std.



109
110
111
# File 'lib/kmeans-clusterer.rb', line 109

# Reader for @std, taken from opts[:std] in the constructor.
# KMeansClusterer.run only sets that option (from Scaler.scale) when
# :scale_data is enabled, so this can be nil.
def std
  @std
end

Class Method Details

.run(k, data, opts = {}) ⇒ Object



74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# File 'lib/kmeans-clusterer.rb', line 74

# Clusters +data+ into +k+ groups and returns the best of several attempts.
#
# The caller's options are layered over DEFAULT_OPTS, the data is cast to
# the typecode selected by :float_precision (and passed through
# Scaler.scale when :scale_data is set), and opts[:runs] independent
# attempts are then made. The attempt with the lowest error is finished and
# returned. When :log is set, one summary line per attempt is printed.
#
# @param k [Integer] number of clusters, stored as opts[:k]
# @param data [Object] input handed to NMatrix.cast
# @param opts [Hash] overrides for DEFAULT_OPTS; the hash itself is not modified
# @return [KMeansClusterer] the lowest-error attempt, after #finish
# @raise [ArgumentError] if opts[:runs] is not a positive Integer
def self.run(k, data, opts = {})
  opts = DEFAULT_OPTS.merge(opts)

  # With zero runs no attempt would ever be made and the final `finish`
  # call would blow up on nil, so reject bad values up front instead.
  runs = opts[:runs]
  unless runs.is_a?(Integer) && runs > 0
    raise ArgumentError, ":runs must be a positive Integer (got #{runs.inspect})"
  end

  opts[:k] = k
  opts[:typecode] = TYPECODE[opts[:float_precision]]

  data = NMatrix.cast(data, opts[:typecode])

  if opts[:scale_data]
    data, mean, std = Scaler.scale(data, nil, nil, opts[:typecode])
    opts[:mean] = mean
    opts[:std] = std
  end

  opts[:points_matrix] = data
  # Computed once here and shared by every attempt through opts.
  opts[:row_norms] = data.map { |v| v**2 }.sum(0)

  best = nil

  runs.times do |i|
    km = new(opts).run

    if opts[:log]
      puts "[#{i + 1}] #{km.iterations} iter\t#{km.runtime.round(2)}s\t#{km.error.round(2)} err"
    end

    best = km if best.nil? || km.error < best.error
  end

  best.finish
end

Instance Method Details

#finish ⇒ Object



177
178
179
180
181
# File 'lib/kmeans-clusterer.rb', line 177

# Finalises an attempt by calling set_points and then set_clusters.
# NOTE(review): presumably these fill @points and @clusters — confirm
# against the helpers.
#
# @return [KMeansClusterer] self
def finish
  tap do
    set_points
    set_clusters
  end
end

#inspect ⇒ Object



225
226
227
# File 'lib/kmeans-clusterer.rb', line 225

# One-line description of the clusterer for debugging output.
#
# @return [String] the class name followed by the k, iterations, error and
#   runtime values, e.g. "#<KMeansClusterer k:3 iterations:5 error:1.5 runtime:0.25>"
def inspect
  summary = { k: @k, iterations: @iterations, error: @error, runtime: @runtime }
  fields = summary.map { |label, value| "#{label}:#{value}" }.join(' ')
  "#<#{self.class.name} #{fields}>"
end

#origin ⇒ Object



199
200
201
# File 'lib/kmeans-clusterer.rb', line 199

# Builds the all-zero point with as many coordinates as the first entry of
# @points has dimensions, and passes it through wrap_point.
#
# @return [Object] whatever wrap_point returns for the zero vector
def origin
  dimensions = @points[0].dimension
  zeros = Array.new(dimensions, 0)
  wrap_point(zeros)
end

#predict(data) ⇒ Object



183
184
185
186
187
188
189
190
# File 'lib/kmeans-clusterer.rb', line 183

# Finds the closest cluster for each point in +data+.
#
# The input is cast to the clusterer's typecode and, when the clusterer was
# built with :scale_data, rescaled with the stored mean and std before its
# distances to the centroids are computed.
#
# @param data [Object] points to classify, in any form NMatrix.cast accepts
# @return [Array] one cluster index per point (data.shape[1] entries), in
#   input order
def predict(data)
  samples = NMatrix.cast(data, @typecode)
  if @scale_data
    samples, _mean, _std = Scaler.scale(samples, @mean, @std, @typecode)
  end
  gaps = distance(@centroids, samples, nil)

  # The first entry of sort_index is the index of the closest cluster.
  Array.new(samples.shape[1]) { |i| gaps[i, true].sort_index[0] }
end

#run ⇒ Object



128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
# File 'lib/kmeans-clusterer.rb', line 128

# Performs one k-means attempt starting from the current @centroids.
#
# Every pass assigns each point to its nearest centroid, then moves each
# centroid to the mean of the points assigned to it (a centroid with no
# points stays where it is). The loop stops once the largest centroid move
# is below 0.001, or after 300 passes. Afterwards @error is taken from
# calculate_error and @runtime holds the elapsed seconds.
#
# @return [KMeansClusterer] self
def run
  # Elapsed time is taken from the monotonic clock so that a wall-clock
  # adjustment during the run cannot produce a wrong or negative duration.
  started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  @iterations, @runtime = 0, 0

  loop do
    @iterations += 1

    # Each pass starts from empty membership lists, one per cluster.
    @cluster_point_ids = Array.new(@k) { [] }

    distances = distance(@centroids, @points_matrix)

    # assign point ids to @cluster_point_ids
    @points_count.times do |i|
      nearest = distances[i, true].sort_index[0]
      @cluster_point_ids[nearest] << i
    end

    moves = []
    updated_centroids = []

    @k.times do |i|
      centroid = NArray.ref(@centroids[true, i].flatten)
      point_ids = @cluster_point_ids[i]

      if point_ids.empty?
        # Nothing was assigned to this cluster: keep the centroid in place.
        newcenter = centroid
        moves << 0
      else
        newcenter = @points_matrix[true, point_ids].mean(1)
        moves << distance(centroid, newcenter)
      end

      updated_centroids << newcenter
    end

    @centroids = NMatrix.cast(updated_centroids, @typecode)

    break if moves.max < 0.001 # i.e., no movement
    break if @iterations >= 300
  end

  @error = calculate_error
  @runtime = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at
  self
end

#silhouette ⇒ Object Also known as: silhouette_score



203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
# File 'lib/kmeans-clusterer.rb', line 203

# Mean silhouette-style score over all points.
#
# For each point the two clusters with the nearest centroids are looked up;
# with a = dissimilarity to the nearest cluster's points and b =
# dissimilarity to the second-nearest cluster's points, the point scores
# (b - a) / max(a, b). With fewer than two clusters the score is 1.0.
#
# @return [Numeric] average of the per-point scores
def silhouette
  return 1.0 if @k < 2

  centroid_gaps = distance(@centroids, @points_matrix)

  per_point = @points_count.times.map do |index|
    point = get_point(index)
    ranked = centroid_gaps[index, true].sort_index

    own_points = get_points_for_centroid(ranked[0])
    neighbour_points = get_points_for_centroid(ranked[1])

    cohesion = dissimilarity(own_points, point)
    separation = dissimilarity(neighbour_points, point)
    (separation - cohesion) / [cohesion, separation].max
  end

  # mean score for all points
  per_point.inject(:+) / per_point.length
end

#sorted_clusters(point = origin) ⇒ Object



192
193
194
195
196
197
# File 'lib/kmeans-clusterer.rb', line 192

# Returns the clusters ordered by the distance from their centroid to
# +point+, smallest distance first.
#
# @param point [Object] reference point, passed through wrap_point;
#   defaults to #origin
# @return [Array] the entries of @clusters in sorted order
def sorted_clusters(point = origin)
  reference = wrap_point(point)
  gaps = distance(get_cluster_centroids, reference.data)
  @clusters.sort_by.with_index { |_cluster, index| gaps[index] }
end