Class: KMeansClusterer
- Inherits:
- Object
- Inheritance chain: Object → KMeansClusterer
- Defined in:
- lib/kmeans-clusterer.rb
Defined Under Namespace
Modules: Distance, Scaler, Utils
Classes: Cluster, Point
Constant Summary collapse
- TYPECODE =
{ double: NArray::DFLOAT, single: NArray::SFLOAT }
- DEFAULT_OPTS =
{ scale_data: false, runs: 10, log: false, init: :kmpp, float_precision: :double, max_iter: 300 }
Instance Attribute Summary collapse
-
#centroids ⇒ Object
readonly
Returns the value of attribute centroids.
-
#clusters ⇒ Object
readonly
Returns the value of attribute clusters.
-
#data ⇒ Object
readonly
Returns the value of attribute data.
-
#distances ⇒ Object
readonly
Returns the value of attribute distances.
-
#error ⇒ Object
readonly
Returns the value of attribute error.
-
#iterations ⇒ Object
readonly
Returns the value of attribute iterations.
-
#k ⇒ Object
readonly
Returns the value of attribute k.
-
#mean ⇒ Object
readonly
Returns the value of attribute mean.
-
#points ⇒ Object
readonly
Returns the value of attribute points.
-
#runtime ⇒ Object
readonly
Returns the value of attribute runtime.
-
#std ⇒ Object
readonly
Returns the value of attribute std.
Class Method Summary collapse
- .run(k, data, opts = {}) ⇒ Object
Instance Method Summary collapse
- #finish ⇒ Object
-
#initialize(opts = {}) ⇒ KMeansClusterer
constructor
A new instance of KMeansClusterer.
- #inspect ⇒ Object
- #predict(data) ⇒ Object
- #run ⇒ Object
- #silhouette ⇒ Object
- #sorted_clusters(point = origin) ⇒ Object
Constructor Details
#initialize(opts = {}) ⇒ KMeansClusterer
Returns a new instance of KMeansClusterer.
158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 |
# File 'lib/kmeans-clusterer.rb', line 158

# Builds a clusterer from an options hash and seeds its centroids.
#
# Keys read from +opts+: :k, :init, :labels, :row_norms, :data, :mean, :std,
# :scale_data, :typecode and :max_iter. Every key is optional here; a missing
# key leaves the matching instance variable nil, except :labels, which falls
# back to an empty array.
#
# @param opts [Hash] configuration, normally prepared by KMeansClusterer.run
# @return [KMeansClusterer] a new instance of KMeansClusterer
def initialize(opts = {})
  @k, @init, @row_norms, @data = opts.values_at(:k, :init, :row_norms, :data)
  @mean, @std, @scale_data = opts.values_at(:mean, :std, :scale_data)
  @typecode, @max_iter = opts.values_at(:typecode, :max_iter)
  @labels = opts[:labels] || []

  # The point count is read from the second dimension of the data matrix.
  @points_count = @data.shape[1] if @data

  init_centroids
end
Instance Attribute Details
#centroids ⇒ Object (readonly)
Returns the value of attribute centroids.
155 156 157 |
# File 'lib/kmeans-clusterer.rb', line 155

# Read-only accessor for the centroid matrix; #run rewrites its columns as
# the clusters move.
#
# @return [Object] the value of @centroids
def centroids
  @centroids
end
#clusters ⇒ Object (readonly)
Returns the value of attribute clusters.
155 156 157 |
# File 'lib/kmeans-clusterer.rb', line 155

# Read-only accessor for the cluster list, which is assigned in #finish.
#
# @return [Object] the value of @clusters
def clusters
  @clusters
end
#data ⇒ Object (readonly)
Returns the value of attribute data.
155 156 157 |
# File 'lib/kmeans-clusterer.rb', line 155

# Read-only accessor for the data given to the constructor as opts[:data].
#
# @return [Object] the value of @data
def data
  @data
end
#distances ⇒ Object (readonly)
Returns the value of attribute distances.
155 156 157 |
# File 'lib/kmeans-clusterer.rb', line 155

# Read-only accessor for the distance matrix computed on each iteration
# of #run.
#
# @return [Object] the value of @distances
def distances
  @distances
end
#error ⇒ Object (readonly)
Returns the value of attribute error.
155 156 157 |
# File 'lib/kmeans-clusterer.rb', line 155

# Read-only accessor for the error recorded at the end of #run (the sum of
# squared minimum distances).
#
# @return [Object] the value of @error
def error
  @error
end
#iterations ⇒ Object (readonly)
Returns the value of attribute iterations.
155 156 157 |
# File 'lib/kmeans-clusterer.rb', line 155

# Read-only accessor for the number of loop iterations performed by #run.
#
# @return [Object] the value of @iterations
def iterations
  @iterations
end
#k ⇒ Object (readonly)
Returns the value of attribute k.
155 156 157 |
# File 'lib/kmeans-clusterer.rb', line 155

# Read-only accessor for the cluster count given to the constructor as
# opts[:k].
#
# @return [Object] the value of @k
def k
  @k
end
#mean ⇒ Object (readonly)
Returns the value of attribute mean.
155 156 157 |
# File 'lib/kmeans-clusterer.rb', line 155

# Read-only accessor for the mean given to the constructor as opts[:mean].
#
# @return [Object] the value of @mean
def mean
  @mean
end
#points ⇒ Object (readonly)
Returns the value of attribute points.
155 156 157 |
# File 'lib/kmeans-clusterer.rb', line 155

# Read-only accessor for the point list, which is assigned in #finish.
#
# @return [Object] the value of @points
def points
  @points
end
#runtime ⇒ Object (readonly)
Returns the value of attribute runtime.
155 156 157 |
# File 'lib/kmeans-clusterer.rb', line 155

# Read-only accessor for the elapsed time recorded at the end of #run.
#
# @return [Object] the value of @runtime
def runtime
  @runtime
end
#std ⇒ Object (readonly)
Returns the value of attribute std.
155 156 157 |
# File 'lib/kmeans-clusterer.rb', line 155

# Read-only accessor for the standard deviation given to the constructor as
# opts[:std].
#
# @return [Object] the value of @std
def std
  @std
end
Class Method Details
.run(k, data, opts = {}) ⇒ Object
120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 |
# File 'lib/kmeans-clusterer.rb', line 120

# Clusters +data+ into +k+ groups, repeating the algorithm opts[:runs] times
# and keeping the run with the lowest error.
#
# @param k [Integer] number of clusters
# @param data [Object] input handed to Utils.ensure_matrix
# @param opts [Hash] overrides merged on top of DEFAULT_OPTS
# @return [KMeansClusterer] the lowest-error run, after #finish was called
# @raise [ArgumentError] if opts[:runs] is not a positive Integer
def self.run(k, data, opts = {})
  opts = DEFAULT_OPTS.merge(opts)

  # Without at least one run there is no best run to finish; fail early with
  # a clear message instead of a NoMethodError on nil further down.
  runs = opts[:runs]
  unless runs.is_a?(Integer) && runs > 0
    raise ArgumentError, ":runs must be a positive Integer, got #{runs.inspect}"
  end

  opts[:k] = k
  opts[:typecode] = TYPECODE[opts[:float_precision]]
  data = Utils.ensure_matrix(data, opts[:typecode])

  if opts[:scale_data]
    data, mean, std = Scaler.scale(data, nil, nil, opts[:typecode])
    opts[:mean] = mean
    opts[:std] = std
  end

  opts[:data] = data
  # Row norms are computed once here and shared by every run through opts.
  opts[:row_norms] = Scaler.row_norms(data)

  bestrun = nil
  runs.times do |i|
    km = new(opts).run

    if opts[:log]
      puts "[#{i + 1}] #{km.iterations} iter\t#{km.runtime.round(2)}s\t#{km.error.round(2)} err"
    end

    bestrun = km if bestrun.nil? || km.error < bestrun.error
  end

  bestrun.finish
end
Instance Method Details
#finish ⇒ Object
218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 |
# File 'lib/kmeans-clusterer.rb', line 218

# Turns the raw results of #run into Cluster and Point objects.
#
# One Cluster is built per centroid column, one Point per data column. Each
# point is appended to the cluster named by @cluster_assigns, and every
# cluster's points are then sorted by their centroid_distance.
#
# @return [KMeansClusterer] self
def finish
  @clusters = @k.times.map do |cluster_id|
    center = NArray.ref(@centroids[true, cluster_id].flatten)
    # The centroid is wrapped in a Point with id -1 and no distances or label.
    Cluster.new(cluster_id, Point.new(-1, center, nil, nil))
  end

  @points = @points_count.times.map do |point_id|
    coords = NArray.ref(@data[true, point_id].flatten)
    member = Point.new(point_id, coords, @distances[point_id, true], @labels[point_id])
    @clusters[@cluster_assigns[point_id]] << member
    member
  end

  @clusters.each { |cluster| cluster.points.sort_by!(&:centroid_distance) }

  self
end
#inspect ⇒ Object
273 274 275 |
# File 'lib/kmeans-clusterer.rb', line 273

# Short one-line summary showing the class name, k, the iteration count,
# the error and the runtime.
#
# @return [String]
def inspect
  summary = "k:#{@k} iterations:#{@iterations} error:#{@error} runtime:#{@runtime}"
  "#<#{self.class.name} #{summary}>"
end
#predict(data) ⇒ Object
239 240 241 242 243 244 245 246 |
# File 'lib/kmeans-clusterer.rb', line 239

# Finds the closest centroid for each column of +data+.
#
# The input is converted with Utils.ensure_matrix and, when the clusterer was
# configured with scale_data, rescaled using the stored mean and std.
#
# @param data [Object] input handed to Utils.ensure_matrix
# @return [Array] for each point, the index of its closest cluster
def predict(data)
  matrix = Utils.ensure_matrix(data, @typecode)
  matrix, _mean, _std = Scaler.scale(matrix, @mean, @std, @typecode) if @scale_data

  to_centroids = Distance.euclidean(@centroids, matrix)

  matrix.shape[1].times.map do |point_idx|
    # First entry of the ascending sort order is the closest cluster.
    to_centroids[point_idx, true].sort_index[0]
  end
end
#run ⇒ Object
175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 |
# File 'lib/kmeans-clusterer.rb', line 175

# Executes one k-means run from the current centroids.
#
# Each iteration assigns every point to its closest centroid, then moves
# each centroid to the mean of the points assigned to it. The loop stops
# when the largest centroid movement drops below 0.001 or when @max_iter
# iterations have been performed.
#
# Sets @iterations, @distances, @cluster_assigns, @error (sum of squared
# minimum distances) and @runtime (elapsed seconds).
#
# @return [KMeansClusterer] self
def run
  # A monotonic clock is used so the measured duration cannot be skewed by
  # system clock adjustments, which Time.now is subject to.
  started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  @iterations, @runtime = 0, 0

  @cluster_assigns = NArray.int(@points_count)
  min_distances = NArray.new(@typecode, @points_count)

  loop do
    @iterations += 1

    min_distances.fill! Float::INFINITY
    @distances = Distance.euclidean(@centroids, @data, @row_norms)

    # Assignment step: keep, per point, the smallest distance seen so far
    # and the id of the cluster it belongs to.
    @k.times do |cluster_id|
      dist = NArray.ref @distances[true, cluster_id].flatten
      mask = dist < min_distances
      @cluster_assigns[mask] = cluster_id
      min_distances[mask] = dist[mask]
    end

    max_move = 0

    # Update step: move each centroid to the mean of its assigned points.
    @k.times do |cluster_id|
      centroid = NArray.ref(@centroids[true, cluster_id].flatten)
      point_ids = @cluster_assigns.eq(cluster_id).where

      # A cluster with no assigned points keeps its current centroid.
      next if point_ids.empty?

      points = @data[true, point_ids]
      newcenter = points.mean(1)
      move = Distance.euclidean(centroid, newcenter)
      max_move = move if move > max_move
      @centroids[true, cluster_id] = newcenter
    end

    break if max_move < 0.001 # i.e., no movement
    break if @iterations >= @max_iter
  end

  @error = (min_distances**2).sum
  @runtime = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at
  self
end
#silhouette ⇒ Object
255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 |
# File 'lib/kmeans-clusterer.rb', line 255

# Mean silhouette score over all points.
#
# For each point, +a+ is its dissimilarity to its closest cluster and +b+
# its dissimilarity to the second closest; the point's score is
# (b - a) / max(a, b). When both dissimilarities are zero the ratio is
# undefined, so that point scores 0.0 instead of turning the mean into NaN.
#
# Reads @points, so #finish must have been called first.
#
# @return [Float] 1.0 when k < 2, otherwise the mean score of all points
def silhouette
  return 1.0 if @k < 2

  # calculate all point-to-point distances at once
  # uses more memory, but much faster
  point_distances = Distance.euclidean(@data, @data)

  scores = @points.map do |point|
    sort_index = point.centroid_distances.sort_index
    c1, c2 = sort_index[0], sort_index[1]
    a = dissimilarity(point.id, c1, point_distances)
    b = dissimilarity(point.id, c2, point_distances)

    denominator = [a, b].max
    # a == b == 0 would otherwise produce 0.0 / 0.0 (NaN) or, for integers,
    # a ZeroDivisionError.
    denominator == 0 ? 0.0 : (b - a) / denominator
  end

  scores.reduce(:+) / scores.length # mean score for all points
end
#sorted_clusters(point = origin) ⇒ Object
248 249 250 251 252 253 |
# File 'lib/kmeans-clusterer.rb', line 248

# Returns the clusters ordered by the distance between their centroid and
# +point+, using ascending sort order.
#
# @param point [Point, NArray, Object] reference location; a Point is
#   replaced by its data, and anything that is not an NArray is cast to one
#   with the clusterer's typecode. Defaults to the value of +origin+.
# @return [Array] the clusters, sorted by centroid distance to +point+
def sorted_clusters(point = origin)
  target = point.is_a?(Point) ? point.data : point
  target = NArray.cast(target, @typecode) unless target.is_a?(NArray)

  centroid_distances = Distance.euclidean(NArray.ref(@centroids), target)

  @clusters.sort_by.with_index { |_cluster, idx| centroid_distances[idx] }
end