Class: KMeansClusterer
- Inherits: Object
- Object
- KMeansClusterer
- Defined in:
- lib/kmeans-clusterer.rb
Defined Under Namespace
Modules: Distance, Scaler, Utils
Classes: Cluster, Point
Constant Summary collapse
- TYPECODE =
{ double: NArray::DFLOAT, single: NArray::SFLOAT }
- DEFAULT_OPTS =
{ scale_data: false, runs: 10, log: false, init: :kmpp, float_precision: :double, max_iter: 300 }
Instance Attribute Summary collapse
-
#centroids ⇒ Object
readonly
Returns the value of attribute centroids.
-
#clusters ⇒ Object
readonly
Returns the value of attribute clusters.
-
#data ⇒ Object
readonly
Returns the value of attribute data.
-
#distances ⇒ Object
readonly
Returns the value of attribute distances.
-
#error ⇒ Object
readonly
Returns the value of attribute error.
-
#iterations ⇒ Object
readonly
Returns the value of attribute iterations.
-
#k ⇒ Object
readonly
Returns the value of attribute k.
-
#mean ⇒ Object
readonly
Returns the value of attribute mean.
-
#points ⇒ Object
readonly
Returns the value of attribute points.
-
#runtime ⇒ Object
readonly
Returns the value of attribute runtime.
-
#std ⇒ Object
readonly
Returns the value of attribute std.
Class Method Summary collapse
- .run(k, data, opts = {}) ⇒ Object
Instance Method Summary collapse
- #finish ⇒ Object
-
#initialize(opts = {}) ⇒ KMeansClusterer
constructor
A new instance of KMeansClusterer.
- #inspect ⇒ Object
- #predict(data) ⇒ Object
- #run ⇒ Object
- #silhouette ⇒ Object
- #sorted_clusters(point = origin) ⇒ Object
Constructor Details
#initialize(opts = {}) ⇒ KMeansClusterer
Returns a new instance of KMeansClusterer.
162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 |
# File 'lib/kmeans-clusterer.rb', line 162

# Builds a clusterer for a single k-means run and seeds its centroids.
#
# Keys read from +opts+: :k, :init, :labels, :row_norms, :data, :mean, :std,
# :scale_data, :float_precision and :max_iter. KMeansClusterer.run merges
# DEFAULT_OPTS before calling this, but direct callers may omit keys.
#
# @param opts [Hash] run configuration
def initialize(opts = {})
  @k = opts[:k]
  @init = opts[:init]
  @labels = opts[:labels] || []
  @row_norms = opts[:row_norms]
  @data = opts[:data]
  # The number of points is taken from the second dimension of the data.
  @points_count = @data ? @data.shape[1] : 0
  @mean = Utils.ensure_narray(opts[:mean]) if opts[:mean]
  @std = Utils.ensure_narray(opts[:std]) if opts[:std]
  @scale_data = opts[:scale_data]
  @typecode = TYPECODE[opts[:float_precision] || :double]
  # Fall back to the default cap, mirroring the float_precision fallback
  # above; a nil cap would make the `@iterations >= @max_iter` check in
  # #run raise when the instance is built without KMeansClusterer.run.
  @max_iter = opts[:max_iter] || DEFAULT_OPTS[:max_iter]

  init_centroids
end
Instance Attribute Details
#centroids ⇒ Object (readonly)
Returns the value of attribute centroids.
159 160 161 |
# File 'lib/kmeans-clusterer.rb', line 159

# Read-only accessor.
#
# @return [Object] the current value of @centroids
def centroids
  @centroids
end
#clusters ⇒ Object (readonly)
Returns the value of attribute clusters.
159 160 161 |
# File 'lib/kmeans-clusterer.rb', line 159

# Read-only accessor.
#
# @return [Object] the current value of @clusters
def clusters
  @clusters
end
#data ⇒ Object (readonly)
Returns the value of attribute data.
159 160 161 |
# File 'lib/kmeans-clusterer.rb', line 159

# Read-only accessor.
#
# @return [Object] the current value of @data
def data
  @data
end
#distances ⇒ Object (readonly)
Returns the value of attribute distances.
159 160 161 |
# File 'lib/kmeans-clusterer.rb', line 159

# Read-only accessor.
#
# @return [Object] the current value of @distances
def distances
  @distances
end
#error ⇒ Object (readonly)
Returns the value of attribute error.
159 160 161 |
# File 'lib/kmeans-clusterer.rb', line 159

# Read-only accessor.
#
# @return [Object] the current value of @error
def error
  @error
end
#iterations ⇒ Object (readonly)
Returns the value of attribute iterations.
159 160 161 |
# File 'lib/kmeans-clusterer.rb', line 159

# Read-only accessor.
#
# @return [Object] the current value of @iterations
def iterations
  @iterations
end
#k ⇒ Object (readonly)
Returns the value of attribute k.
159 160 161 |
# File 'lib/kmeans-clusterer.rb', line 159

# Read-only accessor.
#
# @return [Object] the current value of @k
def k
  @k
end
#mean ⇒ Object (readonly)
Returns the value of attribute mean.
159 160 161 |
# File 'lib/kmeans-clusterer.rb', line 159

# Read-only accessor.
#
# @return [Object] the current value of @mean
def mean
  @mean
end
#points ⇒ Object (readonly)
Returns the value of attribute points.
159 160 161 |
# File 'lib/kmeans-clusterer.rb', line 159

# Read-only accessor.
#
# @return [Object] the current value of @points
def points
  @points
end
#runtime ⇒ Object (readonly)
Returns the value of attribute runtime.
159 160 161 |
# File 'lib/kmeans-clusterer.rb', line 159

# Read-only accessor.
#
# @return [Object] the current value of @runtime
def runtime
  @runtime
end
#std ⇒ Object (readonly)
Returns the value of attribute std.
159 160 161 |
# File 'lib/kmeans-clusterer.rb', line 159

# Read-only accessor.
#
# @return [Object] the current value of @std
def std
  @std
end
Class Method Details
.run(k, data, opts = {}) ⇒ Object
124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 |
# File 'lib/kmeans-clusterer.rb', line 124

# Clusters +data+ into +k+ groups, repeating the whole procedure
# opts[:runs] times and keeping the run with the lowest error.
#
# The caller's hash is not modified: options are merged over DEFAULT_OPTS
# into a fresh hash, which is then extended with :k, :data, :row_norms and
# (when :scale_data is set) :mean and :std before being passed to .new.
#
# @param k [Integer] number of clusters
# @param data [Object] input accepted by Utils.ensure_matrix
# @param opts [Hash] overrides for DEFAULT_OPTS
# @return [KMeansClusterer] the best run, after #finish has been called
# @raise [ArgumentError] if :runs is not an Integer >= 1
def self.run(k, data, opts = {})
  opts = DEFAULT_OPTS.merge(opts)

  # Without at least one run there is no best run to finish; fail with a
  # clear message instead of a NoMethodError on nil further down.
  runs = opts[:runs]
  unless runs.is_a?(Integer) && runs >= 1
    raise ArgumentError, "runs must be an Integer >= 1 (got #{runs.inspect})"
  end

  opts[:k] = k
  typecode = TYPECODE[opts[:float_precision]]

  data = Utils.ensure_matrix data, typecode

  if opts[:scale_data]
    data, mean, std = Scaler.scale(data, nil, nil, typecode)
    opts[:mean] = mean
    opts[:std] = std
  end

  opts[:data] = data
  # Computed once here and handed to every run through the options hash.
  opts[:row_norms] = Scaler.row_norms(data)

  bestrun = nil

  runs.times do |i|
    km = new(opts).run

    if opts[:log]
      puts "[#{i + 1}] #{km.iterations} iter\t#{km.runtime.round(2)}s\t#{km.error.round(2)} err"
    end

    # Strict comparison: on equal error the earlier run is kept.
    bestrun = km if bestrun.nil? || (km.error < bestrun.error)
  end

  bestrun.finish
end
Instance Method Details
#finish ⇒ Object
222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 |
# File 'lib/kmeans-clusterer.rb', line 222

# Materializes the result of #run into Cluster and Point objects.
#
# Builds one Cluster per centroid, one Point per data column, attaches each
# point to the cluster it was assigned to, and finally orders every
# cluster's points by their centroid_distance.
#
# @return [KMeansClusterer] self
def finish
  @clusters = @k.times.map do |cluster_id|
    center = NArray.ref(@centroids[true, cluster_id].flatten)
    # The centroid is wrapped in a Point with id -1 and no distances/label.
    Cluster.new(cluster_id, Point.new(-1, center, nil, nil))
  end

  @points = @points_count.times.map do |point_id|
    coords = NArray.ref(@data[true, point_id].flatten)
    member = Point.new(point_id, coords, @distances[point_id, true], @labels[point_id])
    @clusters[@cluster_assigns[point_id]] << member
    member
  end

  @clusters.each { |cluster| cluster.points.sort_by!(&:centroid_distance) }

  self
end
#inspect ⇒ Object
281 282 283 |
# File 'lib/kmeans-clusterer.rb', line 281

# Short one-line summary: class name, k, iteration count, error and runtime.
#
# @return [String]
def inspect
  "#<#{self.class.name} k:#{@k} iterations:#{@iterations} error:#{@error} runtime:#{@runtime}>"
end
#predict(data) ⇒ Object
243 244 245 246 247 248 249 250 |
# File 'lib/kmeans-clusterer.rb', line 243

# Assigns each point in +data+ to its closest centroid.
#
# The input is converted with Utils.ensure_matrix and, when the clusterer
# was configured with :scale_data, scaled with the stored mean and std.
#
# @param data [Object] input accepted by Utils.ensure_matrix
# @return [Array] one cluster index per point
def predict(data)
  matrix = Utils.ensure_matrix(data, @typecode)
  matrix, _mean, _std = Scaler.scale(matrix, @mean, @std, @typecode) if @scale_data

  to_centroids = Distance.euclidean(@centroids, matrix)

  matrix.shape[1].times.map do |point_idx|
    # First entry of the sort order is the index of the closest cluster.
    to_centroids[point_idx, true].sort_index[0]
  end
end
#run ⇒ Object
179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 |
# File 'lib/kmeans-clusterer.rb', line 179

# Executes one k-means run: alternates assignment and centroid-update steps
# until no centroid moves by 0.001 or more, or until @max_iter iterations
# have been performed.
#
# Sets @iterations, @cluster_assigns, @distances, @error and @runtime.
#
# @return [KMeansClusterer] self
def run
  # Monotonic clock: wall-clock adjustments cannot skew the measured runtime.
  started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  @iterations, @runtime = 0, 0
  @cluster_assigns = NArray.int(@points_count)
  min_distances = NArray.new(@typecode, @points_count)

  loop do
    @iterations += 1

    # Assignment step: for every point keep the smallest distance seen so
    # far and the id of the cluster that produced it.
    min_distances.fill! Float::INFINITY
    @distances = Distance.euclidean(@centroids, @data, @row_norms)

    @k.times do |cluster_id|
      dist = NArray.ref @distances[true, cluster_id].flatten
      mask = dist < min_distances
      @cluster_assigns[mask] = cluster_id
      min_distances[mask] = dist[mask]
    end

    # Update step: move each centroid to the mean of its assigned points and
    # track the largest displacement for the convergence check.
    max_move = 0

    @k.times do |cluster_id|
      centroid = NArray.ref(@centroids[true, cluster_id].flatten)
      point_ids = @cluster_assigns.eq(cluster_id).where

      # A cluster with no assigned points keeps its current centroid.
      next if point_ids.empty?

      points = @data[true, point_ids]
      newcenter = points.mean(1)
      move = Distance.euclidean(centroid, newcenter)
      max_move = move if move > max_move
      @centroids[true, cluster_id] = newcenter
    end

    break if max_move < 0.001 # i.e., no movement
    break if @iterations >= @max_iter
  end

  # Sum of squared distances from each point to its assigned centroid, as
  # measured in the last assignment step.
  @error = (min_distances**2).sum
  @runtime = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at
  self
end
#silhouette ⇒ Object
259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 |
# File 'lib/kmeans-clusterer.rb', line 259

# Mean silhouette score over all points.
#
# For each point, +a+ is its dissimilarity to its own cluster and +b+ the
# smallest dissimilarity to any other cluster; the point's score is
# (b - a) / max(a, b). Reads @points and @clusters, which are populated by
# #finish.
#
# @return [Float] 1.0 when k < 2, otherwise the mean per-point score
def silhouette
  return 1.0 if @k < 2

  # calculate all point-to-point distances at once
  # uses more memory, but much faster
  point_distances = Distance.euclidean @data, @data

  scores = @points.map do |point|
    dissimilarities = @clusters.map do |cluster|
      dissimilarity(point.id, cluster.id, point_distances)
    end

    own_id = point.cluster.id
    a = dissimilarities[own_id]
    # set to Infinity so we can pick next closest via min()
    dissimilarities[own_id] = Float::INFINITY
    b = dissimilarities.min

    widest = [a, b].max
    # When both dissimilarities are zero the ratio is 0/0; score the point
    # as 0.0 (neither well nor badly placed) instead of letting NaN poison
    # the mean.
    widest.zero? ? 0.0 : (b - a) / widest
  end

  scores.reduce(:+) / scores.length # mean score for all points
end
#sorted_clusters(point = origin) ⇒ Object
252 253 254 255 256 257 |
# File 'lib/kmeans-clusterer.rb', line 252

# Returns the clusters ordered by the distance from their centroid to
# +point+, closest first.
#
# @param point [Point, NArray, Object] reference location; a Point is
#   unwrapped to its data, anything that is not an NArray is cast to one
# @return [Array] the clusters, sorted
def sorted_clusters(point = origin)
  coords = point.is_a?(Point) ? point.data : point
  coords = NArray.cast(coords, @typecode) unless coords.is_a?(NArray)

  to_centroids = Distance.euclidean(NArray.ref(@centroids), coords)

  @clusters
    .each_with_index
    .sort_by { |_cluster, idx| to_centroids[idx] }
    .map(&:first)
end