Class: MachineLearningWorkbench::Compressor::VectorQuantization

Inherits:
Object
  • Object
show all
Defined in:
lib/machine_learning_workbench/compressor/vector_quantization.rb

Overview

Standard Vector Quantization

Direct Known Subclasses

CopyVQ, DecayingLearningRateVQ, IncrDictVQ

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(ncentrs:, dims:, vrange:, lrate:, simil_type: nil, encoding_type: nil, init_centr_vrange: nil, rseed: Random.new_seed) ⇒ VectorQuantization

Returns a new instance of VectorQuantization.



11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 11

# Returns a new instance of VectorQuantization.
# @param ncentrs [Integer] number of centroids
# @param dims [Integer, Array<Integer>] dimension(s) of each centroid
# @param vrange [Array, Range] value range for the centroids' elements
# @param lrate [Numeric] learning rate, must lie in [0, 1] (see `check_lrate`)
# @param simil_type [Symbol] similarity measure — required, raises if nil
# @param encoding_type [Symbol] encoding scheme — required, raises if nil
# @param init_centr_vrange [Array, Range, nil] value range used when
#   initializing centroids; defaults to `vrange`
# @param rseed [Integer] seed for the internal RNG
def initialize ncentrs:, dims:, vrange:, lrate:, simil_type: nil, encoding_type: nil, init_centr_vrange: nil, rseed: Random.new_seed

  @rng = Random.new rseed # TODO: RNG CURRENTLY NOT USED!!

  @dims = Array(dims)
  check_lrate lrate # hack: so that we can overload it in dlr_vq
  @lrate = lrate
  @simil_type = simil_type || raise("missing simil_type")
  @encoding_type = encoding_type || raise("missing encoding_type")
  # BUGFIX: was `@init_centr_vrange ||= vrange` — the ivar is always nil at
  # this point, so the `init_centr_vrange:` argument was silently ignored
  @init_centr_vrange = init_centr_vrange || vrange
  @vrange = case vrange
    when Array
      raise ArgumentError, "vrange size not 2: #{vrange}" unless vrange.size == 2
      vrange.map &method(:Float)
    when Range
      [vrange.first, vrange.last].map &method(:Float)
    else raise ArgumentError, "vrange: unrecognized type: #{vrange.class}"
  end
  init_centrs nc: ncentrs
  @ntrains = [0]*ncentrs              # per-centroid number of trainings
  @utility = NArray.zeros [code_size] # trace how 'useful' are centroids to encodings
  @ncodes = 0
end

Instance Attribute Details

#centrsObject (readonly)

Returns the value of attribute centrs.



7
8
9
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 7

# Read-only accessor for the centroid matrix (built by `init_centrs`;
# one centroid per row — see `centrs[idx, true]` usage in `train_one`).
def centrs
  @centrs
end

#dimsObject (readonly)

Returns the value of attribute dims.



7
8
9
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 7

# Read-only accessor for the centroid dimensions
# (always an Array — coerced via `Array(dims)` in the constructor).
def dims
  @dims
end

#encoding_typeObject (readonly)

Returns the value of attribute encoding_type.



7
8
9
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 7

# Read-only accessor for the encoding scheme (Symbol, e.g. `:most_similar`,
# `:norm_ensemble`, `:sparse_coding` — see `encode` for the full list).
def encoding_type
  @encoding_type
end

#init_centr_vrangeObject (readonly)

Returns the value of attribute init_centr_vrange.



7
8
9
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 7

# Read-only accessor for the value range used to initialize new centroids
# (falls back to `vrange` when not provided to the constructor).
def init_centr_vrange
  @init_centr_vrange
end

#lrateObject (readonly)

Returns the value of attribute lrate.



7
8
9
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 7

# Read-only accessor for the learning rate
# (validated to lie within [0, 1] by `check_lrate`).
def lrate
  @lrate
end

#ncodesObject

Returns the value of attribute ncodes.



7
8
9
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 7

# Accessor for the number of encodings performed so far
# (incremented on every `encode` call; used for the utility moving average).
def ncodes
  @ncodes
end

#ntrainsObject (readonly)

Returns the value of attribute ntrains.



7
8
9
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 7

# Read-only accessor for the per-centroid training counts
# (Array of Integers, bumped in `train` for each trained centroid).
def ntrains
  @ntrains
end

#rngObject (readonly)

Returns the value of attribute rng.



7
8
9
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 7

# Read-only accessor for the random number generator
# (a `Random` seeded with `rseed`; per the constructor TODO it is
# currently not used anywhere in this class).
def rng
  @rng
end

#simil_typeObject (readonly)

Returns the value of attribute simil_type.



7
8
9
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 7

# Read-only accessor for the similarity measure type
# (Symbol; used as the default `type` in `similarities`).
def simil_type
  @simil_type
end

#utilityObject

Returns the value of attribute utility.



7
8
9
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 7

# Accessor for the per-centroid utility trace (NArray of length `code_size`),
# tracking how much each centroid contributes to encodings (see `encode`).
def utility
  @utility
end

#vrangeObject (readonly)

Returns the value of attribute vrange.



7
8
9
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 7

# Read-only accessor for the value range of centroid elements
# (normalized by the constructor to a two-element Array of Floats).
def vrange
  @vrange
end

Instance Method Details

#check_lrate(lrate) ⇒ Object

Verify `lrate` to be present and within unit bounds. Kept as a separate method only so it can be overloaded in `DecayingLearningRateVQ`.

Raises:

  • (ArgumentError)


46
47
48
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 46

# Verify `lrate` to be present and within unit bounds.
# Kept as a separate method only so it can be overloaded in `DecayingLearningRateVQ`.
# @raise [ArgumentError] if `lrate` is nil or outside [0, 1]
def check_lrate lrate
  valid = !lrate.nil? && lrate.between?(0, 1)
  raise ArgumentError, "Pass a `lrate` between 0 and 1" unless valid
end

#code_sizeObject

HACKKETY HACKKETY HACK (can’t wait to refactor after the deadline)



40
41
42
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 40

# Size of the code produced by `encode`.
# HACKKETY HACKKETY HACK (can't wait to refactor after the deadline)
# `:sparse_coding_v1` splits features into pos/neg halves, doubling the code.
def code_size
  if encoding_type == :sparse_coding_v1
    2 * ncentrs
  else
    ncentrs
  end
end

#encode(vec, type: encoding_type) ⇒ Object

Encode a vector. Tracks the utility of centroids based on how much they contribute to the encoding. TODO: `encode = Encodings.const_get(type)` in `initialize`. NOTE: hashes of lambdas or modules cannot access `ncodes` and `utility`. TODO: refactor anyway through a `stats` object, this thing is getting out of hand.



86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 86

# Encode a vector; tracks utility of centroids based on how much they
# contribute to the encoding.
# TODO: `encode = Encodings.const_get(type)` in initialize
# NOTE: hashes of lambdas or modules cannot access ncodes and utility
# TODO: refactor anyway through `stats` object, this thing is getting out of hand
# @param vec [NArray] the vector to encode
# @param type [Symbol] encoding scheme (defaults to `encoding_type`)
# @return the code: an Integer index for `:most_similar`, an NArray otherwise
def encode vec, type: encoding_type
  case type
  when :most_similar
    # hard assignment: the code is the index of the most similar centroid
    simils = similarities vec
    code = simils.max_index
    @ncodes += 1
    @utility[code] += 1
    code
  when :most_similar_ary
    # one-hot version of :most_similar
    simils = similarities vec
    code = simils.new_zeros
    code[simils.max_index] = 1
    @ncodes += 1
    @utility += code
    code
  when :ensemble
    # soft assignment: raw similarities as code; utility tracks the
    # normalized contribution of each centroid
    simils = similarities vec
    code = simils
    tot = simils.sum
    tot = 1 if tot < 1e-5  # HACK: avoid division by zero
    contrib = code / tot
    @ncodes += 1
    @utility += (contrib - utility) / ncodes # cumulative moving average
    code
  when :norm_ensemble
    # soft assignment with the code itself normalized to sum to 1
    simils = similarities vec
    tot = simils.sum
    # NOTE this actually makes a big discontinuity if the total is equal to zero.
    # Does that even ever happen? I guess only w/ reset img (zeros) as lone centroid.
    # Which after first gen is really useless and should just be dropped anyway...
    tot = 1 if tot < 1e-5  # HACK: avoid division by zero
    code = simils / tot
    @ncodes += 1
    @utility += (code - utility) / ncodes # cumulative moving average
    code
  when :sparse_coding_v1
    # NOTE(review): deliberately disabled — the raise below guards the
    # implementation until centroid normalization is configurable (see NOTE
    # further down); the remaining branch code is currently unreachable.
    raise "requires centroids normalized to unit length!"
    @encoder = nil if @encoder&.shape&.first != centrs.shape.first
    # Danafar & Cuccu: compact form linear regression encoder
    @encoder ||= (centrs.dot centrs.transpose).invert.dot centrs

    raw_code = @encoder.dot(vec)
    # separate positive and negative features (NOTE: all features will be positive)
    # i.e. split[0...n] = max {0, raw[i]}; split[n...2*n] = max {0, -raw[i]}
    # TODO: cite Coates & Ng
    # TODO: optimize and remove redundant variables
    split_code = raw_code.concatenate(-raw_code)
    split_code[split_code<0] = 0
    # normalize such that the code sums to 1
    norm_code = split_code / split_code.sum
    # Danafar: drop to say 80% of info (à la pca)
    thold = 0.2
    sparse_code = norm_code.dup
    sum = 0
    # NOTE: the last element in the sort below has the highest contribution and
    # should NEVER be put to 0, even if it could contribute alone to 100% of the
    # total
    # NOTE: upon further study I disagree this represent information content unless
    # the centroids are unit vectors. So I'm commenting this implementation now,
    # together with the following, until I implement a switch to normalize the
    # centroids based on configuration.



    # BUG IN NARRAY SORT!! ruby-numo/numo-narray#97
    # norm_code.sort_index[0...-1].each do |idx|
    norm_code.size.times.sort_by { |i| norm_code[i] }[0...-1].each do |idx|



      # zero out the smallest contributors until `thold` of the mass is dropped
      sparse_code[idx] = 0
      sum += norm_code[idx]
      break if sum >= thold # we know the code's total is normalized to 1 and has no negatives
    end
    code = sparse_code / sparse_code.sum # re-normalize sum to 1

    @ncodes += 1
    @utility += (code - utility) / ncodes # cumulative moving average
    code
   when :sparse_coding_v2
    # Cuccu & Danafar: incremental reconstruction encoding
    # turns out to be closely related to (Orthogonal) Matching Pursuit
    # NOTE(review): also deliberately disabled via the raise below, pending
    # configurable centroid normalization; the branch body is unreachable.
    raise "requires centroids normalized to unit length!"
    # return centrs.dot vec # speed test for the rest of the system
    sparse_code = NArray.zeros code_size
    resid = vec
    # cap the number of non-zero elements in the code
    max_nonzero = [1,ncentrs/3].max
    max_nonzero.times do |i|
      # OPT: remove msc from centrs at each loop
      # the algorithm should work even without this opt because
      # we are working on the residuals each time
      simils = centrs.dot resid



      # BUG IN NARRAY SORT!! ruby-numo/numo-narray#97
      # msc = simils.max_index
      simils = simils.to_a
      simils_abs = simils.map &:abs
      msc = simils_abs.index simils_abs.max # most similar centroid



      max_simil = simils[msc]
      # remember to distinguish here to use the pos/neg features trick
      sparse_code[msc] = max_simil
      # subtract the centroid's contribution and iterate on the residual
      reconstr = max_simil * centrs[msc, true]
      resid -= reconstr
      # puts "resid#{i} #{resid.abs.mean}" # if debug
      epsilon = 0.005
      # print resid.abs.mean, ' '
      # print sparse_code.to_a, ' '
      break if resid.abs.mean <= epsilon
    end

    # should normalize sum to 1?
    code = sparse_code #/ sparse_code.sum # normalize sum to 1

    @ncodes += 1
    @utility += (code - utility) / ncodes # cumulative moving average
    code
  when :sparse_coding
    # Cuccu: Direct residual encoding — greedily pick the closest centroid
    # (L1 distance), mark it with a binary 1, subtract it, repeat on residual
    # return centrs.dot vec # speed test for the rest of the system
    sparse_code = NArray.zeros code_size
    resid = vec
    # cap the number of non-zero elements in the code
    max_nonzero = [1,ncentrs/3].max
    max_nonzero.times do |i|
      # OPT: remove msc from centrs at each loop
      # the algorithm should work even without this opt because
      # we are working on the residuals each time
      diff = (centrs - resid).abs.sum(1)



      # BUG IN NARRAY SORT!! ruby-numo/numo-narray#97
      # msc = diff.max_index
      diff = diff.to_a
      msc = diff.index diff.min # most similar centroid



      min_diff = diff[msc]
      # remember to distinguish here to use the pos/neg features trick
      sparse_code[msc] = 1
      reconstr = centrs[msc, true]
      resid -= reconstr
      resid[(resid<0).where] = 0 # ignore artifacts introduced by the centroids in reconstruction

      # puts "resid#{i} #{resid.abs.mean}" # if debug
      epsilon = 0.005
      # print resid.abs.mean, ' ' if $ngen == 2; exit if $ngen==3
      # print sparse_code.to_a, ' ' if $ngen == 3; exit if $ngen==4
      break if resid.abs.mean <= epsilon
    end

    code = sparse_code
    @ncodes += 1
    @utility += (code - utility) / ncodes # cumulative moving average
    code
  else raise ArgumentError, "Unrecognized encode #{type}"
  end
end

#init_centrs(nc: ncentrs, base: nil, proport: nil) ⇒ Object

Initializes a list of centroids



51
52
53
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 51

# Initializes the list of centroids.
# @param nc [Integer] how many centroids to build
# @param base [NArray, nil] optional base centroid meshed into each new one
# @param proport [Numeric, nil] optional meshing proportion for `base`
def init_centrs nc: ncentrs, base: nil, proport: nil
  @centrs = Array.new(nc) { new_centr base, proport }.to_na
end

#most_similar_centr(vec) ⇒ Array<Integer, Float>

Returns the index and similarity of the centroid most similar to the vector

Returns:

  • (Array<Integer, Float>)

    the index of the most similar centroid, followed by the corresponding similarity



290
291
292
293
294
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 290

# Returns the index and similarity of the centroid most similar to `vec`.
# @return [Array(Integer, Float)] winning centroid index, then its similarity
def most_similar_centr vec
  sims = similarities vec
  winner = sims.max_index
  [winner, sims[winner]]
end

#ncentrsObject



35
36
37
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 35

# Number of centroids, read off the first axis of the centroid matrix.
def ncentrs
  @centrs.shape[0]
end

#new_centr(base = nil, proport = nil) ⇒ Object

Creates a new (random) centroid. If a base is passed, it is meshed with the random centroid. This is done to facilitate distributing the training across centroids. TODO: USE RNG HERE!!

Raises:

  • (ArgumentError)


59
60
61
62
63
64
65
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 59

# Creates a new (random) centroid.
# If a `base` is passed, it is meshed with the random centroid; this is done
# to facilitate distributing the training across centroids.
# TODO: USE RNG HERE!!
# @raise [ArgumentError] if only one of `base`/`proport` is provided
def new_centr base=nil, proport=nil
  raise ArgumentError, "Either both or none" if base.nil? ^ proport.nil?
  centr = NArray.new(*dims).rand(*init_centr_vrange)
  # mesh with the base in the given proportion when both are provided
  centr = centr * (1-proport) + base * proport if base && proport
  centr
end

#reconstr_error(vec, code: nil, type: encoding_type) ⇒ NArray

Per-pixel errors in reconstructing vector

Returns:



298
299
300
301
302
303
304
305
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 298

# Per-pixel errors in reconstructing vector.
# @param vec [NArray] the vector to reconstruct
# @param code optional pre-computed code for `vec` (computed here if nil)
# @param type [Symbol] encoding scheme (defaults to `encoding_type`)
# @return [NArray] per-element residual `vec - reconstruction`
def reconstr_error vec, code: nil, type: encoding_type
  code ||= encode vec, type: type
  resid = vec - reconstruction(code, type: type)
  # we ignore the extra stuff coming from the centroids,
  # only care that everything in the obs is represented in centrs
  # BUGFIX: was checking `encoding_type` (the instance default) instead of
  # the `type` parameter, so an explicit `type:` override skipped the clip
  resid[resid<0] = 0 if type == :sparse_coding
  resid
end

#reconstruction(code, type: encoding_type) ⇒ Object

Reconstruct vector from its code (encoding)



253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 253

# Reconstruct vector from its code (encoding).
# Each branch inverts the corresponding `encode` scheme.
# @param code the code produced by `encode` (Integer or NArray, per type)
# @param type [Symbol] encoding scheme (defaults to `encoding_type`)
# @raise [ArgumentError] on unrecognized `type`
def reconstruction code, type: encoding_type
  case type
  when :most_similar
    # code is the winning centroid's index
    centrs[code, true]
  when :most_similar_ary
    # code is one-hot: select the row flagged with 1
    centrs[code.eq(1), true]
  when :ensemble
    # weighted average of centroids, normalized by total contribution
    # tot = code.reduce :+
    # centrs.zip(code).map { |centr, contr| centr*contr/tot }.reduce :+
    centrs.dot(code) / code.sum
  when :norm_ensemble
    # code already sums to 1, so a plain weighted sum suffices
    centrs.dot code
    # centrs.zip(code).map { |centr, contr| centr*contr }.reduce :+
  when :sparse_coding_v1
    # NOTE(review): deliberately disabled (raise guards unreachable code)
    # until centroid normalization is configurable — mirrors `encode`
    raise "requires normalized centroids!"
    reconstr_code = code[0...(code.size/2)] - code[(code.size/2)..-1]
    reconstr = centrs.transpose.dot reconstr_code
  when :sparse_coding_v2
    # NOTE(review): deliberately disabled, same reason as :sparse_coding_v1
    raise "requires normalized centroids!"


    # BUG IN NARRAY DOT!! ruby-numo/numo-narray#99
    # reconstr = code.dot centrs
    reconstr = code.expand_dims(0).dot centrs


  when :sparse_coding
    # the code is binary, so just sum over the corresponding centroids
    # note: sum, not mean, because of how it's used in reconstr_error
    reconstr = centrs[code.cast_to(Numo::Bit).where, true].sum(0)
  else raise ArgumentError, "unrecognized reconstruction type: #{type}"
  end
end

#similarities(vec, type: simil_type) ⇒ Object

Computes similarities between vector and all centroids

Raises:

  • (NotImplementedError)


73
74
75
76
77
78
79
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 73

# Computes similarities between a vector and all centroids (dot product).
# @param vec [NArray] a flat (1-D) vector
# @param type [Symbol] similarity measure (defaults to `simil_type`)
# @raise [NotImplementedError] for multi-dimensional `vec`
def similarities vec, type: simil_type
  # only flat vectors are supported
  raise NotImplementedError if vec.shape.size > 1
  raise "need to check since centrs is a NArray now" if type == :mse
  centrs.dot vec
end

#train(vec_lst, debug: false) ⇒ Object

Train on vector list



321
322
323
324
325
326
327
328
329
330
331
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 321

# Train on a list of vectors.
# Two ways here:
# - Batch: canonical, centrs updated with each vec
# - Parallel: could be parallel either on simils or on training (?)
# Unsure on the correctness of either Parallel, let's stick with Batch
# @param vec_lst [Enumerable] training vectors
# @param debug [Boolean] print a progress dot per vector when true
def train vec_lst, debug: false
  vec_lst.each do |vec|
    winner = train_one vec
    print '.' if debug
    @ntrains[winner] += 1 if @ntrains
  end
end

#train_one(vec, eps: nil) ⇒ Integer

Train on one vector

Returns:

  • (Integer)

    index of trained centroid



309
310
311
312
313
314
315
316
317
318
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 309

# Train on a single vector: blend the most similar centroid towards it.
# @param vec [NArray] the training vector
# @param eps accepted for interface compatibility, currently ignored
# @return [Integer] index of the trained centroid
def train_one vec, eps: nil
  winner, _simil = most_similar_centr vec
  # note: uhm that actually looks like a dot product... maybe faster?
  #   `[c[i], vec].dot([1-lrate, lrate])`
  # exponential moving average of the winner towards `vec` by `lrate`
  centrs[winner, true] = centrs[winner, true] * (1-lrate) + vec * lrate
  winner
end