Class: MachineLearningWorkbench::Compressor::VectorQuantization

Inherits:
Object
  • Object
show all
Defined in:
lib/machine_learning_workbench/compressor/vector_quantization.rb

Overview

Standard Vector Quantization

Direct Known Subclasses

CopyVQ, DecayingLearningRateVQ, IncrDictVQ

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(ncentrs:, dims:, vrange:, lrate:, simil_type: nil, encoding_type: nil, init_centr_vrange: nil, rseed: Random.new_seed) ⇒ VectorQuantization

Returns a new instance of VectorQuantization.



11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 11

# Returns a new instance of VectorQuantization.
# @param ncentrs [Integer] number of centroids
# @param dims [Integer, Array<Integer>] dimension(s) of each centroid
# @param vrange [Array, Range] value range for the centroids' elements
# @param lrate [Numeric] learning rate, must lie in [0, 1] (see `check_lrate`)
# @param simil_type [Symbol] similarity measure — required, raises if nil
# @param encoding_type [Symbol] encoding scheme — required, raises if nil
# @param init_centr_vrange [Array, Range, nil] value range used when
#   initializing centroids; defaults to `vrange`
# @param rseed [Integer] seed for the internal RNG
def initialize ncentrs:, dims:, vrange:, lrate:, simil_type: nil, encoding_type: nil, init_centr_vrange: nil, rseed: Random.new_seed

  @rng = Random.new rseed # TODO: RNG CURRENTLY NOT USED!!

  @dims = Array(dims)
  check_lrate lrate # hack: so that we can overload it in dlr_vq
  @lrate = lrate
  @simil_type = simil_type || raise("missing simil_type")
  @encoding_type = encoding_type || raise("missing encoding_type")
  # BUGFIX: was `@init_centr_vrange ||= vrange` — the ivar is always nil at
  # this point, so the `init_centr_vrange:` argument was silently ignored
  @init_centr_vrange = init_centr_vrange || vrange
  @vrange = case vrange
    when Array
      raise ArgumentError, "vrange size not 2: #{vrange}" unless vrange.size == 2
      vrange.map &method(:Float)
    when Range
      [vrange.first, vrange.last].map &method(:Float)
    else raise ArgumentError, "vrange: unrecognized type: #{vrange.class}"
  end
  init_centrs nc: ncentrs
  @ntrains = [0]*ncentrs              # per-centroid number of trainings
  @utility = NArray.zeros [code_size] # trace how 'useful' are centroids to encodings
  @ncodes = 0
end

Instance Attribute Details

#centrsObject (readonly)

Returns the value of attribute centrs.



7
8
9
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 7

# Read-only accessor for the centroid matrix (built by `init_centrs`;
# one centroid per row — see `centrs[idx, true]` usage in `train_one`).
def centrs
  @centrs
end

#dimsObject (readonly)

Returns the value of attribute dims.



7
8
9
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 7

# Read-only accessor for the centroid dimensions
# (always an Array — coerced via `Array(dims)` in the constructor).
def dims
  @dims
end

#encoding_typeObject (readonly)

Returns the value of attribute encoding_type.



7
8
9
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 7

# Read-only accessor for the encoding scheme (Symbol, e.g. `:most_similar`,
# `:norm_ensemble`, `:sparse_coding` — see `encode` for the full list).
def encoding_type
  @encoding_type
end

#init_centr_vrangeObject (readonly)

Returns the value of attribute init_centr_vrange.



7
8
9
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 7

# Read-only accessor for the value range used to initialize new centroids
# (falls back to `vrange` when not provided to the constructor).
def init_centr_vrange
  @init_centr_vrange
end

#lrateObject (readonly)

Returns the value of attribute lrate.



7
8
9
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 7

# Read-only accessor for the learning rate
# (validated to lie within [0, 1] by `check_lrate`).
def lrate
  @lrate
end

#ncodesObject

Returns the value of attribute ncodes.



7
8
9
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 7

# Accessor for the number of encodings performed so far
# (incremented on every `encode` call; used for the utility moving average).
def ncodes
  @ncodes
end

#ntrainsObject (readonly)

Returns the value of attribute ntrains.



7
8
9
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 7

# Read-only accessor for the per-centroid training counts
# (Array of Integers, bumped in `train` for each trained centroid).
def ntrains
  @ntrains
end

#rngObject (readonly)

Returns the value of attribute rng.



7
8
9
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 7

# Read-only accessor for the random number generator
# (a `Random` seeded with `rseed`; per the constructor TODO it is
# currently not used anywhere in this class).
def rng
  @rng
end

#simil_typeObject (readonly)

Returns the value of attribute simil_type.



7
8
9
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 7

# Read-only accessor for the similarity measure type
# (Symbol; used as the default `type` in `similarities`).
def simil_type
  @simil_type
end

#utilityObject

Returns the value of attribute utility.



7
8
9
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 7

# Accessor for the per-centroid utility trace (NArray of length `code_size`),
# tracking how much each centroid contributes to encodings (see `encode`).
def utility
  @utility
end

#vrangeObject (readonly)

Returns the value of attribute vrange.



7
8
9
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 7

# Read-only accessor for the value range of centroid elements
# (normalized by the constructor to a two-element Array of Floats).
def vrange
  @vrange
end

Instance Method Details

#check_lrate(lrate) ⇒ Object

Verify `lrate` to be present and within unit bounds. Kept as a separate method only so it can be overloaded in `DecayingLearningRateVQ`.

Raises:

  • (ArgumentError)


46
47
48
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 46

# Verify `lrate` to be present and within unit bounds.
# Kept as a separate method only so it can be overloaded in `DecayingLearningRateVQ`.
# @raise [ArgumentError] if `lrate` is nil or outside [0, 1]
def check_lrate lrate
  valid = !lrate.nil? && lrate.between?(0, 1)
  raise ArgumentError, "Pass a `lrate` between 0 and 1" unless valid
end

#code_sizeObject

HACKKETY HACKKETY HACK (can’t wait to refactor after the deadline)



40
41
42
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 40

# Size of the code produced by `encode`.
# HACKKETY HACKKETY HACK (can't wait to refactor after the deadline)
# `:sparse_coding_v1` splits features into pos/neg halves, doubling the code.
def code_size
  if encoding_type == :sparse_coding_v1
    2 * ncentrs
  else
    ncentrs
  end
end

#encode(vec, type: encoding_type) ⇒ Object

Encode a vector. Tracks the utility of centroids based on how much they contribute to the encoding. TODO: `encode = Encodings.const_get(type)` in `initialize`. NOTE: hashes of lambdas or modules cannot access `ncodes` and `utility`. TODO: refactor anyway through a `stats` object, this thing is getting out of hand.



86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 86

# Encode a vector; tracks utility of centroids based on how much they
# contribute to the encoding.
# TODO: `encode = Encodings.const_get(type)` in initialize
# NOTE: hashes of lambdas or modules cannot access ncodes and utility
# TODO: refactor anyway through `stats` object, this thing is getting out of hand
# @param vec [NArray] the vector to encode
# @param type [Symbol] encoding scheme (defaults to `encoding_type`)
# @return the code: an Integer index for `:most_similar`, an NArray otherwise
def encode vec, type: encoding_type
  case type
  when :most_similar
    # hard assignment: the code is the index of the most similar centroid
    simils = similarities vec
    code = simils.max_index
    @ncodes += 1
    @utility[code] += 1
    code
  when :most_similar_ary
    # one-hot version of :most_similar
    simils = similarities vec
    code = simils.new_zeros
    code[simils.max_index] = 1
    @ncodes += 1
    @utility += code
    code
  when :ensemble
    # soft assignment: raw similarities as code; utility tracks the
    # normalized contribution of each centroid
    simils = similarities vec
    code = simils
    tot = simils.sum
    tot = 1 if tot < 1e-5  # HACK: avoid division by zero
    contrib = code / tot
    @ncodes += 1
    @utility += (contrib - utility) / ncodes # cumulative moving average
    code
  when :norm_ensemble
    # soft assignment with the code itself normalized to sum to 1
    simils = similarities vec
    tot = simils.sum
    # NOTE this actually makes a big discontinuity if the total is equal to zero.
    # Does that even ever happen? I guess only w/ reset img (zeros) as lone centroid.
    # Which after first gen is really useless and should just be dropped anyway...
    tot = 1 if tot < 1e-5  # HACK: avoid division by zero
    code = simils / tot
    @ncodes += 1
    @utility += (code - utility) / ncodes # cumulative moving average
    code
  when :sparse_coding_v1
    # NOTE(review): deliberately disabled — the raise below guards the
    # implementation until centroid normalization is configurable (see NOTE
    # further down); the remaining branch code is currently unreachable.
    raise "requires centroids normalized to unit length!"
    @encoder = nil if @encoder&.shape&.first != centrs.shape.first
    # Danafar & Cuccu: compact form linear regression encoder
    @encoder ||= (centrs.dot centrs.transpose).invert.dot centrs

    raw_code = @encoder.dot(vec)
    # separate positive and negative features (NOTE: all features will be positive)
    # i.e. split[0...n] = max {0, raw[i]}; split[n...2*n] = max {0, -raw[i]}
    # TODO: cite Coates & Ng
    # TODO: optimize and remove redundant variables
    split_code = raw_code.concatenate(-raw_code)
    split_code[split_code<0] = 0
    # normalize such that the code sums to 1
    norm_code = split_code / split_code.sum
    # Danafar: drop to say 80% of info (à la pca)
    thold = 0.2
    sparse_code = norm_code.dup
    sum = 0
    # NOTE: the last element in the sort below has the highest contribution and
    # should NEVER be put to 0, even if it could contribute alone to 100% of the
    # total
    # NOTE: upon further study I disagree this represent information content unless
    # the centroids are unit vectors. So I'm commenting this implementation now,
    # together with the following, until I implement a switch to normalize the
    # centroids based on configuration.



    # BUG IN NARRAY SORT!! ruby-numo/numo-narray#97
    # norm_code.sort_index[0...-1].each do |idx|
    norm_code.size.times.sort_by { |i| norm_code[i] }[0...-1].each do |idx|



      # zero out the smallest contributors until `thold` of the mass is dropped
      sparse_code[idx] = 0
      sum += norm_code[idx]
      break if sum >= thold # we know the code's total is normalized to 1 and has no negatives
    end
    code = sparse_code / sparse_code.sum # re-normalize sum to 1

    @ncodes += 1
    @utility += (code - utility) / ncodes # cumulative moving average
    code
   when :sparse_coding_v2
    # Cuccu & Danafar: incremental reconstruction encoding
    # turns out to be closely related to (Orthogonal) Matching Pursuit
    # NOTE(review): also deliberately disabled via the raise below, pending
    # configurable centroid normalization; the branch body is unreachable.
    raise "requires centroids normalized to unit length!"
    # return centrs.dot vec # speed test for the rest of the system
    sparse_code = NArray.zeros code_size
    resid = vec
    # cap the number of non-zero elements in the code
    max_nonzero = [1,ncentrs/3].max
    max_nonzero.times do |i|
      # OPT: remove msc from centrs at each loop
      # the algorithm should work even without this opt because
      # we are working on the residuals each time
      simils = centrs.dot resid



      # BUG IN NARRAY SORT!! ruby-numo/numo-narray#97
      # msc = simils.max_index
      simils = simils.to_a
      simils_abs = simils.map &:abs
      msc = simils_abs.index simils_abs.max # most similar centroid



      max_simil = simils[msc]
      # remember to distinguish here to use the pos/neg features trick
      sparse_code[msc] = max_simil
      # subtract the centroid's contribution and iterate on the residual
      reconstr = max_simil * centrs[msc, true]
      resid -= reconstr
      # puts "resid#{i} #{resid.abs.mean}" # if debug
      epsilon = 0.005
      # print resid.abs.mean, ' '
      # print sparse_code.to_a, ' '
      break if resid.abs.mean <= epsilon
    end

    # should normalize sum to 1?
    code = sparse_code #/ sparse_code.sum # normalize sum to 1

    @ncodes += 1
    @utility += (code - utility) / ncodes # cumulative moving average
    code
  when :sparse_coding
    # Cuccu: Direct residual encoding — greedily pick the closest centroid
    # (L1 distance), mark it with a binary 1, subtract it, repeat on residual
    # return centrs.dot vec # speed test for the rest of the system
    sparse_code = NArray.zeros code_size
    resid = vec
    # cap the number of non-zero elements in the code
    max_nonzero = [1,ncentrs/3].max
    max_nonzero.times do |i|
      # OPT: remove msc from centrs at each loop
      # the algorithm should work even without this opt because
      # we are working on the residuals each time
      diff = (centrs - resid).abs.sum(1)



      # BUG IN NARRAY SORT!! ruby-numo/numo-narray#97
      # msc = diff.max_index
      diff = diff.to_a
      msc = diff.index diff.min # most similar centroid



      min_diff = diff[msc]
      # remember to distinguish here to use the pos/neg features trick
      sparse_code[msc] = 1
      reconstr = centrs[msc, true]
      resid -= reconstr
      resid[(resid<0).where] = 0 # ignore artifacts introduced by the centroids in reconstruction

      # puts "resid#{i} #{resid.abs.mean}" # if debug
      epsilon = 0.005
      # print resid.abs.mean, ' ' if $ngen == 2; exit if $ngen==3
      # print sparse_code.to_a, ' ' if $ngen == 3; exit if $ngen==4
      break if resid.abs.mean <= epsilon
    end

    code = sparse_code
    @ncodes += 1
    @utility += (code - utility) / ncodes # cumulative moving average
    code
  else raise ArgumentError, "Unrecognized encode #{type}"
  end
end

#init_centrs(nc: ncentrs, base: nil, proport: nil) ⇒ Object

Initializes a list of centroids



51
52
53
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 51

# Initializes the list of centroids.
# @param nc [Integer] how many centroids to build
# @param base [NArray, nil] optional base centroid meshed into each new one
# @param proport [Numeric, nil] optional meshing proportion for `base`
def init_centrs nc: ncentrs, base: nil, proport: nil
  @centrs = Array.new(nc) { new_centr base, proport }.to_na
end

#most_similar_centr(vec) ⇒ Array<Integer, Float>

Returns the index and similarity of the centroid most similar to the vector

Returns:

  • (Array<Integer, Float>)

    the index of the most similar centroid, followed by the corresponding similarity



290
291
292
293
294
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 290

# Returns the index and similarity of the centroid most similar to `vec`.
# @return [Array(Integer, Float)] winning centroid index, then its similarity
def most_similar_centr vec
  sims = similarities vec
  winner = sims.max_index
  [winner, sims[winner]]
end

#ncentrsObject



35
36
37
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 35

# Number of centroids, read off the first axis of the centroid matrix.
def ncentrs
  @centrs.shape[0]
end

#new_centr(base = nil, proport = nil) ⇒ Object

Creates a new (random) centroid. If a base is passed, it is meshed with the random centroid. This is done to facilitate distributing the training across centroids. TODO: USE RNG HERE!!

Raises:

  • (ArgumentError)


59
60
61
62
63
64
65
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 59

# Creates a new (random) centroid.
# If a `base` is passed, it is meshed with the random centroid; this is done
# to facilitate distributing the training across centroids.
# TODO: USE RNG HERE!!
# @raise [ArgumentError] if only one of `base`/`proport` is provided
def new_centr base=nil, proport=nil
  raise ArgumentError, "Either both or none" if base.nil? ^ proport.nil?
  centr = NArray.new(*dims).rand(*init_centr_vrange)
  # mesh with the base in the given proportion when both are provided
  centr = centr * (1-proport) + base * proport if base && proport
  centr
end

#reconstr_error(vec, code: nil, type: encoding_type) ⇒ NArray

Per-pixel errors in reconstructing vector

Returns:



298
299
300
301
302
303
304
305
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 298

# Per-pixel errors in reconstructing vector.
# @param vec [NArray] the vector to reconstruct
# @param code optional pre-computed code for `vec` (computed here if nil)
# @param type [Symbol] encoding scheme (defaults to `encoding_type`)
# @return [NArray] per-element residual `vec - reconstruction`
def reconstr_error vec, code: nil, type: encoding_type
  code ||= encode vec, type: type
  resid = vec - reconstruction(code, type: type)
  # we ignore the extra stuff coming from the centroids,
  # only care that everything in the obs is represented in centrs
  # BUGFIX: was checking `encoding_type` (the instance default) instead of
  # the `type` parameter, so an explicit `type:` override skipped the clip
  resid[resid<0] = 0 if type == :sparse_coding
  resid
end

#reconstruction(code, type: encoding_type) ⇒ Object

Reconstruct vector from its code (encoding)



253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 253

# Reconstruct vector from its code (encoding).
# Each branch inverts the corresponding `encode` scheme.
# @param code the code produced by `encode` (Integer or NArray, per type)
# @param type [Symbol] encoding scheme (defaults to `encoding_type`)
# @raise [ArgumentError] on unrecognized `type`
def reconstruction code, type: encoding_type
  case type
  when :most_similar
    # code is the winning centroid's index
    centrs[code, true]
  when :most_similar_ary
    # code is one-hot: select the row flagged with 1
    centrs[code.eq(1), true]
  when :ensemble
    # weighted average of centroids, normalized by total contribution
    # tot = code.reduce :+
    # centrs.zip(code).map { |centr, contr| centr*contr/tot }.reduce :+
    centrs.dot(code) / code.sum
  when :norm_ensemble
    # code already sums to 1, so a plain weighted sum suffices
    centrs.dot code
    # centrs.zip(code).map { |centr, contr| centr*contr }.reduce :+
  when :sparse_coding_v1
    # NOTE(review): deliberately disabled (raise guards unreachable code)
    # until centroid normalization is configurable — mirrors `encode`
    raise "requires normalized centroids!"
    reconstr_code = code[0...(code.size/2)] - code[(code.size/2)..-1]
    reconstr = centrs.transpose.dot reconstr_code
  when :sparse_coding_v2
    # NOTE(review): deliberately disabled, same reason as :sparse_coding_v1
    raise "requires normalized centroids!"


    # BUG IN NARRAY DOT!! ruby-numo/numo-narray#99
    # reconstr = code.dot centrs
    reconstr = code.expand_dims(0).dot centrs


  when :sparse_coding
    # the code is binary, so just sum over the corresponding centroids
    # note: sum, not mean, because of how it's used in reconstr_error
    reconstr = centrs[code.cast_to(Numo::Bit).where, true].sum(0)
  else raise ArgumentError, "unrecognized reconstruction type: #{type}"
  end
end

#similarities(vec, type: simil_type) ⇒ Object

Computes similarities between vector and all centroids

Raises:

  • (NotImplementedError)


73
74
75
76
77
78
79
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 73

# Computes similarities between a vector and all centroids (dot product).
# @param vec [NArray] a flat (1-D) vector
# @param type [Symbol] similarity measure (defaults to `simil_type`)
# @raise [NotImplementedError] for multi-dimensional `vec`
def similarities vec, type: simil_type
  # only flat vectors are supported
  raise NotImplementedError if vec.shape.size > 1
  raise "need to check since centrs is a NArray now" if type == :mse
  centrs.dot vec
end

#train(vec_lst, debug: false) ⇒ Object

Train on vector list



321
322
323
324
325
326
327
328
329
330
331
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 321

# Train on a list of vectors.
# Two ways here:
# - Batch: canonical, centrs updated with each vec
# - Parallel: could be parallel either on simils or on training (?)
# Unsure on the correctness of either Parallel, let's stick with Batch
# @param vec_lst [Enumerable] training vectors
# @param debug [Boolean] print a progress dot per vector when true
def train vec_lst, debug: false
  vec_lst.each do |vec|
    winner = train_one vec
    print '.' if debug
    @ntrains[winner] += 1 if @ntrains
  end
end

#train_one(vec, eps: nil) ⇒ Integer

Train on one vector

Returns:

  • (Integer)

    index of trained centroid



309
310
311
312
313
314
315
316
317
318
# File 'lib/machine_learning_workbench/compressor/vector_quantization.rb', line 309

# Train on a single vector: blend the most similar centroid towards it.
# @param vec [NArray] the training vector
# @param eps accepted for interface compatibility, currently ignored
# @return [Integer] index of the trained centroid
def train_one vec, eps: nil
  winner, _simil = most_similar_centr vec
  # note: uhm that actually looks like a dot product... maybe faster?
  #   `[c[i], vec].dot([1-lrate, lrate])`
  # exponential moving average of the winner towards `vec` by `lrate`
  centrs[winner, true] = centrs[winner, true] * (1-lrate) + vec * lrate
  winner
end