Class: LSH::Index

Inherits:
Object
  • Object
show all
Defined in:
lib/lsh/index.rb

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(parameters = {}, storage = LSH::Storage::Memory.new) ⇒ Index

Returns a new instance of Index.



23
24
25
26
27
28
29
30
31
32
33
34
35
# File 'lib/lsh/index.rb', line 23

def initialize(parameters = {}, storage = LSH::Storage::Memory.new)
  @storage = storage
  unless storage.has_index?
    storage.parameters = parameters
    # Initializing projections and buckets
    storage.projections = generate_projections(
      parameters[:dim], 
      parameters[:number_of_random_vectors], 
      parameters[:number_of_independent_projections]
    )
    parameters[:number_of_independent_projections].times { |i| storage.create_new_bucket }
  end
end

Instance Attribute Details

#storageObject (readonly)

Returns the value of attribute storage.



21
22
23
# File 'lib/lsh/index.rb', line 21

def storage
  @storage
end

Class Method Details

.load(storage) ⇒ Object



37
38
39
# File 'lib/lsh/index.rb', line 37

def self.load(storage)
  Index.new(storage.parameters, storage) if storage.has_index? 
end

Instance Method Details

#add(vector, id = nil) ⇒ Object



41
42
43
44
45
46
47
48
49
50
# File 'lib/lsh/index.rb', line 41

def add(vector, id = nil)
  vector_hash = vector.hash
  storage.add_vector(vector, vector_hash)
  storage.add_vector_id(vector_hash, id) if id
  hashes(vector).each_with_index do |hash, i|
    hash_i = array_to_hash(hash)
    bucket = storage.find_bucket(i)
    storage.add_vector_hash_to_bucket(bucket, hash_i, vector_hash)
  end
end

#array_to_hash(array) ⇒ Object



148
149
150
151
152
153
154
155
156
157
158
159
# File 'lib/lsh/index.rb', line 148

def array_to_hash(array)
  return array.hash
  # Derives a 28 bit hash value from an array of integers
  # http://stackoverflow.com/questions/2909106/python-whats-a-correct-and-good-way-to-implement-hash#2909572
  # TODO: Check it works for non-binary LSH
  #return 0 if array.size == 0
  #value = (array.first << 7)
  #array.each do |v|
  #  value = (101 * value + v) & 0xffffff
  #end
  #value
end

#generate_projection(dim, k) ⇒ Object



169
170
171
# File 'lib/lsh/index.rb', line 169

def generate_projection(dim, k)
 MathUtil.random_gaussian_matrix(k, dim)
end

#generate_projections(dim, k, l) ⇒ Object



161
162
163
164
165
166
167
# File 'lib/lsh/index.rb', line 161

def generate_projections(dim, k, l)
  projections = []
  l.times do |i|
    projections << generate_projection(dim, k)
  end
  projections
end

#hash(vector, projection, bias = true) ⇒ Object



116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# File 'lib/lsh/index.rb', line 116

def hash(vector, projection, bias = true)
  hash = []
  dot_products = (projection * vector.transpose).column(0).to_a
  window = storage.parameters[:window]
  dot_products.each do |dot_product|
    if window == Float::INFINITY # Binary LSH
      if dot_product >= 0
        hash << 1
      else
        hash << 0
      end
    else
      b = bias ? MathUtil.random_uniform : 0.0
      hash << (b + dot_product / window).floor
    end
  end
  hash
end

#hashes(vector) ⇒ Object



108
109
110
111
112
113
114
# File 'lib/lsh/index.rb', line 108

def hashes(vector)
  hashes = []
  storage.projections.each do |projection|
    hashes << hash(vector, projection)
  end
  hashes
end

#hashes_are_binary?Boolean

Returns:

  • (Boolean)


135
136
137
# File 'lib/lsh/index.rb', line 135

def hashes_are_binary?
  storage.parameters[:window] == Float::INFINITY
end

#id_to_vector(id) ⇒ Object



56
57
58
# File 'lib/lsh/index.rb', line 56

def id_to_vector(id)
  storage.id_to_vector(id)
end

#inspectObject



177
178
179
# File 'lib/lsh/index.rb', line 177

def inspect
  "#<LSH index; dimension: #{storage.parameters[:dim]}; window size: #{storage.parameters[:window]}; #{storage.parameters[:number_of_random_vectors]} random vectors; #{storage.parameters[:number_of_independent_projections]} independent projections>"
end

#multiprobe_hashes_arrays(hash_arrays, multiprobe_radius) ⇒ Object



89
90
91
92
93
94
95
96
97
98
99
# File 'lib/lsh/index.rb', line 89

def multiprobe_hashes_arrays(hash_arrays, multiprobe_radius)
  mp_arrays = []
  (1..multiprobe_radius).to_a.each do |radius|
    (0..(storage.parameters[:number_of_random_vectors] - 1)).to_a.combination(radius).each do |flips|
      probes = Marshal.load(Marshal.dump(hash_arrays))
      probes.each { |probe| flips.each { |d| probe[d] = (probe[d] == 1) ? 0 : 1 } }
      mp_arrays << probes
    end
  end
  mp_arrays
end

#order_results_by_similarity(vector, results) ⇒ Object



101
102
103
104
105
106
# File 'lib/lsh/index.rb', line 101

def order_results_by_similarity(vector, results)
  # Faster than vectors.sort - we precompute all similarities to vector
  # and order using those
  similarities = results.map { |result| [ result[:hash], result[:id], result[:data], similarity(vector, result[:data]) ] }
  similarities.sort_by { |hash, id, vector, sim| sim } .reverse .map { |vs| { :hash => vs[0], :id => vs[1], :data => vs[2] } }
end

#query(vector, multiprobe_radius = 0) ⇒ Object



60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# File 'lib/lsh/index.rb', line 60

def query(vector, multiprobe_radius = 0)
  hash_arrays = hashes(vector)
  hashes = hash_arrays.map { |a| array_to_hash(a) }
  results = storage.query_buckets(hashes)
  # Multiprobe LSH
  # Take query hashes, move them around at radius r, and use them to do another query
  # TODO: only works for binary LSH atm
  if multiprobe_radius > 0
    raise Exception.new("Non-zero multiprobe radius only implemented for binary LSH") unless hashes_are_binary?
    mp_arrays = multiprobe_hashes_arrays(hash_arrays, multiprobe_radius)
    mp_arrays.each do |probes_arrays|
      probes_hashes = probes_arrays.map { |a| array_to_hash(a) }
      results += storage.query_buckets(probes_hashes)
    end
    results.uniq! { |result| result[:hash] }
  end
  order_results_by_similarity(vector, results)
end

#query_ids(id, multiprobe_radius = 0) ⇒ Object



79
80
81
82
# File 'lib/lsh/index.rb', line 79

def query_ids(id, multiprobe_radius = 0)
  vector = id_to_vector(id)
  query_ids_by_vector(vector, multiprobe_radius)
end

#query_ids_by_vector(vector, multiprobe_radius = 0) ⇒ Object



84
85
86
87
# File 'lib/lsh/index.rb', line 84

def query_ids_by_vector(vector, multiprobe_radius = 0)
  results = query(vector, multiprobe_radius)
  results.map { |result| vector_hash_to_id(result[:hash]) }
end

#random_vector(dim) ⇒ Object



139
140
141
# File 'lib/lsh/index.rb', line 139

def random_vector(dim)
  MathUtil.random_gaussian_matrix(1, dim)
end

#random_vector_unit(dim) ⇒ Object



143
144
145
146
# File 'lib/lsh/index.rb', line 143

def random_vector_unit(dim)
  r = random_vector(dim)
  r /= MathUtil.norm(r)
end

#similarity(v1, v2) ⇒ Object



173
174
175
# File 'lib/lsh/index.rb', line 173

def similarity(v1, v2)
  MathUtil.dot(v1, v2)
end

#vector_hash_to_id(vector_hash) ⇒ Object



52
53
54
# File 'lib/lsh/index.rb', line 52

def vector_hash_to_id(vector_hash)
  storage.vector_hash_to_id(vector_hash)
end