Class: LSH::Index
- Inherits:
-
Object
- Object
- LSH::Index
- Defined in:
- lib/lsh/index.rb
Instance Attribute Summary collapse
-
#storage ⇒ Object
readonly
Returns the value of attribute storage.
Class Method Summary collapse
Instance Method Summary collapse
- #add(vector, id = nil) ⇒ Object
- #array_to_hash(array) ⇒ Object
- #generate_projection(dim, k) ⇒ Object
- #generate_projections(dim, k, l) ⇒ Object
- #hash(vector, projection, bias = true) ⇒ Object
- #hashes(vector) ⇒ Object
- #hashes_are_binary? ⇒ Boolean
- #id_to_vector(id) ⇒ Object
-
#initialize(parameters = {}, storage = LSH::Storage::Memory.new) ⇒ Index
constructor
A new instance of Index.
- #inspect ⇒ Object
- #multiprobe_hashes_arrays(hash_arrays, multiprobe_radius) ⇒ Object
- #order_results_by_similarity(vector, results) ⇒ Object
- #query(vector, multiprobe_radius = 0) ⇒ Object
- #query_ids(id, multiprobe_radius = 0) ⇒ Object
- #query_ids_by_vector(vector, multiprobe_radius = 0) ⇒ Object
- #random_vector(dim) ⇒ Object
- #random_vector_unit(dim) ⇒ Object
- #similarity(v1, v2) ⇒ Object
- #vector_hash_to_id(vector_hash) ⇒ Object
Constructor Details
#initialize(parameters = {}, storage = LSH::Storage::Memory.new) ⇒ Index
Returns a new instance of Index.
23 24 25 26 27 28 29 30 31 32 33 34 35 |
# File 'lib/lsh/index.rb', line 23 def initialize(parameters = {}, storage = LSH::Storage::Memory.new) @storage = storage unless storage.has_index? storage.parameters = parameters # Initializing projections and buckets storage.projections = generate_projections( parameters[:dim], parameters[:number_of_random_vectors], parameters[:number_of_independent_projections] ) parameters[:number_of_independent_projections].times { |i| storage.create_new_bucket } end end |
Instance Attribute Details
#storage ⇒ Object (readonly)
Returns the value of attribute storage.
21 22 23 |
# File 'lib/lsh/index.rb', line 21 def storage @storage end |
Class Method Details
Instance Method Details
#add(vector, id = nil) ⇒ Object
41 42 43 44 45 46 47 48 49 50 |
# File 'lib/lsh/index.rb', line 41 def add(vector, id = nil) vector_hash = vector.hash storage.add_vector(vector, vector_hash) storage.add_vector_id(vector_hash, id) if id hashes(vector).each_with_index do |hash, i| hash_i = array_to_hash(hash) bucket = storage.find_bucket(i) storage.add_vector_hash_to_bucket(bucket, hash_i, vector_hash) end end |
#array_to_hash(array) ⇒ Object
148 149 150 151 152 153 154 155 156 157 158 159 |
# File 'lib/lsh/index.rb', line 148 def array_to_hash(array) return array.hash # Derives a 28 bit hash value from an array of integers # http://stackoverflow.com/questions/2909106/python-whats-a-correct-and-good-way-to-implement-hash#2909572 # TODO: Check it works for non-binary LSH #return 0 if array.size == 0 #value = (array.first << 7) #array.each do |v| # value = (101 * value + v) & 0xffffff #end #value end |
#generate_projection(dim, k) ⇒ Object
169 170 171 |
# File 'lib/lsh/index.rb', line 169 def generate_projection(dim, k) MathUtil.random_gaussian_matrix(k, dim) end |
#generate_projections(dim, k, l) ⇒ Object
161 162 163 164 165 166 167 |
# File 'lib/lsh/index.rb', line 161 def generate_projections(dim, k, l) projections = [] l.times do |i| projections << generate_projection(dim, k) end projections end |
#hash(vector, projection, bias = true) ⇒ Object
116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
# File 'lib/lsh/index.rb', line 116 def hash(vector, projection, bias = true) hash = [] dot_products = (projection * vector.transpose).column(0).to_a window = storage.parameters[:window] dot_products.each do |dot_product| if window == Float::INFINITY # Binary LSH if dot_product >= 0 hash << 1 else hash << 0 end else b = bias ? MathUtil.random_uniform : 0.0 hash << (b + dot_product / window).floor end end hash end |
#hashes(vector) ⇒ Object
108 109 110 111 112 113 114 |
# File 'lib/lsh/index.rb', line 108 def hashes(vector) hashes = [] storage.projections.each do |projection| hashes << hash(vector, projection) end hashes end |
#hashes_are_binary? ⇒ Boolean
135 136 137 |
# File 'lib/lsh/index.rb', line 135 def hashes_are_binary? storage.parameters[:window] == Float::INFINITY end |
#id_to_vector(id) ⇒ Object
56 57 58 |
# File 'lib/lsh/index.rb', line 56 def id_to_vector(id) storage.id_to_vector(id) end |
#inspect ⇒ Object
177 178 179 |
# File 'lib/lsh/index.rb', line 177 def inspect "#<LSH index; dimension: #{storage.parameters[:dim]}; window size: #{storage.parameters[:window]}; #{storage.parameters[:number_of_random_vectors]} random vectors; #{storage.parameters[:number_of_independent_projections]} independent projections>" end |
#multiprobe_hashes_arrays(hash_arrays, multiprobe_radius) ⇒ Object
89 90 91 92 93 94 95 96 97 98 99 |
# File 'lib/lsh/index.rb', line 89 def multiprobe_hashes_arrays(hash_arrays, multiprobe_radius) mp_arrays = [] (1..multiprobe_radius).to_a.each do |radius| (0..(storage.parameters[:number_of_random_vectors] - 1)).to_a.combination(radius).each do |flips| probes = Marshal.load(Marshal.dump(hash_arrays)) probes.each { |probe| flips.each { |d| probe[d] = (probe[d] == 1) ? 0 : 1 } } mp_arrays << probes end end mp_arrays end |
#order_results_by_similarity(vector, results) ⇒ Object
101 102 103 104 105 106 |
# File 'lib/lsh/index.rb', line 101 def order_results_by_similarity(vector, results) # Faster than vectors.sort - we precompute all similarities to vector # and order using those similarities = results.map { |result| [ result[:hash], result[:id], result[:data], similarity(vector, result[:data]) ] } similarities.sort_by { |hash, id, vector, sim| sim } .reverse .map { |vs| { :hash => vs[0], :id => vs[1], :data => vs[2] } } end |
#query(vector, multiprobe_radius = 0) ⇒ Object
60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 |
# File 'lib/lsh/index.rb', line 60 def query(vector, multiprobe_radius = 0) hash_arrays = hashes(vector) hashes = hash_arrays.map { |a| array_to_hash(a) } results = storage.query_buckets(hashes) # Multiprobe LSH # Take query hashes, move them around at radius r, and use them to do another query # TODO: only works for binary LSH atm if multiprobe_radius > 0 raise Exception.new("Non-zero multiprobe radius only implemented for binary LSH") unless hashes_are_binary? mp_arrays = multiprobe_hashes_arrays(hash_arrays, multiprobe_radius) mp_arrays.each do |probes_arrays| probes_hashes = probes_arrays.map { |a| array_to_hash(a) } results += storage.query_buckets(probes_hashes) end results.uniq! { |result| result[:hash] } end order_results_by_similarity(vector, results) end |
#query_ids(id, multiprobe_radius = 0) ⇒ Object
79 80 81 82 |
# File 'lib/lsh/index.rb', line 79 def query_ids(id, multiprobe_radius = 0) vector = id_to_vector(id) query_ids_by_vector(vector, multiprobe_radius) end |
#query_ids_by_vector(vector, multiprobe_radius = 0) ⇒ Object
84 85 86 87 |
# File 'lib/lsh/index.rb', line 84 def query_ids_by_vector(vector, multiprobe_radius = 0) results = query(vector, multiprobe_radius) results.map { |result| vector_hash_to_id(result[:hash]) } end |
#random_vector(dim) ⇒ Object
139 140 141 |
# File 'lib/lsh/index.rb', line 139 def random_vector(dim) MathUtil.random_gaussian_matrix(1, dim) end |
#random_vector_unit(dim) ⇒ Object
143 144 145 146 |
# File 'lib/lsh/index.rb', line 143 def random_vector_unit(dim) r = random_vector(dim) r /= MathUtil.norm(r) end |
#similarity(v1, v2) ⇒ Object
173 174 175 |
# File 'lib/lsh/index.rb', line 173 def similarity(v1, v2) MathUtil.dot(v1, v2) end |
#vector_hash_to_id(vector_hash) ⇒ Object
52 53 54 |
# File 'lib/lsh/index.rb', line 52 def vector_hash_to_id(vector_hash) storage.vector_hash_to_id(vector_hash) end |