Class: LSH::Index
- Inherits:
-
Object
- Object
- LSH::Index
- Defined in:
- lib/lsh/index.rb
Instance Attribute Summary collapse
-
#buckets ⇒ Object
readonly
Returns the value of attribute buckets.
-
#projections ⇒ Object
readonly
Returns the value of attribute projections.
-
#storage ⇒ Object
readonly
Returns the value of attribute storage.
Class Method Summary collapse
Instance Method Summary collapse
- #add(vector, id = nil) ⇒ Object
- #array_to_hash(array) ⇒ Object
- #generate_projection(dim, k) ⇒ Object
- #generate_projections(dim, k, l) ⇒ Object
- #hash(vector, projection, bias = true) ⇒ Object
- #hashes(vector) ⇒ Object
- #id_to_vector(id) ⇒ Object
-
#initialize(parameters = {}, storage = LSH::Storage::Memory.new) ⇒ Index
constructor
A new instance of Index.
- #inspect ⇒ Object
- #multiprobe_hashes_arrays(hash_arrays, multiprobe_radius) ⇒ Object
- #order_vectors_by_similarity(vector, vectors) ⇒ Object
- #query(vector, multiprobe_radius = 0) ⇒ Object
- #query_ids(id, multiprobe_radius = 0) ⇒ Object
- #query_ids_by_vector(vector, multiprobe_radius = 0) ⇒ Object
- #random_vector(dim) ⇒ Object
- #random_vector_unit(dim) ⇒ Object
- #similarity(v1, v2) ⇒ Object
- #vector_to_id(vector) ⇒ Object
Constructor Details
#initialize(parameters = {}, storage = LSH::Storage::Memory.new) ⇒ Index
Returns a new instance of Index.
23 24 25 26 27 28 29 30 31 32 33 34 35 |
# File 'lib/lsh/index.rb', line 23 def initialize(parameters = {}, storage = LSH::Storage::Memory.new) @storage = storage unless storage.has_index? storage.parameters = parameters # Initializing projections and buckets storage.projections = generate_projections( parameters[:dim], parameters[:number_of_random_vectors], parameters[:number_of_independent_projections] ) parameters[:number_of_independent_projections].times { |i| storage.create_new_bucket } end end |
Instance Attribute Details
#buckets ⇒ Object (readonly)
Returns the value of attribute buckets.
21 22 23 |
# File 'lib/lsh/index.rb', line 21 def buckets @buckets end |
#projections ⇒ Object (readonly)
Returns the value of attribute projections.
21 22 23 |
# File 'lib/lsh/index.rb', line 21 def projections @projections end |
#storage ⇒ Object (readonly)
Returns the value of attribute storage.
21 22 23 |
# File 'lib/lsh/index.rb', line 21 def storage @storage end |
Class Method Details
Instance Method Details
#add(vector, id = nil) ⇒ Object
41 42 43 44 45 46 47 48 |
# File 'lib/lsh/index.rb', line 41 def add(vector, id = nil) storage.add_vector_id(vector, id) if id hashes(vector).each_with_index do |hash, i| hash_i = array_to_hash(hash) bucket = storage.find_bucket(i) storage.add_vector_to_bucket(bucket, hash_i, vector) end end |
#array_to_hash(array) ⇒ Object
131 132 133 134 135 136 137 138 139 140 141 142 |
# File 'lib/lsh/index.rb', line 131 def array_to_hash(array) return array.hash # Derives a 28 bit hash value from an array of integers # http://stackoverflow.com/questions/2909106/python-whats-a-correct-and-good-way-to-implement-hash#2909572 # TODO: Check it works for non-binary LSH #return 0 if array.size == 0 #value = (array.first << 7) #array.each do |v| # value = (101 * value + v) & 0xffffff #end #value end |
#generate_projection(dim, k) ⇒ Object
152 153 154 155 156 157 158 |
# File 'lib/lsh/index.rb', line 152 def generate_projection(dim, k) vectors = [] k.times do |i| vectors << random_vector(dim) end vectors end |
#generate_projections(dim, k, l) ⇒ Object
144 145 146 147 148 149 150 |
# File 'lib/lsh/index.rb', line 144 def generate_projections(dim, k, l) projections = [] l.times do |i| projections << generate_projection(dim, k) end projections end |
#hash(vector, projection, bias = true) ⇒ Object
112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
# File 'lib/lsh/index.rb', line 112 def hash(vector, projection, bias = true) hash = [] projection.each do |random_vector| dot_product = similarity(vector, random_vector) window = storage.parameters[:window] if window == Float::INFINITY # Binary LSH if dot_product >= 0 hash << 1 else hash << 0 end else b = bias ? MathUtil.random_uniform : 0.0 hash << (b + dot_product / window).floor end end hash end |
#hashes(vector) ⇒ Object
104 105 106 107 108 109 110 |
# File 'lib/lsh/index.rb', line 104 def hashes(vector) hashes = [] storage.projections.each do |projection| hashes << hash(vector, projection) end hashes end |
#id_to_vector(id) ⇒ Object
54 55 56 |
# File 'lib/lsh/index.rb', line 54 def id_to_vector(id) storage.id_to_vector(id) end |
#inspect ⇒ Object
173 174 175 |
# File 'lib/lsh/index.rb', line 173 def inspect "#<LSH index; dimension: #{storage.parameters[:dim]}; window size: #{storage.parameters[:window]}; #{storage.parameters[:number_of_random_vectors]} random vectors; #{storage.parameters[:number_of_independent_projections]} independent projections>" end |
#multiprobe_hashes_arrays(hash_arrays, multiprobe_radius) ⇒ Object
88 89 90 91 92 93 94 95 96 97 98 |
# File 'lib/lsh/index.rb', line 88 def multiprobe_hashes_arrays(hash_arrays, multiprobe_radius) mp_arrays = [] (1..multiprobe_radius).to_a.each do |radius| (0..(storage.parameters[:number_of_random_vectors] - 1)).to_a.combination(radius).each do |flips| probes = Marshal.load(Marshal.dump(hash_arrays)) probes.each { |probe| flips.each { |d| probe[d] = (probe[d] == 1) ? 0 : 1 } } mp_arrays << probes end end mp_arrays end |
#order_vectors_by_similarity(vector, vectors) ⇒ Object
100 101 102 |
# File 'lib/lsh/index.rb', line 100 def order_vectors_by_similarity(vector, vectors) vectors.map { |v| [ v, similarity(vector, v) ] } .sort_by { |v, sim| sim } .reverse .map { |vs| vs[0] } end |
#query(vector, multiprobe_radius = 0) ⇒ Object
58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
# File 'lib/lsh/index.rb', line 58 def query(vector, multiprobe_radius = 0) hash_arrays = hashes(vector) hashes = hash_arrays.map { |a| array_to_hash(a) } results = storage.query_buckets(hashes) # Multiprobe LSH # Take query hashes, move them around at radius r, and use them to do another query # TODO: only works for binary LSH atm if multiprobe_radius > 0 mp_arrays = multiprobe_hashes_arrays(hash_arrays, multiprobe_radius) mp_arrays.each do |probes_arrays| probes_hashes = probes_arrays.map { |a| array_to_hash(a) } results += storage.query_buckets(probes_hashes) end end results = MathUtil.uniq(results) order_vectors_by_similarity(vector, results) end |
#query_ids(id, multiprobe_radius = 0) ⇒ Object
76 77 78 79 |
# File 'lib/lsh/index.rb', line 76 def query_ids(id, multiprobe_radius = 0) vector = id_to_vector(id) query_ids_by_vector(vector, multiprobe_radius) end |
#query_ids_by_vector(vector, multiprobe_radius = 0) ⇒ Object
81 82 83 84 85 86 |
# File 'lib/lsh/index.rb', line 81 def query_ids_by_vector(vector, multiprobe_radius = 0) vectors = query(vector, multiprobe_radius) results = [] vectors.each { |v| results << vector_to_id(v) } results end |
#random_vector(dim) ⇒ Object
165 166 167 |
# File 'lib/lsh/index.rb', line 165 def random_vector(dim) MathUtil.random_gaussian_vector(dim) end |
#random_vector_unit(dim) ⇒ Object
160 161 162 163 |
# File 'lib/lsh/index.rb', line 160 def random_vector_unit(dim) r = random_vector(dim) r /= MathUtil.norm(r) end |
#similarity(v1, v2) ⇒ Object
169 170 171 |
# File 'lib/lsh/index.rb', line 169 def similarity(v1, v2) MathUtil.dot(v1, v2) end |
#vector_to_id(vector) ⇒ Object
50 51 52 |
# File 'lib/lsh/index.rb', line 50 def vector_to_id(vector) storage.vector_to_id(vector) end |