Class: Roseflow::VectorStores::HNSWMemoryStore

Inherits:
Base
  • Object
show all
Defined in:
lib/roseflow/vector_stores/hnsw_memory_store.rb

Overview

HNSWMemoryStore is an in-memory vector store that implements the HNSW algorithm.

Defined Under Namespace

Classes: BoundedPriorityQueue, HNSWNode, PriorityQueue

Constant Summary collapse

PROBABILITY_FACTORS =
[
  0.5,
  1 / Math::E,
].freeze

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from Base

#build_vector, #has_embeddings?, #list_vectors, #query, #update_vector

Constructor Details

#initialize(similarity_metric, dimensions, m, ef) ⇒ HNSWMemoryStore

Initializes a new HNSWMemoryStore with the specified similarity metric, dimensions, m and ef.

Parameters:

  • similarity_metric (Symbol)

    the similarity metric to use

  • dimensions (Integer)

    the number of dimensions of the vectors

  • m (Integer)

    the number of neighbors to consider when adding a node

  • ef (Integer)

    the number of neighbors to consider when searching

Raises:



27
28
29
30
31
32
33
34
35
# File 'lib/roseflow/vector_stores/hnsw_memory_store.rb', line 27

def initialize(similarity_metric, dimensions, m, ef)
  @similarity_metric = similarity_metric
  @dimensions = dimensions
  @m = m
  @ef = ef
  @max_level = 0
  @entrypoint = nil
  @nodes = {}
end

Instance Attribute Details

#nodesObject

Returns the value of attribute nodes.



39
40
41
# File 'lib/roseflow/vector_stores/hnsw_memory_store.rb', line 39

def nodes
  @nodes
end

Class Method Details

.deserialize(serialized_data) ⇒ Object

Deserializes a binary string into a vector store.



114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# File 'lib/roseflow/vector_stores/hnsw_memory_store.rb', line 114

def self.deserialize(serialized_data)
  graph = HNSWGraph.decode(serialized_data)

  hnsw = new(graph.similarity_metric, graph.dimensions, graph.m, graph.ef)

  # Create nodes
  graph.nodes.each do |node|
    hnsw.nodes[node.id] = HNSWNode.new(node.id, node.vector, node.level, graph.m)
  end

  # Set neighbors
  graph.nodes.each do |node|
    neighbors = node.neighbors.each_slice(graph.m).to_a
    neighbors.each_with_index do |neighbor_ids, level|
      neighbor_ids.each_with_index do |neighbor_id, index|
        hnsw.nodes[node.id].neighbors[level][index] = hnsw.nodes[neighbor_id] if hnsw.nodes.key?(neighbor_id)
      end
    end
  end

  hnsw.instance_variable_set(:@entrypoint, hnsw.nodes[graph.entrypoint_id])
  hnsw.instance_variable_set(:@max_level, graph.max_level)

  hnsw
end

Instance Method Details

#add_neighbors_to_candidates(closest, level, visited, candidates) ⇒ Object



258
259
260
261
262
263
264
# File 'lib/roseflow/vector_stores/hnsw_memory_store.rb', line 258

def add_neighbors_to_candidates(closest, level, visited, candidates)
  closest.neighbors[level].each do |neighbor|
    next unless neighbor
    next if visited.include?(neighbor.id)
    candidates.add(neighbor)
  end
end

#add_node(node_id, vector) ⇒ HNSWNode Also known as: create_vector

Adds a new node to the vector store.

Parameters:

  • node_id (String)

    the ID of the node

  • vector (Array<Float>)

    the vector of the node

Returns:



46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# File 'lib/roseflow/vector_stores/hnsw_memory_store.rb', line 46

def add_node(node_id, vector)
  level = get_random_level
  node = HNSWNode.new(node_id, vector, level, @m)

  if @entrypoint.nil?
    @entrypoint = node
    return @nodes[node_id] = node
  end

  update_max_level(level)
  current_node = search_level(vector, @entrypoint, @max_level)

  @max_level.downto(0) do |i|
    if i <= level
      neighbors = find_neighbors(current_node, vector, i)
      update_neighbors(node, neighbors, vector, i)
    end

    current_node = search_level(vector, @entrypoint, i - 1) if i > 0
  end

  @nodes[node_id] = node
end

#cosine_distance(from, to) ⇒ Object

Calculates the cosine distance between two vectors.



289
290
291
292
293
294
295
296
297
298
299
300
301
# File 'lib/roseflow/vector_stores/hnsw_memory_store.rb', line 289

def cosine_distance(from, to)
  dot_product = 0
  norm_from = 0
  norm_to = 0

  from.each_with_index do |value, index|
    dot_product += value * to[index]
    norm_from += value ** 2
    norm_to += to[index] ** 2
  end

  1 - (dot_product / (Math.sqrt(norm_from) * Math.sqrt(norm_to)))
end

#delete_node(node_id) ⇒ HNSWNode Also known as: delete_vector

Deletes a node from the vector store.

Parameters:

  • node_id (String)

    the ID of the node

Returns:



76
77
78
# File 'lib/roseflow/vector_stores/hnsw_memory_store.rb', line 76

def delete_node(node_id)
  @nodes.delete(node_id)
end

#distance(from, to) ⇒ Object

Calculates the distance between two vectors.



267
268
269
270
271
272
273
274
275
276
# File 'lib/roseflow/vector_stores/hnsw_memory_store.rb', line 267

def distance(from, to)
  case @similarity_metric.to_sym
  when :euclidean
    euclidean_distance(from, to)
  when :cosine
    cosine_distance(from, to)
  else
    raise UnsupportedSimilarityMetricError, "Similarity metric #{@similarity_metric} is not supported"
  end
end

#euclidean_distance(from, to) ⇒ Object

Calculates the euclidean distance between two vectors.



279
280
281
282
283
284
285
286
# File 'lib/roseflow/vector_stores/hnsw_memory_store.rb', line 279

def euclidean_distance(from, to)
  e_distance = 0
  from.each_with_index do |value, index|
    e_distance += (value - to[index]) ** 2
  end

  Math.sqrt(e_distance)
end

#find(node_id) ⇒ HNSWNode?

Finds a node in the vector store.

Parameters:

  • node_id (String)

    the ID of the node

Returns:

  • (HNSWNode)

    the found node

  • (nil)

    if the node was not found



87
88
89
# File 'lib/roseflow/vector_stores/hnsw_memory_store.rb', line 87

def find(node_id)
  @nodes[node_id]
end

#find_closest_candidate(candidates, query) ⇒ Object



229
230
231
# File 'lib/roseflow/vector_stores/hnsw_memory_store.rb', line 229

def find_closest_candidate(candidates, query)
  candidates.min_by { |c| distance(query, c.vector) }
end

#find_closest_neighbor(query, neighbors) ⇒ Object



192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
# File 'lib/roseflow/vector_stores/hnsw_memory_store.rb', line 192

def find_closest_neighbor(query, neighbors)
  closest_neighbor = nil
  closest_distance = Float::INFINITY

  neighbors.each do |neighbor|
    next unless neighbor
    distance = distance(query, neighbor.vector)
    if distance < closest_distance
      closest_distance = distance
      closest_neighbor = neighbor
    end
  end

  [closest_neighbor, closest_distance]
end

#find_neighbors(node, query, level) ⇒ Object

Finds the nearest neighbors of a vector.



141
142
143
# File 'lib/roseflow/vector_stores/hnsw_memory_store.rb', line 141

def find_neighbors(node, query, level)
  search_knn(node, query, @m, level)
end

#get_random_levelObject

Returns a random level for a node.



304
305
306
307
308
309
310
# File 'lib/roseflow/vector_stores/hnsw_memory_store.rb', line 304

def get_random_level
  level = 0
  while rand < PROBABILITY_FACTORS[0] && level < @max_level
    level += 1
  end
  level
end

#nearest_neighbors(query, k) ⇒ Object

Finds the k nearest neighbors of a vector.



165
166
167
168
169
170
171
172
# File 'lib/roseflow/vector_stores/hnsw_memory_store.rb', line 165

def nearest_neighbors(query, k)
  return [] unless @entrypoint
  entry_point = @entrypoint
  (0..@max_level).reverse_each do |level|
    entry_point = search_level(query, entry_point, level)
  end
  search_knn(entry_point, query, k, 0)
end

#search_knn(entry_point, query, k, level) ⇒ Object

Finds the k nearest neighbors of a vector.



209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
# File 'lib/roseflow/vector_stores/hnsw_memory_store.rb', line 209

def search_knn(entry_point, query, k, level)
  visited = Set.new
  candidates = Set.new([entry_point])
  result = []

  while candidates.size > 0
    closest = find_closest_candidate(candidates, query)
    candidates.delete(closest)
    visited.add(closest.id)

    result = update_result(result, closest, query, k)

    break if termination_condition_met?(result, closest, query, k)

    add_neighbors_to_candidates(closest, level, visited, candidates)
  end

  result
end

#search_level(query, entry_point, level) ⇒ Object



174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
# File 'lib/roseflow/vector_stores/hnsw_memory_store.rb', line 174

def search_level(query, entry_point, level)
  current = entry_point
  best_distance = distance(query, current.vector)

  loop do
    closest_neighbor, closest_distance = find_closest_neighbor(query, current.neighbors[level])

    if closest_neighbor && closest_distance < best_distance
      best_distance = closest_distance
      current = closest_neighbor
    else
      break
    end
  end

  current
end

#serializeObject

Serializes the vector store to a binary string.



92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# File 'lib/roseflow/vector_stores/hnsw_memory_store.rb', line 92

def serialize
  graph = HNSWGraph.new(
    entrypoint_id: @entrypoint.id,
    max_level: @max_level,
    similarity_metric: @similarity_metric,
    dimensions: @dimensions,
    m: @m,
    ef: @ef,
    nodes: @nodes.values.map do |node|
      HNSWGraphNode.new(
        id: node.id,
        vector: node.vector,
        level: node.level,
        neighbors: node.neighbors.flatten.compact.map(&:id),
      )
    end,
  )

  graph.to_proto
end

#termination_condition_met?(result, closest, query, k) ⇒ Boolean

Returns:

  • (Boolean)


249
250
251
252
253
254
255
256
# File 'lib/roseflow/vector_stores/hnsw_memory_store.rb', line 249

def termination_condition_met?(result, closest, query, k)
  return false if result.size < k

  furthest_result_distance = distance(query, result.max_by { |r| distance(query, r.vector) }.vector)
  closest_distance = distance(query, closest.vector)

  closest_distance >= furthest_result_distance
end

#update_max_level(level) ⇒ Object

Updates maximum level of the graph.



160
161
162
# File 'lib/roseflow/vector_stores/hnsw_memory_store.rb', line 160

def update_max_level(level)
  @max_level = level if level > @max_level
end

#update_neighbors(node, neighbors, query, level) ⇒ Object

Updates the neighbors of a node.



146
147
148
149
150
151
152
153
154
155
156
157
# File 'lib/roseflow/vector_stores/hnsw_memory_store.rb', line 146

def update_neighbors(node, neighbors, query, level)
  node.neighbors[level] = neighbors[0, @m]

  neighbors.each do |neighbor|
    n_distance = distance(neighbor.vector, query)
    furthest_neighbor_index = neighbor.neighbors[level].index { |n| n.nil? || n_distance < distance(neighbor.vector, n.vector) }
    next unless furthest_neighbor_index

    neighbor.neighbors[level].insert(furthest_neighbor_index, node)
    neighbor.neighbors[level].pop if neighbor.neighbors[level].size > @m
  end
end

#update_result(result, candidate, query, k) ⇒ Object



233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
# File 'lib/roseflow/vector_stores/hnsw_memory_store.rb', line 233

def update_result(result, candidate, query, k)
  if result.size < k
    result.push(candidate)
  else
    furthest_result = result.max_by { |r| distance(query, r.vector) }
    closest_distance = distance(query, candidate.vector)
    furthest_result_distance = distance(query, furthest_result.vector)

    if closest_distance < furthest_result_distance
      result.delete(furthest_result)
      result.push(candidate)
    end
  end
  result
end