Class: VectorStore

Inherits:
Object
  • Object
show all
Defined in:
lib/vectorstore.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(quantized: false) ⇒ VectorStore

Returns a new instance of VectorStore.



13
14
15
16
17
# File 'lib/vectorstore.rb', line 13

# Builds an empty store.
#
# @param quantized [Boolean] stored as-is in @quantized; the other methods
#   consult this flag to decide whether vectors are kept as 1-bit strings.
def initialize(quantized: false)
  @quantized = quantized
  # Primary key => vector (array of numbers).
  @vectors = {}
end

Instance Attribute Details

#vectors ⇒ Object (readonly)

Returns the value of attribute vectors.



12
13
14
# File 'lib/vectorstore.rb', line 12

# Read-only accessor for the backing store.
#
# @return [Object] the @vectors instance variable itself (not a copy), so
#   mutations made by the caller are visible to the store.
def vectors
  @vectors
end

Instance Method Details

#add(key, vector) ⇒ Object

Add a vector with the given primary key. Overwrites any existing vector.



20
21
22
23
24
25
26
# File 'lib/vectorstore.rb', line 20

# Stores +vector+ under +key+, replacing whatever was there before.
#
# When the store is in quantized mode the vector is passed through
# #quantize first; otherwise it is stored exactly as given (same object).
#
# @param key [Object] primary key
# @param vector [Object] the vector to store
# @return [Object] the value that was written into the store
def add(key, vector)
  @vectors[key] = @quantized ? quantize(vector) : vector
end

#add_with_openai(key, text, embedding_model: "text-embedding-3-small") ⇒ Object



28
29
30
31
# File 'lib/vectorstore.rb', line 28

# Fetches an embedding for +text+ via #get_openai_embedding and stores it
# under +key+ with #add.
#
# @param key [Object] primary key to store the embedding under
# @param text [Object] input forwarded to #get_openai_embedding
# @param embedding_model [String] model name forwarded to #get_openai_embedding
# @return [false, Object] false when the OpenAI constant is not defined;
#   otherwise whatever #add returns
def add_with_openai(key, text, embedding_model: "text-embedding-3-small")
  if defined?(OpenAI)
    embedding = get_openai_embedding(text, embedding_model: embedding_model)
    add(key, embedding)
  else
    false
  end
end

#cosine_similarity(vec1, vec2) ⇒ Object

Compute the cosine similarity between two vectors.



44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# File 'lib/vectorstore.rb', line 44

# Cosine similarity of two vectors.
#
# Dispatch rules:
# * In quantized mode, any non-String argument is quantized and the
#   comparison is delegated to #cosine_similarity_quantized.
# * Outside quantized mode, two Strings are still treated as bit strings
#   and delegated the same way.
# * Otherwise both arguments are treated as numeric arrays.
#
# @param vec1 [Array<Numeric>, String]
# @param vec2 [Array<Numeric>, String]
# @return [Float] 0.0 when either numeric vector has zero magnitude
# @raise [RuntimeError] when the numeric vectors differ in size
def cosine_similarity(vec1, vec2)
  if @quantized
    packed = [vec1, vec2].map { |v| v.is_a?(String) ? v : quantize(v) }
    return cosine_similarity_quantized(*packed)
  end
  return cosine_similarity_quantized(vec1, vec2) if vec1.is_a?(String) && vec2.is_a?(String)

  raise "Vector dimensions do not match" unless vec1.size == vec2.size

  dot = vec1.zip(vec2).sum { |a, b| a * b }
  magnitude1 = Math.sqrt(vec1.sum { |x| x * x })
  magnitude2 = Math.sqrt(vec2.sum { |x| x * x })
  # A zero vector has no direction; report "no similarity" instead of dividing by zero.
  return 0.0 if magnitude1.zero? || magnitude2.zero?

  dot / (magnitude1 * magnitude2)
end

#cosine_similarity_quantized(str1, str2) ⇒ Object

Compute cosine similarity for quantized vectors (bit strings).



108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# File 'lib/vectorstore.rb', line 108

# Cosine similarity of two 1-bit quantized vectors (byte strings).
#
# Each bit is a 0/1 component, so the dot product is the number of bit
# positions set in both strings and each squared norm is the number of set
# bits in that string.
#
# @param str1 [String] bit string
# @param str2 [String] bit string of the same byte length
# @return [Float] 0.0 when either string has no bits set
# @raise [RuntimeError] when the strings differ in byte length
def cosine_similarity_quantized(str1, str2)
  # Without this guard a shorter str2 crashes on nil and a longer one has its
  # trailing bytes ignored, which yields a wrong similarity.
  raise "Vector dimensions do not match" if str1.bytesize != str2.bytesize

  dot = 0
  total_ones_str1 = 0
  total_ones_str2 = 0
  str1.each_byte.with_index do |byte1, index|
    byte2 = str2.getbyte(index)
    dot += (byte1 & byte2).to_s(2).count("1")
    total_ones_str1 += byte1.to_s(2).count("1")
    total_ones_str2 += byte2.to_s(2).count("1")
  end
  return 0.0 if total_ones_str1 == 0 || total_ones_str2 == 0

  sim = dot.to_f / (Math.sqrt(total_ones_str1) * Math.sqrt(total_ones_str2))
  # Snap floating-point near-misses (e.g. 0.9999999999999998) to exactly 1.0.
  sim = 1.0 if (1.0 - sim).abs < 1e-6
  sim
end

#deserialize(json_string) ⇒ Object

Deserialize a JSON string and update the internal store.



154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
# File 'lib/vectorstore.rb', line 154

# Replaces the store's contents with the data in a JSON document.
#
# The storage mode is inferred from the first value: a String means the
# payload holds Base64-encoded bit strings (quantized), anything else means
# plain vectors. An empty payload gives no hint, so the current mode is
# kept; previously it was reset to non-quantized, which made a quantized
# store stop quantizing after reloading an empty file.
#
# @param json_string [String] JSON object mapping keys to vectors
# @return [Hash] the new contents of the store
def deserialize(json_string)
  data = JSON.parse(json_string)
  @quantized = data.values.first.is_a?(String) unless data.empty?

  @vectors =
    if @quantized
      data.transform_values { |encoded| Base64.decode64(encoded) }
    else
      data
    end
end

#find_closest(query_vector, k = 1) ⇒ Object

Find the top k closest vectors to the query vector using cosine similarity. Returns an array of [key, similarity] pairs.



67
68
69
70
71
72
73
74
75
76
77
# File 'lib/vectorstore.rb', line 67

# Ranks every stored vector by cosine similarity to +query_vector+ and
# returns the best +k+.
#
# In quantized mode the query is quantized before comparison.
#
# @param query_vector [Array<Numeric>, String] vector to compare against
# @param k [Integer] number of results to return
# @return [Array<Array>] up to +k+ [key, similarity] pairs, most similar first
def find_closest(query_vector, k = 1)
  query_vector = quantize(query_vector) if @quantized

  scored = @vectors.map { |key, vector| [key, cosine_similarity(query_vector, vector)] }
  scored.sort_by { |pair| -pair.last }.first(k)
end

#find_closest_with_key(key, k = 1) ⇒ Object



79
80
81
82
# File 'lib/vectorstore.rb', line 79

# Finds the stored vectors closest to the vector already stored under +key+.
#
# The lookup is delegated to #find_closest over the whole store, so the
# entry for +key+ itself is not excluded from the candidates.
#
# @param key [Object] primary key of the vector to use as the query
# @param k [Integer] number of results to return
# @return [Array<Array>] up to +k+ [key, similarity] pairs; empty when no
#   vector is stored under +key+
def find_closest_with_key(key, k = 1)
  query_vector = @vectors[key]
  # An unknown key would otherwise send nil into the similarity code and
  # blow up with NoMethodError; there is simply nothing to compare.
  return [] if query_vector.nil?

  find_closest(query_vector, k)
end

#find_closest_with_openai(query_text, k = 1, embedding_model: "text-embedding-3-small") ⇒ Object



101
102
103
104
105
# File 'lib/vectorstore.rb', line 101

# Embeds +query_text+ via #get_openai_embedding and returns the closest
# stored vectors according to #find_closest.
#
# @param query_text [Object] input forwarded to #get_openai_embedding
# @param k [Integer] number of results to return
# @param embedding_model [String] model name forwarded to #get_openai_embedding
# @return [false, Object] false when the OpenAI constant is not defined;
#   otherwise whatever #find_closest returns
def find_closest_with_openai(query_text, k = 1, embedding_model: "text-embedding-3-small")
  if defined?(OpenAI)
    find_closest(get_openai_embedding(query_text, embedding_model: embedding_model), k)
  else
    false
  end
end

#get(key) ⇒ Object

Retrieve a vector by its primary key.



39
40
41
# File 'lib/vectorstore.rb', line 39

# Looks up the vector stored under +key+.
#
# @param key [Object] primary key
# @return [Object] the result of indexing the backing store with +key+
def get(key)
  @vectors[key]
end

#get_openai_embedding(text, embedding_model: "text-embedding-3-small") ⇒ Object



84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
# File 'lib/vectorstore.rb', line 84

# Requests an embedding for +text+ from OpenAI.
#
# A new OpenAI::Client is built on every call, authenticated with the
# OPENAI_API_KEY environment variable.
# NOTE(review): presumably this targets the ruby-openai gem's client API;
# confirm against the project's Gemfile.
#
# @param text [Object] value sent as the request's "input"
# @param embedding_model [String] value sent as the request's "model"
# @return [false, Object] false when the OpenAI constant is not defined;
#   otherwise the value found at "data" -> 0 -> "embedding" in the response
def get_openai_embedding(text, embedding_model: "text-embedding-3-small")
  return false unless defined?(OpenAI)

  api = OpenAI::Client.new(access_token: ENV["OPENAI_API_KEY"], log_errors: true)
  request = { model: embedding_model, input: text }
  api.embeddings(parameters: request).dig("data", 0, "embedding")
end

#load(filename) ⇒ Object

Load the internal vector store from a file.



177
178
179
180
# File 'lib/vectorstore.rb', line 177

# Reads +filename+ and replaces the store's contents with its data.
#
# @param filename [String] path of a file holding the JSON to hand to #deserialize
# @return [Object] whatever #deserialize returns
def load(filename)
  deserialize(File.read(filename))
end

#quantize(vector) ⇒ Object

Convert an array of floats to a 1-bit quantized bit string.



125
126
127
128
129
130
131
132
133
134
135
136
137
138
# File 'lib/vectorstore.rb', line 125

# Packs a numeric vector into a 1-bit-per-component binary string.
#
# Each component becomes 1 when it is >= 0 and 0 otherwise. Bits are grouped
# eight at a time, first component in the most significant position, and
# each group becomes one byte. A trailing group shorter than eight bits is
# not padded: its bits occupy the low end of the final byte.
#
# @param vector [Array<Numeric>, String] a String is assumed to be already
#   quantized and is returned untouched
# @return [String] binary (ASCII-8BIT) string
def quantize(vector)
  return vector if vector.is_a?(String)

  sign_bits = vector.map { |component| component >= 0 ? 1 : 0 }
  byte_values = sign_bits.each_slice(8).map { |group| group.join.to_i(2) }
  # "C*" emits one unsigned byte per integer and yields an ASCII-8BIT string.
  byte_values.pack("C*")
end

#remove(key) ⇒ Object

Remove a vector by its primary key.



34
35
36
# File 'lib/vectorstore.rb', line 34

# Deletes the entry stored under +key+ from the backing store.
#
# @param key [Object] primary key
# @return [Object] the result of calling delete on the backing store
def remove(key)
  @vectors.delete(key)
end

#save(filename) ⇒ Object

Save the internal vector store to a file.



172
173
174
# File 'lib/vectorstore.rb', line 172

# Writes the serialized store to +filename+.
#
# @param filename [String] destination path
# @return [Object] whatever File.write returns
def save(filename)
  # Serialize first so a serialization failure never touches the file.
  payload = serialize
  File.write(filename, payload)
end

#serialize ⇒ Object

Serialize the internal vector store to a JSON string.



141
142
143
144
145
146
147
148
149
150
151
# File 'lib/vectorstore.rb', line 141

# Renders the store as a JSON string.
#
# In quantized mode every stored bit string is Base64-encoded (strict, no
# line breaks) before dumping; otherwise the store is dumped as-is.
#
# @return [String] JSON document mapping keys to vectors
def serialize
  payload =
    if @quantized
      @vectors.transform_values { |bits| Base64.strict_encode64(bits) }
    else
      @vectors
    end
  JSON.dump(payload)
end