Class: VectorStore
- Inherits:
-
Object
- Object
- VectorStore
- Defined in:
- lib/vectorstore.rb
Instance Attribute Summary collapse
-
#vectors ⇒ Object
readonly
Returns the value of attribute vectors.
Instance Method Summary collapse
-
#add(key, vector) ⇒ Object
Add a vector with the given primary key.
- #add_with_openai(key, text, embedding_model: "text-embedding-3-small") ⇒ Object
-
#cosine_similarity(vec1, vec2) ⇒ Object
Compute the cosine similarity between two vectors.
-
#cosine_similarity_quantized(str1, str2) ⇒ Object
Compute cosine similarity for quantized vectors (bit strings).
-
#deserialize(json_string) ⇒ Object
Deserialize a JSON string and update the internal store.
-
#find_closest(query_vector, k = 1) ⇒ Object
Find the top k closest vectors to the query vector using cosine similarity.
- #find_closest_with_key(key, k = 1) ⇒ Object
- #find_closest_with_openai(query_text, k = 1, embedding_model: "text-embedding-3-small") ⇒ Object
-
#get(key) ⇒ Object
Retrieve a vector by its primary key.
- #get_openai_embedding(text, embedding_model: "text-embedding-3-small") ⇒ Object
-
#initialize(quantized: false) ⇒ VectorStore
constructor
A new instance of VectorStore.
-
#load(filename) ⇒ Object
Load the internal vector store from a file.
-
#quantize(vector) ⇒ Object
Convert an array of floats to a 1-bit quantized bit string.
-
#remove(key) ⇒ Object
Remove a vector by its primary key.
-
#save(filename) ⇒ Object
Save the internal vector store to a file.
-
#serialize ⇒ Object
Serialize the internal vector store to a JSON string.
Constructor Details
#initialize(quantized: false) ⇒ VectorStore
Returns a new instance of VectorStore.
13 14 15 16 17 |
# File 'lib/vectorstore.rb', line 13
# Build an empty store.
#
# @param quantized [Boolean] when true, vectors passed to #add are stored
#   in their 1-bit quantized form instead of as given.
def initialize(quantized: false)
  # Primary key => vector; starts out empty.
  @vectors = {}
  # Storage-mode flag consulted by the other methods of the class.
  @quantized = quantized
end
Instance Attribute Details
#vectors ⇒ Object (readonly)
Returns the value of attribute vectors.
12 13 14 |
# File 'lib/vectorstore.rb', line 12
# Reader for the internal key => vector hash.
#
# @return [Hash] the live internal hash (not a copy).
def vectors
  @vectors
end
Instance Method Details
#add(key, vector) ⇒ Object
Add a vector with the given primary key. Overwrites any existing vector.
20 21 22 23 24 25 26 |
# File 'lib/vectorstore.rb', line 20
# Store a vector under the given primary key, replacing any previous entry.
#
# @param key [Object] primary key.
# @param vector [Array<Numeric>, String] vector to store; it is passed
#   through #quantize first when the store is in quantized mode.
# @return [Object] the value that was stored.
def add(key, vector)
  @vectors[key] = @quantized ? quantize(vector) : vector
end
#add_with_openai(key, text, embedding_model: "text-embedding-3-small") ⇒ Object
28 29 30 31 |
# File 'lib/vectorstore.rb', line 28
# Embed +text+ via #get_openai_embedding and store the result under +key+.
#
# @param key [Object] primary key to store the embedding under.
# @param text [String] text to embed.
# @param embedding_model [String] model name forwarded to #get_openai_embedding.
# @return [false, Object] false when the OpenAI constant is not defined,
#   otherwise whatever #add returns.
def add_with_openai(key, text, embedding_model: "text-embedding-3-small")
  return false unless defined?(OpenAI)

  add(key, get_openai_embedding(text, embedding_model: embedding_model))
end
#cosine_similarity(vec1, vec2) ⇒ Object
Compute the cosine similarity between two vectors.
44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
# File 'lib/vectorstore.rb', line 44
# Cosine similarity of two vectors.
#
# In quantized mode both arguments are converted to bit strings (strings
# are used as-is) and compared with #cosine_similarity_quantized. Outside
# quantized mode, two strings are still compared as bit strings; anything
# else is treated as an array of numbers.
#
# @param vec1 [Array<Numeric>, String]
# @param vec2 [Array<Numeric>, String]
# @return [Numeric] the similarity; 0.0 when either vector has zero norm.
# @raise [RuntimeError] when the two arrays differ in size.
def cosine_similarity(vec1, vec2)
  if @quantized
    as_bits = ->(v) { v.is_a?(String) ? v : quantize(v) }
    return cosine_similarity_quantized(as_bits.call(vec1), as_bits.call(vec2))
  end

  both_strings = vec1.is_a?(String) && vec2.is_a?(String)
  return cosine_similarity_quantized(vec1, vec2) if both_strings

  # Ensure vectors are of the same size
  raise "Vector dimensions do not match" unless vec1.size == vec2.size

  dot = vec1.zip(vec2).sum { |a, b| a * b }
  magnitude1 = Math.sqrt(vec1.sum { |x| x * x })
  magnitude2 = Math.sqrt(vec2.sum { |x| x * x })
  # A zero-length vector has no direction; report no similarity.
  return 0.0 if magnitude1.zero? || magnitude2.zero?

  dot / (magnitude1 * magnitude2)
end
#cosine_similarity_quantized(str1, str2) ⇒ Object
Compute cosine similarity for quantized vectors (bit strings).
108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 |
# File 'lib/vectorstore.rb', line 108
# Cosine similarity of two quantized vectors (bit strings).
#
# Each set bit counts as a 1-valued component: the dot product is the
# number of bits set in both strings, and each norm is the square root of
# that string's set-bit count.
#
# @param str1 [String] bit string, e.g. as produced by #quantize.
# @param str2 [String] bit string of the same byte length.
# @return [Float] the similarity; 0.0 when either string has no bits set.
# @raise [RuntimeError] when the strings differ in byte length. Previously
#   a shorter +str2+ failed with a TypeError on nil and a longer +str2+ had
#   its extra bytes silently ignored.
def cosine_similarity_quantized(str1, str2)
  raise "Vector dimensions do not match" if str1.bytesize != str2.bytesize

  ones = ->(byte) { byte.to_s(2).count("1") }
  dot = 0
  total_ones_str1 = 0
  total_ones_str2 = 0
  str1.each_byte.with_index do |byte1, index|
    byte2 = str2.getbyte(index)
    dot += ones.call(byte1 & byte2)
    total_ones_str1 += ones.call(byte1)
    total_ones_str2 += ones.call(byte2)
  end
  return 0.0 if total_ones_str1.zero? || total_ones_str2.zero?

  sim = dot.to_f / (Math.sqrt(total_ones_str1) * Math.sqrt(total_ones_str2))
  # Snap results within 1e-6 of 1.0 so identical inputs compare as exactly 1.0
  # despite floating-point rounding in the square roots.
  sim = 1.0 if (1.0 - sim).abs < 1e-6
  sim
end
#deserialize(json_string) ⇒ Object
Deserialize a JSON string and update the internal store.
154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 |
# File 'lib/vectorstore.rb', line 154
# Replace the internal store with the contents of a JSON string.
#
# The storage mode is inferred from the payload: if the first value is a
# String the data is treated as quantized (Base64-encoded bit strings),
# otherwise as plain vectors. An empty payload carries no such hint, so the
# current mode is left untouched instead of being reset to non-quantized.
#
# @param json_string [String] JSON object mapping keys to vectors.
# @return [Hash] the new internal store.
def deserialize(json_string)
  data = JSON.parse(json_string)
  # Only re-detect the mode when there is at least one value to inspect.
  @quantized = data.values.first.is_a?(String) unless data.empty?

  @vectors =
    if @quantized
      data.transform_values { |encoded| Base64.decode64(encoded) }
    else
      data
    end
end
#find_closest(query_vector, k = 1) ⇒ Object
Find the top k closest vectors to the query vector using cosine similarity. Returns an array of [key, similarity] pairs.
67 68 69 70 71 72 73 74 75 76 77 |
# File 'lib/vectorstore.rb', line 67
# Rank every stored vector by cosine similarity to the query and return the
# best +k+.
#
# @param query_vector [Array<Numeric>, String] query; quantized first when
#   the store is in quantized mode.
# @param k [Integer] number of results to return.
# @return [Array<Array>] up to +k+ [key, similarity] pairs, most similar first.
def find_closest(query_vector, k = 1)
  query_vector = quantize(query_vector) if @quantized

  scored = @vectors.map do |key, vector|
    [key, cosine_similarity(query_vector, vector)]
  end
  # Negating the score sorts in descending order of similarity.
  scored.sort_by { |pair| -pair.last }.first(k)
end
#find_closest_with_key(key, k = 1) ⇒ Object
79 80 81 82 |
# File 'lib/vectorstore.rb', line 79
# Find the vectors closest to the one already stored under +key+.
#
# @param key [Object] primary key of the stored vector to use as the query.
# @param k [Integer] number of results to return.
# @return [Array<Array>] whatever #find_closest returns for the stored
#   vector, or an empty array when nothing is stored under +key+.
def find_closest_with_key(key, k = 1)
  query_vector = @vectors[key]
  # An unknown key used to pass nil on to find_closest, which raised a
  # NoMethodError far from the cause; report "no neighbours" instead.
  return [] if query_vector.nil?

  find_closest(query_vector, k)
end
#find_closest_with_openai(query_text, k = 1, embedding_model: "text-embedding-3-small") ⇒ Object
101 102 103 104 105 |
# File 'lib/vectorstore.rb', line 101
# Embed +query_text+ via #get_openai_embedding and search with the result.
#
# @param query_text [String] text to embed and use as the query.
# @param k [Integer] number of results to return.
# @param embedding_model [String] model name forwarded to #get_openai_embedding.
# @return [false, Array<Array>] false when the OpenAI constant is not
#   defined, otherwise whatever #find_closest returns.
def find_closest_with_openai(query_text, k = 1, embedding_model: "text-embedding-3-small")
  return false unless defined?(OpenAI)

  query_vector = get_openai_embedding(query_text, embedding_model: embedding_model)
  find_closest(query_vector, k)
end
#get(key) ⇒ Object
Retrieve a vector by its primary key.
39 40 41 |
# File 'lib/vectorstore.rb', line 39
# Look up the vector stored under a primary key.
#
# @param key [Object] primary key.
# @return [Object, nil] the stored value, or nil when the hash has no entry
#   for +key+.
def get(key)
  @vectors[key]
end
#get_openai_embedding(text, embedding_model: "text-embedding-3-small") ⇒ Object
84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 |
# File 'lib/vectorstore.rb', line 84
# Request an embedding for +text+ from the OpenAI client.
#
# The client is created with the access token read from the
# OPENAI_API_KEY environment variable.
#
# @param text [String] text to embed.
# @param embedding_model [String] model name sent with the request.
# @return [false, Object, nil] false when the OpenAI constant is not
#   defined; otherwise the value found at ["data"][0]["embedding"] in the
#   response (nil if that path is absent).
def get_openai_embedding(text, embedding_model: "text-embedding-3-small")
  return false unless defined?(OpenAI)

  client = OpenAI::Client.new(
    access_token: ENV["OPENAI_API_KEY"],
    log_errors: true
  )
  response = client.embeddings(
    parameters: {
      model: embedding_model,
      input: text
    }
  )
  response.dig("data", 0, "embedding")
end
#load(filename) ⇒ Object
Load the internal vector store from a file.
177 178 179 180 |
# File 'lib/vectorstore.rb', line 177
# Read a file and replace the internal store with its contents.
#
# @param filename [String] path of the file to read; its contents are
#   handed to #deserialize.
# @return [Object] whatever #deserialize returns.
def load(filename)
  deserialize(File.read(filename))
end
#quantize(vector) ⇒ Object
Convert an array of floats to a 1-bit quantized bit string.
125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
# File 'lib/vectorstore.rb', line 125
# Turn an array of numbers into a 1-bit-per-component binary string.
#
# Each component becomes 1 when it is >= 0 and 0 otherwise. Bits are packed
# eight to a byte, first component in the most significant position; a
# final group of fewer than eight bits is stored right-aligned in its byte.
#
# @param vector [Array<Numeric>, String] numbers to quantize; a String is
#   assumed to be quantized already and is returned unchanged.
# @return [String] ASCII-8BIT string of packed bits.
def quantize(vector)
  # If it's already a string, it's already quantized
  return vector if vector.is_a?(String)

  packed = vector.each_slice(8).map do |chunk|
    # Shift previous bits left and append the sign bit of each component.
    chunk.inject(0) { |acc, x| (acc << 1) | (x >= 0 ? 1 : 0) }
  end
  packed.pack("C*")
end
#remove(key) ⇒ Object
Remove a vector by its primary key.
34 35 36 |
# File 'lib/vectorstore.rb', line 34
# Delete the entry stored under a primary key.
#
# @param key [Object] primary key.
# @return [Object, nil] the removed value, or nil when there was no entry
#   for +key+.
def remove(key)
  @vectors.delete(key)
end
#save(filename) ⇒ Object
Save the internal vector store to a file.
172 173 174 |
# File 'lib/vectorstore.rb', line 172
# Write the serialized store to a file.
#
# @param filename [String] path of the file to write.
# @return [Integer] the value returned by File.write.
def save(filename)
  payload = serialize
  File.write(filename, payload)
end
#serialize ⇒ Object
Serialize the internal vector store to a JSON string.
141 142 143 144 145 146 147 148 149 150 151 |
# File 'lib/vectorstore.rb', line 141
# Dump the internal store as a JSON string.
#
# In quantized mode every value is Base64-encoded (strict, no line breaks)
# before dumping; otherwise the hash is dumped as-is.
#
# @return [String] JSON object mapping keys to vectors.
def serialize
  payload =
    if @quantized
      @vectors.transform_values { |bits| Base64.strict_encode64(bits) }
    else
      @vectors
    end
  JSON.dump(payload)
end