Class: Ragdoll::Embedding

Inherits:
ActiveRecord::Base
  • Object
show all
Defined in:
app/models/ragdoll/embedding.rb

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.search_similar(query_embedding, limit: 20, threshold: 0.8, filters: {}) ⇒ Object

PostgreSQL pgvector similarity search using neighbor gem



58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# File 'app/models/ragdoll/embedding.rb', line 58

# Narrows the embeddings by the optional +filters+ and hands the resulting
# scope to search_with_pgvector.
#
# Recognised filter keys (all optional): :embeddable_id, :embeddable_type,
# :embedding_model, :document_type and :keywords. :keywords may be a single
# value or a list; entries are stringified, downcased and blanks are dropped.
#
# @param query_embedding query vector, passed through untouched to search_with_pgvector
# @param limit [Integer] forwarded to search_with_pgvector
# @param threshold [Numeric] forwarded to search_with_pgvector
# @param filters [Hash] see the key list above
# @return whatever search_with_pgvector returns
def self.search_similar(query_embedding, limit: 20,
                        threshold: 0.8, filters: {})
  scope = all
  scope = scope.where(embeddable_id: filters[:embeddable_id]) if filters[:embeddable_id]
  scope = scope.where(embeddable_type: filters[:embeddable_type]) if filters[:embeddable_type]
  scope = scope.by_model(filters[:embedding_model]) if filters[:embedding_model]

  # Document-level filters require joining through embeddable (STI Content) to documents
  needs_document_join = filters[:document_type] || filters[:keywords]

  if needs_document_join
    scope = scope.joins("JOIN ragdoll_contents ON ragdoll_contents.id = ragdoll_embeddings.embeddable_id")
                 .joins("JOIN ragdoll_documents ON ragdoll_documents.id = ragdoll_contents.document_id")
  end

  if filters[:document_type]
    scope = scope.where("ragdoll_documents.document_type = ?", filters[:document_type])
  end

  # Array() accepts nil, a single keyword or a list, so a lone String no
  # longer blows up on a missing #any?.
  normalized_keywords = Array(filters[:keywords]).map { |keyword| keyword.to_s.downcase }.reject(&:empty?)

  if normalized_keywords.any?
    # PostgreSQL array overlap. The keywords are bound, never interpolated:
    # ActiveRecord expands the array bind into a quoted, comma-separated list,
    # so quotes or braces inside a keyword cannot escape into the SQL text.
    scope = scope.where("ragdoll_documents.keywords && ARRAY[?]::text[]", normalized_keywords)
  end

  search_with_pgvector(query_embedding, scope, limit, threshold)
end

.search_similar_with_stats(query_embedding, limit: 20, threshold: 0.8, filters: {}) ⇒ Object

Enhanced search that returns both results and similarity statistics



94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# File 'app/models/ragdoll/embedding.rb', line 94

# Same filtering as search_similar, but delegates to
# search_with_pgvector_stats so the caller also receives similarity statistics.
#
# Recognised filter keys (all optional): :embeddable_id, :embeddable_type,
# :embedding_model, :document_type and :keywords. :keywords may be a single
# value or a list; entries are stringified, downcased and blanks are dropped.
#
# @param query_embedding query vector, passed through untouched
# @param limit [Integer] forwarded to search_with_pgvector_stats
# @param threshold [Numeric] forwarded to search_with_pgvector_stats
# @param filters [Hash] see the key list above
# @return whatever search_with_pgvector_stats returns
def self.search_similar_with_stats(query_embedding, limit: 20, threshold: 0.8, filters: {})
  scope = all
  scope = scope.where(embeddable_id: filters[:embeddable_id]) if filters[:embeddable_id]
  scope = scope.where(embeddable_type: filters[:embeddable_type]) if filters[:embeddable_type]
  scope = scope.by_model(filters[:embedding_model]) if filters[:embedding_model]

  # Document-level filters require joining through embeddable (STI Content) to documents
  needs_document_join = filters[:document_type] || filters[:keywords]

  if needs_document_join
    scope = scope.joins("JOIN ragdoll_contents ON ragdoll_contents.id = ragdoll_embeddings.embeddable_id")
                 .joins("JOIN ragdoll_documents ON ragdoll_documents.id = ragdoll_contents.document_id")
  end

  if filters[:document_type]
    scope = scope.where("ragdoll_documents.document_type = ?", filters[:document_type])
  end

  # Array() accepts nil, a single keyword or a list, so a lone String no
  # longer blows up on a missing #any?.
  normalized_keywords = Array(filters[:keywords]).map { |keyword| keyword.to_s.downcase }.reject(&:empty?)

  if normalized_keywords.any?
    # PostgreSQL array overlap. The keywords are bound, never interpolated:
    # ActiveRecord expands the array bind into a quoted, comma-separated list,
    # so quotes or braces inside a keyword cannot escape into the SQL text.
    scope = scope.where("ragdoll_documents.keywords && ARRAY[?]::text[]", normalized_keywords)
  end

  search_with_pgvector_stats(query_embedding, scope, limit, threshold)
end

.search_with_pgvector(query_embedding, scope, limit, threshold) ⇒ Object

Fast search using pgvector with neighbor gem



128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# File 'app/models/ragdoll/embedding.rb', line 128

# Cosine nearest-neighbour lookup over +scope+, ranked by similarity plus
# usage score.
#
# Fetches up to twice +limit+ candidates, drops those whose similarity falls
# below +threshold+, sorts the survivors by descending :combined_score, keeps
# the first +limit+ and passes them to mark_embeddings_as_used.
#
# @param query_embedding query vector handed to nearest_neighbors
# @param scope relation to search within
# @param limit [Integer] maximum number of results returned
# @param threshold [Numeric] minimum similarity a candidate must reach
# @return [Array] the values produced by build_result_hash, best first
def self.search_with_pgvector(query_embedding, scope, limit, threshold)
  # Over-fetch: the threshold check below discards some of the rows.
  candidates = scope.includes(:embeddable)
                    .nearest_neighbors(:embedding_vector, query_embedding, distance: "cosine")
                    .limit(limit * 2)

  best_seen = 0.0
  matches = []

  candidates.each do |candidate|
    # neighbor reports a distance; similarity is its complement.
    score = 1.0 - candidate.neighbor_distance
    best_seen = score if score > best_seen

    unless score < threshold
      usage = calculate_usage_score(candidate)
      total = score + usage
      matches << build_result_hash(candidate, query_embedding, score, best_seen, usage, total)
    end
  end

  ranked = matches.sort_by { |match| -match[:combined_score] }.take(limit)
  mark_embeddings_as_used(ranked)
  ranked
end

.search_with_pgvector_stats(query_embedding, scope, limit, threshold) ⇒ Object

Enhanced search with statistics



159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
# File 'app/models/ragdoll/embedding.rb', line 159

# Cosine nearest-neighbour lookup over +scope+ that also reports similarity
# statistics for every candidate inspected.
#
# Fetches max(limit * 3, 50) candidates, records each similarity, keeps the
# candidates at or above +threshold+, sorts them by descending
# :combined_score, trims to +limit+ and passes them to mark_embeddings_as_used.
#
# @param query_embedding query vector handed to nearest_neighbors
# @param scope relation to search within
# @param limit [Integer] maximum number of results returned
# @param threshold [Numeric] minimum similarity a candidate must reach
# @return [Hash] { results: Array, statistics: Hash }; the statistics keys are
#   :total_embeddings_checked, :threshold_used, :highest_similarity,
#   :lowest_similarity, :average_similarity, :similarities_above_threshold and
#   :total_similarities_calculated
def self.search_with_pgvector_stats(query_embedding, scope, limit, threshold)
  # Materialise the relation immediately to avoid SQL conflicts with count
  # operations; everything below works on the plain array.
  neighbor_results = scope
                     .includes(:embeddable)
                     .nearest_neighbors(:embedding_vector, query_embedding, distance: "cosine")
                     .limit([limit * 3, 50].max) # Get enough for statistics
                     .to_a

  results = []
  all_similarities = []
  # Running maximum handed to build_result_hash. It keeps the 0.0 seed that
  # search_with_pgvector uses, so both searches build their result hashes
  # from the same inputs.
  highest_similarity = 0.0

  neighbor_results.each do |embedding|
    # neighbor returns a distance; similarity is its complement.
    similarity = 1.0 - embedding.neighbor_distance
    all_similarities << similarity

    highest_similarity = similarity if similarity > highest_similarity

    next if similarity < threshold

    usage_score = calculate_usage_score(embedding)
    combined_score = similarity + usage_score

    results << build_result_hash(embedding, query_embedding, similarity, highest_similarity,
                                 usage_score, combined_score)
  end

  # Sort by combined score and limit
  results = results.sort_by { |r| -r[:combined_score] }.take(limit)
  mark_embeddings_as_used(results)

  # Extremes come from the collected values rather than from seeded running
  # variables: cosine similarity can be negative, and a 0.0 / 1.0 seed would
  # then be reported as if it had been observed. The seeds survive only as
  # the fallbacks for an empty candidate set.
  stats = {
    total_embeddings_checked: neighbor_results.length,
    threshold_used: threshold,
    highest_similarity: all_similarities.max || 0.0,
    lowest_similarity: all_similarities.min || 1.0,
    average_similarity: all_similarities.empty? ? 0.0 : (all_similarities.sum / all_similarities.length),
    similarities_above_threshold: all_similarities.count { |s| s >= threshold },
    total_similarities_calculated: all_similarities.length
  }

  {
    results: results,
    statistics: stats
  }
end

Instance Method Details

#embedding_dimensions ⇒ Object



43
44
45
# File 'app/models/ragdoll/embedding.rb', line 43

# Length of the stored embedding vector, or 0 when there is no vector.
def embedding_dimensions
  vector = embedding_vector
  return 0 if vector.nil?

  vector.length || 0
end

#embedding_model ⇒ Object

Access embedding_model via polymorphic relationship



48
49
50
# File 'app/models/ragdoll/embedding.rb', line 48

# Reads embedding_model from the associated embeddable; nil when there is
# no embeddable.
def embedding_model
  owner = embeddable
  owner.embedding_model unless owner.nil?
end

#mark_as_used! ⇒ Object



52
53
54
55
# File 'app/models/ragdoll/embedding.rb', line 52

# Records one use of this embedding: increments usage_count, then sets
# returned_at to the current time.
def mark_as_used!
  increment! :usage_count

  used_at = Time.current
  update!(returned_at: used_at)
end