Class: Ragdoll::Document

Inherits:
ActiveRecord::Base
  • Object
show all
Defined in:
app/models/ragdoll/document.rb

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.all_classifications ⇒ Object

Get all unique classifications



413
414
415
# File 'app/models/ragdoll/document.rb', line 413

# Lists every distinct classification stored in document metadata.
#
# Only rows whose +metadata+ JSON contains a "classification" key are queried.
#
# @return [Array] unique, non-nil classification values in ascending order
def self.all_classifications
  # The JSON path expression is not a plain column name, so it has to be marked
  # as trusted SQL; recent ActiveRecord versions otherwise reject raw fragments
  # passed to pluck (UnknownAttributeReference).
  classification_sql = Arel.sql("metadata->>'classification'")

  where("metadata ? 'classification'")
    .distinct
    .pluck(classification_sql)
    .compact
    .sort
end

.all_keywords ⇒ Object

Get all unique keywords from metadata



399
400
401
402
403
404
405
406
407
408
409
410
# File 'app/models/ragdoll/document.rb', line 399

# Collects the distinct keywords stored under the "keywords" metadata key.
#
# A keywords value may be an Array (used as-is) or a comma-separated String
# (split on commas, each piece stripped). Values of any other type are ignored.
#
# @return [Array] unique keywords in ascending order
def self.all_keywords
  collected = where("metadata ? 'keywords'").pluck(:metadata).flat_map do |meta|
    raw = meta["keywords"]
    if raw.is_a?(Array)
      raw
    elsif raw.is_a?(String)
      raw.split(",").map(&:strip)
    else
      []
    end
  end

  collected.uniq.sort
end

.all_tags ⇒ Object

Get all unique tags



418
419
420
421
422
423
424
# File 'app/models/ragdoll/document.rb', line 418

# Collects the distinct tags stored under the "tags" metadata key.
#
# Each row's value is coerced with Array(), so a single scalar tag and a list
# of tags are both accepted.
#
# @return [Array] unique tags in ascending order
def self.all_tags
  rows = where("metadata ? 'tags'").pluck(:metadata)
  rows.flat_map { |meta| Array(meta["tags"]) }.uniq.sort
end

.extract_keywords(query:) ⇒ Object

Extract keywords from query string (words > 4 characters)



491
492
493
494
495
496
497
498
# File 'app/models/ragdoll/document.rb', line 491

# Picks the "significant" words out of a free-text query.
#
# The query is split on whitespace and only words longer than four characters
# are kept, in their original order (duplicates are not removed).
#
# @param query [String, nil] raw query text
# @return [Array<String>] words with more than 4 characters; [] for nil or
#   whitespace-only input
def self.extract_keywords(query:)
  return [] if query.nil? || query.strip.empty?

  tokens = query.split(/\s+/).map(&:strip)
  tokens.reject { |token| token.empty? || token.length <= 4 }
end

.faceted_search(query: nil, keywords: [], classification: nil, tags: [], **options) ⇒ Object

Faceted search by metadata fields



367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
# File 'app/models/ragdoll/document.rb', line 367

# Filters documents by metadata facets, optionally combined with PostgreSQL
# full-text search. All supplied filters are ANDed together.
#
# @param query [String, nil] free text matched with plainto_tsquery against the
#   title and the summary/keywords/description metadata fields
# @param keywords [Array<String>, String, nil] each keyword must occur
#   (case-insensitively, as a literal substring) in metadata->>'keywords'
# @param classification [String, nil] exact match on metadata->>'classification'
# @param tags [Array<String>, String, nil] each tag must be contained in the
#   metadata 'tags' JSON array
# @param options [Hash] :limit caps the number of rows (default 20)
# @return [ActiveRecord::Relation] the filtered, limited scope
def self.faceted_search(query: nil, keywords: [], classification: nil, tags: [], **options)
  scope = all

  # Array() lets callers pass nil or a single value without raising.
  Array(keywords).each do |keyword|
    # Escape LIKE wildcards so "%" and "_" in a keyword match literally.
    pattern = "%#{sanitize_sql_like(keyword.to_s)}%"
    scope = scope.where("metadata->>'keywords' ILIKE ?", pattern)
  end

  scope = scope.where("metadata->>'classification' = ?", classification) if classification.present?

  Array(tags).each do |tag|
    # A named bind is used on purpose: combining the JSONB key-exists operator
    # "?" with a positional "?" placeholder makes ActiveRecord count two
    # placeholders for one value and raise. The explicit key check is not
    # needed either, because "@>" evaluates to NULL (row excluded) when the
    # 'tags' key is absent.
    scope = scope.where("metadata->'tags' @> :tag_json", tag_json: [tag].to_json)
  end

  # Apply PostgreSQL full-text search if query provided
  if query.present?
    scope = scope.where(
      "to_tsvector('english', COALESCE(title, '') || ' ' || COALESCE(metadata->>'summary', '') || ' ' || COALESCE(metadata->>'keywords', '') || ' ' || COALESCE(metadata->>'description', '')) @@ plainto_tsquery('english', ?)",
      query
    )
  end

  scope.limit(options[:limit] || 20)
end

.hybrid_search(query, query_embedding: nil, **options) ⇒ Object

Hybrid search combining semantic and PostgreSQL full-text search



441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
# File 'app/models/ragdoll/document.rb', line 441

# Merges semantic (embedding) hits and PostgreSQL full-text hits into a single
# ranked list.
#
# Semantic hits come from embeddings_search and are weighted by
# :semantic_weight. Full-text hits come from search_content and receive a
# rank-based score, (limit - rank) / limit, weighted by :text_weight. Hits that
# share a :document_id are collapsed: the highest-weighted hit is kept, its
# :combined_score becomes the sum of all weighted scores for that document, and
# :search_types lists which searches found it.
#
# @param query [String] text passed to search_content
# @param query_embedding [Object, nil] passed to embeddings_search; semantic
#   search is skipped when nil
# @param options [Hash] :limit (default 20), :semantic_weight (default 0.7),
#   :text_weight (default 0.3)
# @return [Array<Hash>] at most :limit result hashes, highest combined score first
def self.hybrid_search(query, query_embedding: nil, **options)
  max_results = options[:limit] || 20
  semantic_factor = options[:semantic_weight] || 0.7
  text_factor = options[:text_weight] || 0.3

  semantic_hits =
    if query_embedding
      embeddings_search(query_embedding, limit: max_results).map do |hit|
        hit.merge(
          search_type: "semantic",
          weighted_score: hit[:combined_score] * semantic_factor
        )
      end
    else
      []
    end

  text_hits = search_content(query, limit: max_results).each_with_index.map do |doc, rank|
    {
      document_id: doc.id.to_s,
      document_title: doc.title,
      document_location: doc.location,
      content: doc.content[0..500], # Preview
      search_type: "full_text",
      # Earlier ranks score higher; the first hit gets the full text weight.
      weighted_score: (max_results - rank).to_f / max_results * text_factor,
      document: doc
    }
  end

  merged = (semantic_hits + text_hits).group_by { |hit| hit[:document_id] }.map do |_doc_id, hits|
    strongest = hits.max_by { |hit| hit[:weighted_score] }

    strongest.merge(
      combined_score: hits.sum { |hit| hit[:weighted_score] },
      search_types: hits.map { |hit| hit[:search_type] }.uniq
    )
  end

  merged.sort_by { |hit| -hit[:combined_score] }.take(max_results)
end

.keyword_frequencies ⇒ Object

Get keyword frequencies for faceted search



427
428
429
430
431
432
433
434
435
436
437
438
# File 'app/models/ragdoll/document.rb', line 427

# Counts how often each keyword occurs across documents that have a
# "keywords" metadata key.
#
# Array values are counted element by element; String values are split on
# commas and stripped first; any other value type is skipped.
#
# @return [Hash] keyword => occurrence count, most frequent first
def self.keyword_frequencies
  rows = where("metadata ? 'keywords'").pluck(:metadata)

  counts = rows.each_with_object(Hash.new(0)) do |meta, tally|
    raw = meta["keywords"]
    terms =
      if raw.is_a?(Array)
        raw
      elsif raw.is_a?(String)
        raw.split(",").map(&:strip)
      else
        []
      end

    terms.each { |term| tally[term] += 1 }
  end

  counts.sort_by { |_term, count| -count }.to_h
end

.search_by_keywords(keywords_array, **options) ⇒ Object

Search documents by keywords using PostgreSQL array operations. Returns documents that match keywords with scoring based on match count. Inspired by the find_matching_entries.rb algorithm, but optimized for PostgreSQL arrays.



332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
# File 'app/models/ragdoll/document.rb', line 332

# Finds documents whose +keywords+ array shares at least one element with the
# given keywords (PostgreSQL array overlap, "&&").
#
# Keywords are stringified and lower-cased before matching; empty strings are
# dropped. Results are ordered newest first.
#
# @param keywords_array [Array, String, nil] keywords to look for
# @param options [Hash] :limit caps the number of rows (default 20)
# @return [ActiveRecord::Relation] matching documents; an always-empty
#   relation when no usable keyword is given
def self.search_by_keywords(keywords_array, **options)
  return where("1 = 0") if keywords_array.blank?

  normalized_keywords = Array(keywords_array).map { |keyword| keyword.to_s.downcase }.reject(&:empty?)
  return where("1 = 0") if normalized_keywords.empty?

  # Bind the keywords instead of interpolating them into an array literal:
  # ActiveRecord expands an Array bind into a quoted, comma-separated list, so
  # quotes or braces inside a keyword cannot break out of the SQL.
  where("keywords && ARRAY[?]::text[]", normalized_keywords)
    .order("created_at DESC")
    .limit(options[:limit] || 20)
end

.search_by_keywords_all(keywords_array, **options) ⇒ Object

Find documents that contain ALL specified keywords (exact array matching)



350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
# File 'app/models/ragdoll/document.rb', line 350

# Finds documents whose +keywords+ array contains every one of the given
# keywords (PostgreSQL array containment, "@>").
#
# Keywords are stringified and lower-cased before matching; empty strings are
# dropped. Results are ordered newest first.
#
# @param keywords_array [Array, String, nil] keywords that must all be present
# @param options [Hash] :limit caps the number of rows (default 20)
# @return [ActiveRecord::Relation] matching documents; an always-empty
#   relation when no usable keyword is given
def self.search_by_keywords_all(keywords_array, **options)
  return where("1 = 0") if keywords_array.blank?

  normalized_keywords = Array(keywords_array).map { |keyword| keyword.to_s.downcase }.reject(&:empty?)
  return where("1 = 0") if normalized_keywords.empty?

  # Bind the keywords instead of interpolating them into an array literal:
  # ActiveRecord expands an Array bind into a quoted, comma-separated list, so
  # quotes or braces inside a keyword cannot break out of the SQL.
  where("keywords @> ARRAY[?]::text[]", normalized_keywords)
    .order("created_at DESC")
    .limit(options[:limit] || 20)
end

.search_content(query, **options) ⇒ Object

PostgreSQL full-text search on metadata fields with per-word match-ratio [0.0..1.0]



261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
# File 'app/models/ragdoll/document.rb', line 261

# PostgreSQL full-text search that also reports, per document, the fraction of
# distinct query words it matched.
#
# The query is lower-cased and split into unique alphanumeric words. A row
# matches when its text vector matches at least one word; the selected column
# +fulltext_similarity+ is (number of words matched) / (number of words),
# i.e. a value in 0.0..1.0.
#
# @param query [String, nil] search text
# @param options [Hash]
#   :limit         maximum number of rows (default 20)
#   :threshold     minimum fulltext_similarity, applied only when > 0.0 (default 0.0)
#   :status        required document status (default 'processed')
#   :document_type optional exact document_type filter
# @return [Array] loaded records ordered by fulltext_similarity, then
#   updated_at, both descending; `none` when the query has no usable words
def self.search_content(query, **options)
  return none if query.blank?

  # Split into unique alphanumeric words
  words = query.downcase.scan(/[[:alnum:]]+/).uniq
  return none if words.empty?

  limit = options[:limit] || 20
  threshold = options[:threshold] || 0.0

  # Use precomputed tsvector column if it exists, otherwise build on the fly
  tsvector =
    if column_names.include?("search_vector")
      "#{table_name}.search_vector"
    else
      text_expr =
        "COALESCE(title, '') || ' ' || " \
        "COALESCE(metadata->>'summary', '') || ' ' || " \
        "COALESCE(metadata->>'keywords', '') || ' ' || " \
        "COALESCE(metadata->>'description', '')"
      "to_tsvector('english', #{text_expr})"
    end

  # One sanitized tsquery per word
  tsqueries = words.map do |word|
    sanitize_sql_array(["plainto_tsquery('english', ?)", word])
  end

  # "||" ORs tsqueries together, so a row matches when any word matches
  combined_tsquery = tsqueries.join(' || ')

  # Each word contributes 1 when present and 0 when absent
  score_sum = tsqueries.map { |tsq| "(#{tsvector} @@ #{tsq})::int" }.join(' + ')

  # Similarity ratio: fraction of query words present
  similarity_sql = "(#{score_sum})::float / #{words.size}"

  conditions = ["#{tsvector} @@ (#{combined_tsquery})"]

  # Status filter (defaults to processed). The value is caller-supplied, so it
  # is quoted through sanitize_sql_array rather than interpolated.
  status = options[:status] || 'processed'
  conditions << sanitize_sql_array(["#{table_name}.status = ?", status])

  # Add document type filter if specified
  if options[:document_type].present?
    conditions << sanitize_sql_array(["#{table_name}.document_type = ?", options[:document_type]])
  end

  # Float() guarantees a plain numeric literal ends up in the SQL text.
  conditions << "#{similarity_sql} >= #{Float(threshold)}" if threshold > 0.0

  # Materialize to array to avoid COUNT/SELECT alias conflicts in some AR versions
  select("#{table_name}.*, #{similarity_sql} AS fulltext_similarity")
    .where(conditions.join(' AND '))
    .order(Arel.sql("fulltext_similarity DESC, updated_at DESC"))
    .limit(limit)
    .to_a
end

Instance Method Details

#add_keyword(keyword) ⇒ Object



157
158
159
160
161
162
163
164
165
166
# File 'app/models/ragdoll/document.rb', line 157

# Appends a keyword to this document's keywords unless it is already present.
#
# The keyword is stringified, stripped and lower-cased before being stored;
# the duplicate check is case-insensitive. Only assigns the attribute; nothing
# is saved here.
#
# @param keyword [Object] keyword to add; blank values are ignored
# @return [Array<String>, nil] the new keyword list, or nil when nothing changed
def add_keyword(keyword)
  return if keyword.blank?

  candidate = keyword.to_s.strip.downcase
  existing = keywords_array
  return if existing.any? { |current| current.downcase == candidate }

  self.keywords = existing.push(candidate)
end

#all_embeddings(content_type: nil) ⇒ Object



523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
# File 'app/models/ragdoll/document.rb', line 523

# Returns the embeddings attached to this document's content rows.
#
# Content ids are gathered from the <type>_contents associations (all of text,
# image and audio unless +content_type+ narrows it to one) and then looked up
# as polymorphic embeddables.
#
# @param content_type [String, Symbol, nil] restrict to one content type
# @return [ActiveRecord::Relation] embeddings for those contents, or an empty
#   relation when the document has no matching content
def all_embeddings(content_type: nil)
  kinds = content_type ? [content_type.to_s] : %w[text image audio]

  content_ids = kinds.flat_map do |kind|
    relation = send("#{kind}_contents")
    relation.any? ? relation.pluck(:id) : []
  end

  return Ragdoll::Embedding.none if content_ids.empty?

  # Use the base STI class name 'Ragdoll::Content' as that's what's stored
  # in polymorphic associations with STI
  Ragdoll::Embedding.where(embeddable_type: "Ragdoll::Content", embeddable_id: content_ids)
end

#classification ⇒ Object



186
187
188
# File 'app/models/ragdoll/document.rb', line 186

# Reads the "classification" entry of this document's metadata hash.
#
# NOTE(review): the receiver was missing in the rendered listing (the body read
# `["classification"]`, which always returns a one-element array); `metadata`
# is restored here because the class-level queries read the same key from the
# metadata column.
#
# @return [Object, nil] the stored classification, or nil when unset
def classification
  metadata["classification"]
end

#classification=(value) ⇒ Object



190
191
192
# File 'app/models/ragdoll/document.rb', line 190

# Stores +value+ under the "classification" key of the metadata hash.
#
# A new hash is assigned (merge, not in-place mutation) so the attribute
# setter is invoked. Nothing is saved here.
#
# NOTE(review): the identifiers were missing in the rendered listing
# (`self. = .merge(...)`); `metadata` is restored to match the reader.
#
# @param value [Object] classification to store
def classification=(value)
  self.metadata = metadata.merge("classification" => value)
end

#content ⇒ Object

Dynamic content method that forwards to appropriate content table



88
89
90
91
92
93
94
95
96
97
98
99
100
101
# File 'app/models/ragdoll/document.rb', line 88

# Returns this document's content as one string.
#
# When the primary content type is text, image or audio, the +content+ column
# of the matching <type>_contents rows is used (text content, image
# descriptions or audio transcripts respectively); otherwise every row in
# +contents+ is used. Nil values are skipped and the rest joined with blank
# lines.
#
# @return [String] combined content; "" when there is none
def content
  kind = primary_content_type
  source = %w[text image audio].include?(kind) ? send("#{kind}_contents") : contents

  source.pluck(:content).compact.join("\n\n")
end

#content=(value) ⇒ Object

Set content method for backwards compatibility



104
105
106
107
108
109
110
111
112
# File 'app/models/ragdoll/document.rb', line 104

# Backwards-compatible content setter.
#
# The value is remembered in @pending_content. When the document is already
# persisted, create_content_from_pending is called right away; otherwise the
# value just stays pending.
#
# @param value [Object] content to store
def content=(value)
  @pending_content = value

  create_content_from_pending if persisted?
end

#content_types ⇒ Object



74
75
76
77
78
# File 'app/models/ragdoll/document.rb', line 74

# Lists which content types this document actually has rows for.
#
# @return [Array<String>] subset of "text", "image", "audio" (in that order)
#   whose <type>_contents association is non-empty
def content_types
  candidates = %w[text image audio]
  candidates.select { |kind| send("#{kind}_contents").any? }
end

#description ⇒ Object

Metadata accessors for common fields



178
179
180
# File 'app/models/ragdoll/document.rb', line 178

# Reads the "description" entry of this document's metadata hash.
#
# NOTE(review): the receiver was missing in the rendered listing (the body read
# `["description"]`, which always returns a one-element array); `metadata` is
# restored here, consistent with the full-text queries that read
# metadata->>'description'.
#
# @return [Object, nil] the stored description, or nil when unset
def description
  metadata["description"]
end

#description=(value) ⇒ Object



182
183
184
# File 'app/models/ragdoll/document.rb', line 182

# Stores +value+ under the "description" key of the metadata hash.
#
# A new hash is assigned (merge, not in-place mutation) so the attribute
# setter is invoked. Nothing is saved here.
#
# NOTE(review): the identifiers were missing in the rendered listing
# (`self. = .merge(...)`); `metadata` is restored to match the reader.
#
# @param value [Object] description to store
def description=(value)
  self.metadata = metadata.merge("description" => value)
end

#embeddings_by_type ⇒ Object



127
128
129
130
131
# File 'app/models/ragdoll/document.rb', line 127

# Counts embeddings per content type.
#
# @return [Hash{Symbol => Integer}] :text, :image and :audio mapped to the
#   count of the corresponding <type>_embeddings association
def embeddings_by_type
  %w[text image audio].map { |kind| [kind.to_sym, send("#{kind}_embeddings").count] }.to_h
end

#generate_embeddings_for_all_content! ⇒ Object

Generate embeddings for all content types



232
233
234
235
236
# File 'app/models/ragdoll/document.rb', line 232

# Calls generate_embeddings! on every text, image and audio content row of
# this document, in that order.
#
# @return [Array<String>] the list of content type names that was iterated
def generate_embeddings_for_all_content!
  %w[text image audio].each do |kind|
    send("#{kind}_contents").each { |item| item.generate_embeddings! }
  end
end

#generate_metadata! ⇒ Object

Generate structured metadata using LLM



239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
# File 'app/models/ragdoll/document.rb', line 239

# Generates structured metadata for this document through
# Ragdoll::MetadataGenerator, checks it with Ragdoll::MetadataSchemas, merges
# the result and saves the record.
#
# Any StandardError raised along the way (including from save!) is rescued,
# logged and printed rather than re-raised, so callers are not told about the
# failure; in that case the method returns the result of `puts` (nil).
#
# NOTE(review): several identifiers are missing from this listing -- the
# assignment target on the generator line, the method name called on
# Ragdoll::MetadataSchemas together with its second argument, and the receiver
# and argument of the final merge. As shown the code does not parse; restore
# the names from app/models/ragdoll/document.rb before relying on it.
def generate_metadata!
  # Loaded on demand, relative to this file.
  require_relative "../../lib/ragdoll/core/services/metadata_generator"

  generator = Ragdoll::MetadataGenerator.new
   = generator.generate_for_document(self)

  # Validate metadata against schema
  # Validation errors are only reported (logger + stdout); they do not stop
  # the merge and save below.
  errors = Ragdoll::MetadataSchemas.(document_type, )
  if errors.any?
    Rails.logger.warn "Metadata validation errors: #{errors.join(', ')}" if defined?(Rails)
    puts "Metadata validation errors: #{errors.join(', ')}"
  end

  # Merge with existing metadata (preserving user-set values)
  # NOTE(review): Hash#merge lets the argument win on duplicate keys, so
  # whether user-set values are really preserved depends on the stripped
  # receiver/argument -- confirm against the real source.
  self. = .merge()
  save!
rescue StandardError => e
  # Failures are logged (when Rails is loaded) and always printed to stdout.
  Rails.logger.error "Metadata generation failed: #{e.message}" if defined?(Rails)
  puts "Metadata generation failed: #{e.message}"
end

#has_files? ⇒ Boolean

File-related helper methods - now delegated to content models

Returns:

  • (Boolean)


203
204
205
# File 'app/models/ragdoll/document.rb', line 203

# Tells whether any content row of this document carries data.
#
# @return [Boolean] true when at least one entry in +contents+ has non-blank +data+
def has_files?
  contents.any? { |item| !item.data.blank? }
end

#has_keywords? ⇒ Boolean

Returns:

  • (Boolean)


138
139
140
# File 'app/models/ragdoll/document.rb', line 138

# Tells whether the document has any keywords.
#
# @return [Boolean] true when +keywords+ is not blank
def has_keywords?
  !keywords.blank?
end

#has_summary? ⇒ Boolean

Document metadata methods - now using dedicated columns

Returns:

  • (Boolean)


134
135
136
# File 'app/models/ragdoll/document.rb', line 134

# Tells whether the document has a summary.
#
# @return [Boolean] true when +summary+ is not blank
def has_summary?
  !summary.blank?
end

#keywords_array ⇒ Object



142
143
144
145
146
147
148
149
150
151
152
153
154
155
# File 'app/models/ragdoll/document.rb', line 142

# Returns the document's keywords as a clean array of strings.
#
# +keywords+ is expected to be an Array (elements are stringified); a
# comma-separated String is still accepted as a fallback for legacy data. In
# both cases entries are stripped and empty ones dropped. Any other value
# yields [].
#
# @return [Array<String>] normalized keywords; [] when keywords is blank
def keywords_array
  raw = keywords
  return [] if raw.blank?

  pieces =
    if raw.is_a?(Array)
      raw.map(&:to_s)
    elsif raw.is_a?(String)
      raw.split(",")
    else
      []
    end

  pieces.map(&:strip).reject(&:empty?)
end

#multi_modal? ⇒ Boolean

Multi-modal content type detection

Returns:

  • (Boolean)


70
71
72
# File 'app/models/ragdoll/document.rb', line 70

# Tells whether the document holds more than one kind of content.
#
# @return [Boolean] true when content_types lists at least two types
def multi_modal?
  content_types.size > 1
end

#primary_content_type ⇒ Object



80
81
82
83
84
85
# File 'app/models/ragdoll/document.rb', line 80

# Decides which content type represents this document.
#
# Uses document_type when it is one of text/image/audio; otherwise the first
# content type the document actually has; otherwise "text".
#
# @return [String] "text", "image", "audio", or the first available content type
def primary_content_type
  if %w[text image audio].include?(document_type)
    document_type
  else
    available = content_types
    available.any? ? available.first : "text" # default
  end
end

#primary_file_type ⇒ Object



212
213
214
215
# File 'app/models/ragdoll/document.rb', line 212

# Returns the document's primary file type, which is simply its document_type.
#
# @return [Object] the value of +document_type+
def primary_file_type
  # Return the document_type as the primary file type
  document_type
end

#process_content! ⇒ Object

Content processing for multi-modal documents



218
219
220
221
222
223
224
225
226
227
228
229
# File 'app/models/ragdoll/document.rb', line 218

# Runs the processing pipeline for this document: generate embeddings for all
# content, generate metadata, then mark the document as processed.
#
# The steps run strictly in this order. An exception from embedding generation
# or from update! propagates to the caller.
def process_content!
  # Content processing is now handled by individual content models
  # This method orchestrates the overall processing

  # Generate embeddings for all content
  generate_embeddings_for_all_content!

  # Generate structured metadata using LLM
  # NOTE(review): generate_metadata! rescues StandardError itself, so a
  # metadata failure does not prevent the status update below.
  generate_metadata!

  # update! raises if the record cannot be saved.
  update!(status: "processed")
end

#processed? ⇒ Boolean

Returns:

  • (Boolean)


65
66
67
# File 'app/models/ragdoll/document.rb', line 65

# Tells whether processing has finished for this document.
#
# @return [Boolean] true when +status+ equals "processed"
def processed?
  status == "processed"
end

#remove_keyword(keyword) ⇒ Object



168
169
170
171
172
173
174
175
# File 'app/models/ragdoll/document.rb', line 168

# Removes a keyword from this document's keywords.
#
# The keyword is stringified, stripped and lower-cased, and every stored
# keyword equal to it (case-insensitively) is dropped. Only assigns the
# attribute; nothing is saved here.
#
# @param keyword [Object] keyword to remove; blank values are ignored
# @return [Array<String>, nil] the remaining keywords, or nil for blank input
def remove_keyword(keyword)
  return if keyword.blank?

  target = keyword.to_s.strip.downcase
  self.keywords = keywords_array.reject { |existing| existing.downcase == target }
end

#search_data ⇒ Object

Get search data for indexing



501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
# File 'app/models/ragdoll/document.rb', line 501

# Builds a flat hash describing this document for search indexing.
#
# Core attributes and content statistics use symbol keys. Document metadata
# entries are added under "metadata_<key>" and file metadata entries under
# "file_<key>" (string keys) when present.
#
# NOTE(review): the receivers of the two merges were missing in the rendered
# listing. `metadata` is restored for the first one; `file_metadata` for the
# second is inferred from the "file_" prefix and the original comment --
# confirm against app/models/ragdoll/document.rb.
#
# @return [Hash] search/index payload
def search_data
  data = {
    title: title,
    document_type: document_type,
    location: location,
    status: status,
    total_word_count: total_word_count,
    total_character_count: total_character_count,
    total_embedding_count: total_embedding_count,
    content_types: content_types,
    multi_modal: multi_modal?
  }

  # Add document metadata
  data.merge!(metadata.transform_keys { |k| "metadata_#{k}" }) if metadata.present?

  # Add file metadata
  data.merge!(file_metadata.transform_keys { |k| "file_#{k}" }) if file_metadata.present?

  data
end

#tags ⇒ Object



194
195
196
# File 'app/models/ragdoll/document.rb', line 194

# Reads the "tags" entry of this document's metadata hash.
#
# NOTE(review): the receiver was missing in the rendered listing (the body read
# `["tags"] || []`, which always returns `["tags"]`); `metadata` is restored
# here because the class-level tag queries read the same key from the metadata
# column.
#
# @return [Object] the stored tags, or [] when the key is unset
def tags
  metadata["tags"] || []
end

#tags=(value) ⇒ Object



198
199
200
# File 'app/models/ragdoll/document.rb', line 198

# Stores +value+ (coerced to an Array) under the "tags" key of the metadata hash.
#
# A new hash is assigned (merge, not in-place mutation) so the attribute
# setter is invoked. Nothing is saved here.
#
# NOTE(review): the identifiers were missing in the rendered listing
# (`self. = .merge(...)`); `metadata` is restored to match the reader.
#
# @param value [Array, Object, nil] one tag or a list of tags; nil becomes []
def tags=(value)
  self.metadata = metadata.merge("tags" => Array(value))
end

#to_hash(include_content: false) ⇒ Object

Convert document to hash representation for API responses



646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
# File 'app/models/ragdoll/document.rb', line 646

# Converts the document to a plain hash for API responses.
#
# Timestamps are rendered as ISO 8601 strings (nil when unset), the id as a
# string, and nil metadata as {}. :content_summary carries per-type content
# counts and the embedding count.
#
# NOTE(review): in the rendered listing the metadata value was missing
# (`metadata:  || {}`) and the embedding count called `total_embeddings_count`,
# a name not defined anywhere in the visible source; the model defines
# `total_embedding_count`, which is used here. Confirm against the real file.
#
# @param include_content [Boolean] when true, adds :content_details with the
#   text contents, image descriptions and audio transcripts
# @return [Hash] serializable representation of the document
def to_hash(include_content: false)
  {
    id: id.to_s,
    title: title,
    location: location,
    document_type: document_type,
    status: status,
    content_length: content&.length || 0,
    file_modified_at: file_modified_at&.iso8601,
    created_at: created_at&.iso8601,
    updated_at: updated_at&.iso8601,
    metadata: metadata || {},
    content_summary: {
      text_contents: text_contents.count,
      image_contents: image_contents.count,
      audio_contents: audio_contents.count,
      embeddings_count: total_embedding_count,
      embeddings_ready: status == "processed"
    }
  }.tap do |hash|
    if include_content
      hash[:content_details] = {
        text_content: text_contents.map(&:content),
        image_descriptions: image_contents.map(&:description),
        audio_transcripts: audio_contents.map(&:transcript)
      }
    end
  end
end

#total_character_count ⇒ Object



119
120
121
# File 'app/models/ragdoll/document.rb', line 119

# Adds up character_count over all text content rows.
#
# @return [Numeric] total character count; 0 when there is no text content
def total_character_count
  text_contents.sum(&:character_count)
end

#total_embedding_count ⇒ Object



123
124
125
# File 'app/models/ragdoll/document.rb', line 123

# Counts embeddings across the text, image and audio embedding associations.
#
# @return [Integer] sum of the three <type>_embeddings counts
def total_embedding_count
  %w[text image audio].map { |kind| send("#{kind}_embeddings").count }.sum
end

#total_file_size ⇒ Object



207
208
209
210
# File 'app/models/ragdoll/document.rb', line 207

# Adds up the "file_size" entry of each content row's metadata.
#
# Rows without a "file_size" entry count as 0.
#
# NOTE(review): the attribute name was missing in the rendered listing
# (`c..dig("file_size")`); `metadata` is restored based on the original comment
# about summing file sizes from content metadata.
#
# @return [Numeric] total file size over all contents
def total_file_size
  contents.sum { |c| c.metadata.dig("file_size") || 0 }
end

#total_word_count ⇒ Object

Content statistics



115
116
117
# File 'app/models/ragdoll/document.rb', line 115

# Adds up word_count over all text content rows.
#
# @return [Numeric] total word count; 0 when there is no text content
def total_word_count
  text_contents.sum(&:word_count)
end