Class: Ragdoll::UnifiedDocument
- Inherits:
-
ActiveRecord::Base
- Object
- ActiveRecord::Base
- Ragdoll::UnifiedDocument
- Defined in:
- app/models/ragdoll/unified_document.rb
Overview
Unified document model for a text-based RAG system. All documents have their content converted to text for unified search and embedding.
Class Method Summary collapse
-
.all_media_types ⇒ Object
Get all unique original media types.
-
.search_content(query, **options) ⇒ Object
Search content using PostgreSQL full-text search.
-
.stats ⇒ Object
Get document statistics.
Instance Method Summary collapse
-
#content ⇒ Object
Unified content access.
- #content=(value) ⇒ Object
-
#content_quality_score ⇒ Object
Content quality assessment.
-
#generate_embeddings_for_content! ⇒ Object
Generate embeddings for all content.
-
#generate_metadata! ⇒ Object
Generate structured metadata using LLM.
- #high_quality_content? ⇒ Boolean
-
#process_document! ⇒ Object
Document processing for unified text-based RAG.
- #processed? ⇒ Boolean
-
#to_hash(include_content: false) ⇒ Object
Convert document to hash representation.
- #total_character_count ⇒ Object
- #total_embedding_count ⇒ Object
-
#total_word_count ⇒ Object
Content statistics.
Class Method Details
.all_media_types ⇒ Object
Get all unique original media types
209 210 211 |
# File 'app/models/ragdoll/unified_document.rb', line 209

# Lists the distinct original media types found on joined unified contents.
#
# NULL media types are dropped in Ruby (`compact`) before sorting.
#
# @return [Array] sorted, de-duplicated `original_media_type` values
def self.all_media_types
  joins(:unified_contents)
    .distinct
    .pluck("unified_contents.original_media_type")
    .compact
    .sort
end
.search_content(query, **options) ⇒ Object
Search content using PostgreSQL full-text search
139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 |
# File 'app/models/ragdoll/unified_document.rb', line 139

# Search content using PostgreSQL full-text search.
#
# The query is lower-cased and split into unique alphanumeric words. A row
# matches when at least one word matches the title/content tsvector, and its
# `fulltext_similarity` is the fraction of query words that matched.
#
# @param query [String] free-text search string
# @param options [Hash] optional filters
# @option options [Integer] :limit maximum number of rows (default 20)
# @option options [Numeric] :threshold minimum similarity; only applied when > 0.0 (default 0.0)
# @option options [String] :status document status to match (default 'processed')
# @option options [String] :document_type restrict results to one document type
# @return [Array, Object] loaded records, or the `none` relation when the
#   query is blank or contains no alphanumeric words
# @raise [ArgumentError, TypeError] when :threshold is not numeric
def self.search_content(query, **options)
  return none if query.blank?

  words = query.downcase.scan(/[[:alnum:]]+/).uniq
  return none if words.empty?

  limit = options[:limit] || 20
  # Float() rejects non-numeric input, so the value is safe to embed in SQL.
  threshold = Float(options[:threshold] || 0.0)
  status = options[:status] || 'processed'

  # Build tsvector from title and content.
  # NOTE(review): `content` is unqualified here while the SELECT also aliases
  # an aggregate as `content`; confirm it resolves as intended with the join
  # and GROUP BY below.
  text_expr = "COALESCE(title, '') || ' ' || COALESCE(content, '')"
  tsvector = "to_tsvector('english', #{text_expr})"

  # One sanitized tsquery per word; `||` ORs them together.
  tsqueries = words.map do |word|
    sanitize_sql_array(["plainto_tsquery('english', ?)", word])
  end
  combined_tsquery = tsqueries.join(' || ')

  # Score: number of matching words divided by number of query words.
  score_sum = tsqueries.map { |tsq| "(#{tsvector} @@ #{tsq})::int" }.join(' + ')
  similarity_sql = "(#{score_sum})::float / #{words.size}"

  conditions = ["#{tsvector} @@ (#{combined_tsquery})"]
  # Bind the status instead of interpolating it: it comes from caller options.
  conditions << sanitize_sql_array(["#{table_name}.status = ?", status])
  if options[:document_type].present?
    conditions << sanitize_sql_array(["#{table_name}.document_type = ?", options[:document_type]])
  end
  conditions << "#{similarity_sql} >= #{threshold}" if threshold > 0.0

  joins(:unified_contents)
    .select("#{table_name}.*, string_agg(unified_contents.content, ' ') as content, #{similarity_sql} AS fulltext_similarity")
    .group("#{table_name}.id")
    .where(conditions.join(' AND '))
    .order(Arel.sql("fulltext_similarity DESC, updated_at DESC"))
    .limit(limit)
    .to_a
end
.stats ⇒ Object
Get document statistics
214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 |
# File 'app/models/ragdoll/unified_document.rb', line 214

# Get document statistics.
#
# Content quality buckets are based on the character length of each joined
# unified content row: high (> 1000), medium (100..1000), low (< 100).
# `with_content` / `without_content` are defined outside this listing
# (presumably scopes on this model).
#
# @return [Hash] counts keyed by :total_documents, :by_status, :by_type,
#   :with_content, :without_content, :total_unified_contents,
#   :total_embeddings, :content_quality and :storage_type
def self.stats
  with_contents = joins(:unified_contents)
  # Distinct count of joined rows whose content length satisfies the condition.
  bucket = ->(condition) { with_contents.where(condition).distinct.count }

  {
    total_documents: count,
    by_status: group(:status).count,
    by_type: group(:document_type).count,
    with_content: with_content.count,
    without_content: without_content.count,
    total_unified_contents: with_contents.count,
    total_embeddings: joins(:embeddings).count,
    content_quality: {
      high: bucket.call("LENGTH(unified_contents.content) > 1000"),
      medium: bucket.call("LENGTH(unified_contents.content) BETWEEN 100 AND 1000"),
      low: bucket.call("LENGTH(unified_contents.content) < 100")
    },
    storage_type: "unified_text_based"
  }
end
Instance Method Details
#content ⇒ Object
Unified content access
46 47 48 |
# File 'app/models/ragdoll/unified_document.rb', line 46

# Unified content access.
#
# Plucks the `content` column of every associated unified content, drops nil
# values and joins the rest with a blank line between entries.
#
# @return [String] combined text ("" when there is no non-nil content)
def content
  texts = unified_contents.pluck(:content)
  texts.compact.join("\n\n")
end
#content=(value) ⇒ Object
50 51 52 53 54 55 56 |
# File 'app/models/ragdoll/unified_document.rb', line 50

# Stores the given text as pending content.
#
# The value is always remembered in `@pending_content`; the unified content
# is only created immediately when the record is already persisted.
#
# @param value [String] text to attach to this document
def content=(value)
  @pending_content = value
  create_unified_content_from_pending if persisted?
end
#content_quality_score ⇒ Object
Content quality assessment
197 198 199 200 201 202 |
# File 'app/models/ragdoll/unified_document.rb', line 197

# Content quality assessment.
#
# Averages the `content_quality_score` of every unified content. The result
# is always a Float so integer scores are not truncated by integer division,
# matching the 0.0 returned for the empty case.
#
# @return [Float] mean score, or 0.0 when there are no unified contents
def content_quality_score
  # Read the collection once instead of an existence check plus a second read.
  scores = unified_contents.map(&:content_quality_score)
  return 0.0 if scores.empty?

  scores.sum.to_f / scores.size
end
#generate_embeddings_for_content! ⇒ Object
Generate embeddings for all content
99 100 101 |
# File 'app/models/ragdoll/unified_document.rb', line 99

# Generate embeddings for all content.
#
# Calls `generate_embeddings!` on each associated unified content.
#
# @return [Object] whatever `unified_contents.each` returns
def generate_embeddings_for_content!
  unified_contents.each(&:generate_embeddings!)
end
#generate_metadata! ⇒ Object
Generate structured metadata using LLM
104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 |
# File 'app/models/ragdoll/unified_document.rb', line 104

# Generate structured metadata for the document.
#
# Builds basic metadata (length, word count, timestamp, media type) plus one
# document-type specific source marker, merges it into the existing metadata
# and saves the record. Failures inside the generation step are reported with
# `puts` and swallowed (best-effort).
#
# NOTE(review): the page summary says "using LLM", but no LLM call is visible
# in this listing. Identifier names lost in the extracted listing
# (`metadata`, the local hash name) are reconstructed — confirm against the
# source file.
#
# @return [Object, nil] result of `save!`, or nil when there is nothing to do
#   or generation failed
def generate_metadata!
  return unless unified_contents.any?

  begin
    # Use the content for metadata generation
    full_content = content
    return if full_content.blank?

    # Generate basic metadata
    generated = {
      content_length: full_content.length,
      # String#split without arguments ignores leading whitespace, so text
      # starting with a newline or space is not over-counted by one.
      word_count: full_content.split.length,
      generated_at: Time.current,
      original_media_type: document_type
    }

    # Add document type specific metadata
    case document_type
    when "image"
      generated[:description_source] = "ai_generated"
    when "audio"
      generated[:transcript_source] = "auto_generated"
    when "video"
      generated[:content_source] = "mixed_media_conversion"
    end

    # Merge with existing metadata; tolerate a nil metadata column.
    self.metadata = (metadata || {}).merge(generated)
    save!
  rescue StandardError => e
    puts "Metadata generation failed: #{e.message}"
  end
end
#high_quality_content? ⇒ Boolean
204 205 206 |
# File 'app/models/ragdoll/unified_document.rb', line 204

# Whether the document's content quality score reaches the 0.7 cut-off.
#
# @return [Boolean] true when `content_quality_score` is at least 0.7
def high_quality_content?
  content_quality_score >= 0.7
end
#process_document! ⇒ Object
Document processing for unified text-based RAG
72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 |
# File 'app/models/ragdoll/unified_document.rb', line 72

# Document processing for unified text-based RAG.
#
# Skips documents that are already processed. Otherwise marks the document as
# "processing", converts it to text, stores the unified content, generates
# embeddings and metadata, and finally marks it "processed". On failure the
# status is set to "error", the message is recorded under the "error"
# metadata key, and the original exception is re-raised.
#
# NOTE(review): the two generation calls, `metadata` and `e.message` were
# missing from the extracted listing and are reconstructed from the
# surrounding comments and sibling methods — confirm against the source file.
#
# @return [Object, nil] result of the final `update!`, or nil when already processed
# @raise [StandardError] re-raises whatever failed during processing
def process_document!
  return if processed?

  begin
    update!(status: "processing")

    # Convert document to text using unified converter
    text_content = Ragdoll::DocumentConverter.convert_to_text(location, document_type)

    # Create or update unified content
    create_or_update_unified_content(text_content)

    # Generate embeddings
    generate_embeddings_for_content!

    # Generate metadata
    generate_metadata!

    update!(status: "processed")
  rescue StandardError => e
    puts "Document processing failed: #{e.message}"
    # metadata may be nil; merging into {} keeps the original error from
    # being masked by a NoMethodError raised here.
    update!(status: "error", metadata: (metadata || {}).merge("error" => e.message))
    raise
  end
end
#processed? ⇒ Boolean
41 42 43 |
# File 'app/models/ragdoll/unified_document.rb', line 41

# Whether this document has finished processing.
#
# @return [Boolean] true when `status` equals "processed"
def processed?
  status == "processed"
end
#to_hash(include_content: false) ⇒ Object
Convert document to hash representation
233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 |
# File 'app/models/ragdoll/unified_document.rb', line 233

# Convert document to hash representation.
#
# NOTE(review): `total_embedding_count`, `metadata` and `uc.embedding_count`
# were missing from the extracted listing and are reconstructed from the hash
# keys and sibling methods — confirm against the source file.
#
# @param include_content [Boolean] when true, also include the combined text
#   under :content and one entry per unified content under :content_details
# @return [Hash] summary of the document; timestamps are ISO 8601 strings or nil
def to_hash(include_content: false)
  # Read the combined content once; it is needed for the length and,
  # optionally, for the :content key.
  full_content = content

  summary = {
    id: id.to_s,
    title: title,
    location: location,
    document_type: document_type,
    status: status,
    content_length: full_content&.length || 0,
    word_count: total_word_count,
    embedding_count: total_embedding_count,
    content_quality_score: content_quality_score,
    file_modified_at: file_modified_at&.iso8601,
    created_at: created_at&.iso8601,
    updated_at: updated_at&.iso8601,
    metadata: metadata || {}
  }
  return summary unless include_content

  summary[:content] = full_content
  summary[:content_details] = unified_contents.map do |uc|
    {
      original_media_type: uc.original_media_type,
      content: uc.content,
      word_count: uc.word_count,
      embedding_count: uc.embedding_count,
      conversion_method: uc.conversion_method
    }
  end
  summary
end
#total_character_count ⇒ Object
63 64 65 |
# File 'app/models/ragdoll/unified_document.rb', line 63

# Sums `character_count` over all associated unified contents.
#
# @return [Integer] total characters (0 when there are no unified contents)
def total_character_count
  unified_contents.sum(&:character_count)
end
#total_embedding_count ⇒ Object
67 68 69 |
# File 'app/models/ragdoll/unified_document.rb', line 67

# Number of embeddings belonging to this document.
#
# NOTE(review): the method name and the receiver of `.count` were missing from
# the extracted listing. The name comes from the page's method summary; the
# `embeddings` receiver is inferred from the `joins(:embeddings)` association
# used by `.stats` — confirm against the source file.
#
# @return [Integer] embedding count
def total_embedding_count
  embeddings.count
end
#total_word_count ⇒ Object
Content statistics
59 60 61 |
# File 'app/models/ragdoll/unified_document.rb', line 59

# Content statistics: sums `word_count` over all associated unified contents.
#
# @return [Integer] total words (0 when there are no unified contents)
def total_word_count
  unified_contents.sum(&:word_count)
end