Class: Ragdoll::UnifiedContent
- Inherits:
-
ActiveRecord::Base
- Object
- ActiveRecord::Base
- Ragdoll::UnifiedContent
- Defined in:
- app/models/ragdoll/unified_content.rb
Overview
Unified content model for a text-based RAG system. All content types (text, image, audio, video) are converted to text and stored in a single content field for unified embedding generation.
Class Method Summary collapse
-
.search_content(query, **options) ⇒ Object
Search within this content type.
-
.stats ⇒ Object
Get statistics for all content.
Instance Method Summary collapse
- #audio_content? ⇒ Boolean
-
#audio_duration ⇒ Object
Audio-specific metadata.
- #audio_duration=(value) ⇒ Object
- #character_count ⇒ Object
-
#content_quality_score ⇒ Object
Content quality scoring.
- #conversion_method ⇒ Object
- #conversion_method=(value) ⇒ Object
- #embedding_count ⇒ Object
- #file_size ⇒ Object
- #file_size=(value) ⇒ Object
-
#generate_embeddings! ⇒ Object
Generate embeddings for this content.
- #image_content? ⇒ Boolean
- #image_dimensions ⇒ Object
- #image_height ⇒ Object
-
#image_width ⇒ Object
Image-specific metadata (for backward compatibility).
-
#original_filename ⇒ Object
Original media metadata.
- #original_filename=(value) ⇒ Object
-
#should_generate_embeddings? ⇒ Boolean
Whether this content should generate embeddings.
-
#text_content? ⇒ Boolean
Media type specific accessors for backward compatibility.
- #video_content? ⇒ Boolean
-
#word_count ⇒ Object
Statistics.
Class Method Details
.search_content(query, **options) ⇒ Object
Search within this content type
188 189 190 191 192 193 194 195 |
# File 'app/models/ragdoll/unified_content.rb', line 188
# Full-text search over the `content` column using the SQL functions
# `to_tsvector` / `plainto_tsquery` with the 'english' configuration.
#
# @param query [String] search phrase; passed as a bound parameter
# @param options [Hash] only `:limit` is read (defaults to 20)
# @return [ActiveRecord::Relation] matching rows, or `none` for a blank query
def self.search_content(query, **options)
  return none if query.blank?

  where(
    "to_tsvector('english', COALESCE(content, '')) @@ plainto_tsquery('english', ?)",
    query
  ).limit(options[:limit] || 20)
end
.stats ⇒ Object
Get statistics for all content
198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 |
# File 'app/models/ragdoll/unified_content.rb', line 198
# Aggregate statistics across all content rows.
#
# NOTE(review): the receivers of the `with_embeddings` / `without_embeddings`
# counts were stripped from this listing; they are restored here as scopes
# named after their hash keys — confirm against the source file.
#
# @return [Hash] counts, SQL-computed averages and a length-based
#   quality distribution
def self.stats
  {
    total_contents: count,
    by_media_type: group(:original_media_type).count,
    by_model: group(:embedding_model).count,
    total_embeddings: joins(:embeddings).count,
    with_embeddings: with_embeddings.count,
    without_embeddings: without_embeddings.count,
    # Word count is approximated in SQL as (number of spaces + 1).
    average_word_count: average("LENGTH(content) - LENGTH(REPLACE(content, ' ', '')) + 1"),
    average_character_count: average("LENGTH(content)"),
    content_quality_distribution: {
      high: where("LENGTH(content) > 1000").count,
      medium: where("LENGTH(content) BETWEEN 100 AND 1000").count,
      low: where("LENGTH(content) < 100").count
    }
  }
end
Instance Method Details
#audio_content? ⇒ Boolean
91 92 93 |
# File 'app/models/ragdoll/unified_content.rb', line 91
# @return [Boolean] true when the original media type is "audio"
def audio_content?
  media_type = original_media_type
  media_type == "audio"
end
#audio_duration ⇒ Object
Audio-specific metadata
142 143 144 |
# File 'app/models/ragdoll/unified_content.rb', line 142
# Reads the "duration" entry from the metadata hash.
#
# NOTE(review): the `metadata` receiver was stripped from this listing and is
# restored here — confirm against the source file.
#
# @return [Object, nil] stored duration, or nil when the key is absent
def audio_duration
  metadata.dig("duration")
end
#audio_duration=(value) ⇒ Object
146 147 148 |
# File 'app/models/ragdoll/unified_content.rb', line 146
# Stores +value+ under the "duration" key by assigning a merged copy of the
# metadata hash (other keys are kept).
#
# NOTE(review): the `metadata` identifiers were stripped from this listing and
# are restored here — confirm against the source file.
#
# @param value [Object] duration to store
def audio_duration=(value)
  self.metadata = metadata.merge("duration" => value)
end
#character_count ⇒ Object
74 75 76 |
# File 'app/models/ragdoll/unified_content.rb', line 74
# @return [Integer] length of `content`, or 0 when content is nil
def character_count
  content.nil? ? 0 : content.length
end
#content_quality_score ⇒ Object
Content quality scoring
151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 |
# File 'app/models/ragdoll/unified_content.rb', line 151
# Heuristic quality score in the range 0.0..1.0.
#
# The score is the sum of three parts: a flat 0.3 for having content, up to
# 0.4 weighted by word count, and up to 0.3 weighted by original media type.
#
# @return [Float] 0.0 for blank content, otherwise the capped sum
def content_quality_score
  return 0.0 if content.blank?

  # Flat baseline for having any content at all.
  total = 0.0
  total += 0.3

  # Length component: 51-500 words scores highest.
  words = word_count
  if words > 0
    length_factor =
      if words <= 10 then 0.1
      elsif words <= 50 then 0.5
      elsif words <= 500 then 1.0
      elsif words <= 2000 then 0.9
      elsif words <= 5000 then 0.7
      else 0.5
      end
    total += length_factor * 0.4
  end

  # Media-type component: converted media scores lower when the text contains
  # the "<Type> file:" marker.
  media = original_media_type
  type_factor =
    if %w[text markdown].include?(media)
      1.0
    elsif %w[pdf docx html].include?(media)
      0.9
    elsif media == "image"
      content.include?("Image file:") ? 0.3 : 0.8
    elsif media == "audio"
      content.include?("Audio file:") ? 0.3 : 0.8
    elsif media == "video"
      content.include?("Video file:") ? 0.3 : 0.7
    else
      0.5
    end
  total += type_factor * 0.3

  # Cap at 1.0
  total > 1.0 ? 1.0 : total
end
#conversion_method ⇒ Object
116 117 118 |
# File 'app/models/ragdoll/unified_content.rb', line 116
# Reads the "conversion_method" entry from the metadata hash.
#
# NOTE(review): the `metadata` receiver was stripped from this listing and is
# restored here — confirm against the source file.
#
# @return [Object, nil] stored value, or nil when the key is absent
def conversion_method
  metadata.dig("conversion_method")
end
#conversion_method=(value) ⇒ Object
120 121 122 |
# File 'app/models/ragdoll/unified_content.rb', line 120
# Stores +value+ under the "conversion_method" key by assigning a merged copy
# of the metadata hash (other keys are kept).
#
# NOTE(review): the `metadata` identifiers were stripped from this listing and
# are restored here — confirm against the source file.
#
# @param value [Object] conversion method to store
def conversion_method=(value)
  self.metadata = metadata.merge("conversion_method" => value)
end
#embedding_count ⇒ Object
78 79 80 |
# File 'app/models/ragdoll/unified_content.rb', line 78
# Number of embeddings attached to this content.
#
# NOTE(review): the method name and the `embeddings` receiver were stripped
# from this listing; the name is taken from the page's method heading and the
# receiver from the `joins(:embeddings)` association used in `.stats` —
# confirm against the source file.
#
# @return [Integer]
def embedding_count
  embeddings.count
end
#file_size ⇒ Object
108 109 110 |
# File 'app/models/ragdoll/unified_content.rb', line 108
# Reads the "file_size" entry from the metadata hash.
#
# NOTE(review): the `metadata` receiver was stripped from this listing and is
# restored here — confirm against the source file.
#
# @return [Object] stored size, or 0 when the key is absent
def file_size
  metadata.dig("file_size") || 0
end
#file_size=(value) ⇒ Object
112 113 114 |
# File 'app/models/ragdoll/unified_content.rb', line 112
# Stores +value+ under the "file_size" key by assigning a merged copy of the
# metadata hash (other keys are kept).
#
# NOTE(review): the `metadata` identifiers were stripped from this listing and
# are restored here — confirm against the source file.
#
# @param value [Object] file size to store
def file_size=(value)
  self.metadata = metadata.merge("file_size" => value)
end
#generate_embeddings! ⇒ Object
Generate embeddings for this content
34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 |
# File 'app/models/ragdoll/unified_content.rb', line 34
# Chunks `content`, generates one embedding per chunk and records the
# generation time in the metadata hash.
#
# NOTE(review): several identifiers were stripped from this listing and are
# restored here: the method name (from the page heading), the
# `should_generate_embeddings?` guard, the `embeddings` association, the
# `embedding_service` local and the `metadata` receiver. The name of the
# method called on the service (`generate_embedding`) is NOT recoverable from
# the listing and is a best guess — confirm all of these against the source.
#
# A failure on one chunk is printed with `puts` and the remaining chunks are
# still processed.
#
# @return [Object, nil] result of `update!`, or nil when the guard fails
def generate_embeddings!
  return unless should_generate_embeddings?

  # Clear existing embeddings
  embeddings.destroy_all

  # Use TextChunker to split content into chunks
  chunks = Ragdoll::TextChunker.chunk(content)

  # Generate embeddings for each chunk
  embedding_service = Ragdoll::EmbeddingService.new

  chunks.each_with_index do |chunk_text, index|
    vector = embedding_service.generate_embedding(chunk_text)

    embeddings.create!(
      content: chunk_text,
      embedding_vector: vector,
      chunk_index: index
    )
  rescue StandardError => e
    puts "Failed to generate embedding for chunk #{index}: #{e.message}"
  end

  update!(metadata: metadata.merge("embeddings_generated_at" => Time.current))
end
#image_content? ⇒ Boolean
87 88 89 |
# File 'app/models/ragdoll/unified_content.rb', line 87
# @return [Boolean] true when the original media type is "image"
def image_content?
  media_type = original_media_type
  media_type == "image"
end
#image_dimensions ⇒ Object
133 134 135 136 137 138 139 |
# File 'app/models/ragdoll/unified_content.rb', line 133
# Combines #image_width and #image_height into one hash.
#
# @return [Hash, nil] `{ width:, height: }`, or nil when either value is missing
def image_dimensions
  w = image_width
  h = image_height
  { width: w, height: h } if w && h
end
#image_height ⇒ Object
129 130 131 |
# File 'app/models/ragdoll/unified_content.rb', line 129
# Reads the "height" entry from the metadata hash.
#
# NOTE(review): the `metadata` receiver was stripped from this listing and is
# restored here — confirm against the source file.
#
# @return [Object, nil] stored height, or nil when the key is absent
def image_height
  metadata.dig("height")
end
#image_width ⇒ Object
Image-specific metadata (for backward compatibility)
125 126 127 |
# File 'app/models/ragdoll/unified_content.rb', line 125
# Reads the "width" entry from the metadata hash.
#
# NOTE(review): the `metadata` receiver was stripped from this listing and is
# restored here — confirm against the source file.
#
# @return [Object, nil] stored width, or nil when the key is absent
def image_width
  metadata.dig("width")
end
#original_filename ⇒ Object
Original media metadata
100 101 102 |
# File 'app/models/ragdoll/unified_content.rb', line 100
# Reads the "original_filename" entry from the metadata hash.
#
# NOTE(review): the `metadata` receiver was stripped from this listing and is
# restored here — confirm against the source file.
#
# @return [Object, nil] stored filename, or nil when the key is absent
def original_filename
  metadata.dig("original_filename")
end
#original_filename=(value) ⇒ Object
104 105 106 |
# File 'app/models/ragdoll/unified_content.rb', line 104
# Stores +value+ under the "original_filename" key by assigning a merged copy
# of the metadata hash (other keys are kept).
#
# NOTE(review): the `metadata` identifiers were stripped from this listing and
# are restored here — confirm against the source file.
#
# @param value [Object] filename to store
def original_filename=(value)
  self.metadata = metadata.merge("original_filename" => value)
end
#should_generate_embeddings? ⇒ Boolean
Whether this content should generate embeddings
64 65 66 |
# File 'app/models/ragdoll/unified_content.rb', line 64
# Whether this content should generate embeddings: it has content and no
# embeddings yet.
#
# NOTE(review): the method name and the `embeddings` receiver were stripped
# from this listing; the name is taken from the page's method heading —
# confirm against the source file.
#
# @return [Boolean]
def should_generate_embeddings?
  content.present? && embeddings.empty?
end
#text_content? ⇒ Boolean
Media type specific accessors for backward compatibility
83 84 85 |
# File 'app/models/ragdoll/unified_content.rb', line 83
# @return [Boolean] true when the original media type is one of
#   text, markdown, html, pdf or docx
def text_content?
  case original_media_type
  when "text", "markdown", "html", "pdf", "docx" then true
  else false
  end
end
#video_content? ⇒ Boolean
95 96 97 |
# File 'app/models/ragdoll/unified_content.rb', line 95
# @return [Boolean] true when the original media type is "video"
def video_content?
  media_type = original_media_type
  media_type == "video"
end
#word_count ⇒ Object
Statistics
69 70 71 72 |
# File 'app/models/ragdoll/unified_content.rb', line 69
# Number of whitespace-separated words in `content`.
#
# Uses argument-less String#split, which ignores leading whitespace;
# `split(/\s+/)` returns a leading empty string for content such as " a b"
# and so counts one word too many.
#
# @return [Integer] word count, 0 when content is blank
def word_count
  return 0 unless content.present?

  content.split.length
end