Class: Ragdoll::UnifiedContent

Inherits:
ActiveRecord::Base
  • Object
show all
Defined in:
app/models/ragdoll/unified_content.rb

Overview

Unified content model for a text-based RAG system. All content types (text, image, audio, video) are converted to text and stored in a single content field for unified embedding generation.

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.search_content(query, **options) ⇒ Object

Search within this content type



188
189
190
191
192
193
194
195
# File 'app/models/ragdoll/unified_content.rb', line 188

# Full-text search over the +content+ column using PostgreSQL's English
# text-search functions.
#
# @param query [String, nil] search phrase; a blank query short-circuits to +none+
# @param options [Hash] only +:limit+ is read (falls back to 20)
# @return [Object] +none+ for a blank query, otherwise the limited +where+ result
def self.search_content(query, **options)
  if query.blank?
    none
  else
    max_rows = options[:limit] || 20
    where(
      "to_tsvector('english', COALESCE(content, '')) @@ plainto_tsquery('english', ?)",
      query
    ).limit(max_rows)
  end
end

.stats ⇒ Object

Get statistics for all content



198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
# File 'app/models/ragdoll/unified_content.rb', line 198

# Builds a summary hash of aggregate figures for the current scope.
#
# Word count is approximated in SQL as "number of spaces + 1"; the quality
# distribution buckets rows by character length (>1000, 100..1000, <100).
#
# @return [Hash] keys: :total_contents, :by_media_type, :by_model,
#   :total_embeddings, :with_embeddings, :without_embeddings,
#   :average_word_count, :average_character_count,
#   :content_quality_distribution
def self.stats
  word_count_sql = "LENGTH(content) - LENGTH(REPLACE(content, ' ', '')) + 1"
  char_count_sql = "LENGTH(content)"

  summary = {}
  summary[:total_contents] = count
  summary[:by_media_type] = group(:original_media_type).count
  summary[:by_model] = group(:embedding_model).count
  summary[:total_embeddings] = joins(:embeddings).count
  summary[:with_embeddings] = with_embeddings.count
  summary[:without_embeddings] = without_embeddings.count
  summary[:average_word_count] = average(word_count_sql)
  summary[:average_character_count] = average(char_count_sql)

  # Bucket rows by raw character length.
  buckets = {}
  buckets[:high] = where("LENGTH(content) > 1000").count
  buckets[:medium] = where("LENGTH(content) BETWEEN 100 AND 1000").count
  buckets[:low] = where("LENGTH(content) < 100").count
  summary[:content_quality_distribution] = buckets

  summary
end

Instance Method Details

#audio_content? ⇒ Boolean



91
92
93
# File 'app/models/ragdoll/unified_content.rb', line 91

# @return [Boolean] true when +original_media_type+ equals "audio"
def audio_content?
  "audio" == original_media_type
end

#audio_duration ⇒ Object

Audio-specific metadata



142
143
144
# File 'app/models/ragdoll/unified_content.rb', line 142

# Reads the "duration" entry from +metadata+.
#
# @return [Object, nil] the stored value, or nil when the key is absent
def audio_duration
  metadata.dig("duration")
end

#audio_duration=(value) ⇒ Object



146
147
148
# File 'app/models/ragdoll/unified_content.rb', line 146

# Stores +value+ under the "duration" key by assigning a merged copy of
# +metadata+ back to the attribute.
#
# @param value [Object] value to store
def audio_duration=(value)
  self.metadata = metadata.merge("duration" => value)
end

#character_count ⇒ Object



74
75
76
# File 'app/models/ragdoll/unified_content.rb', line 74

# @return [Integer] length of +content+, or 0 when +content+ is nil
def character_count
  text = content
  text.nil? ? 0 : text.length
end

#content_quality_score ⇒ Object

Content quality scoring



151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
# File 'app/models/ragdoll/unified_content.rb', line 151

# Heuristic quality score for this record's content.
#
# The score is the sum of three parts: a flat 0.3 for having any content,
# a word-count band factor weighted by 0.4, and a media-type factor weighted
# by 0.3. Image/audio/video content that contains the literal
# "Image file:" / "Audio file:" / "Video file:" gets a type factor of 0.3
# instead of 0.8 / 0.8 / 0.7.
#
# @return [Float] 0.0 for blank content, otherwise a value capped at 1.0
def content_quality_score
  return 0.0 if content.blank?

  # Flat credit for having any content at all.
  total = 0.0
  total += 0.3

  # Word-count band; 51-500 words scores highest.
  words = word_count
  if words > 0
    band =
      if words <= 10 then 0.1
      elsif words <= 50 then 0.5
      elsif words <= 500 then 1.0
      elsif words <= 2000 then 0.9
      elsif words <= 5000 then 0.7
      else 0.5
      end
    total += band * 0.4
  end

  # Media-type factor.
  media = original_media_type
  marker = { "image" => "Image file:", "audio" => "Audio file:", "video" => "Video file:" }[media]
  kind_factor =
    if %w[text markdown].include?(media)
      1.0
    elsif %w[pdf docx html].include?(media)
      0.9
    elsif marker
      if content.include?(marker)
        0.3
      else
        media == "video" ? 0.7 : 0.8
      end
    else
      0.5
    end
  total += kind_factor * 0.3

  # Never report more than 1.0.
  total > 1.0 ? 1.0 : total
end

#conversion_method ⇒ Object



116
117
118
# File 'app/models/ragdoll/unified_content.rb', line 116

# Reads the "conversion_method" entry from +metadata+.
#
# @return [Object, nil] the stored value, or nil when the key is absent
def conversion_method
  metadata.dig("conversion_method")
end

#conversion_method=(value) ⇒ Object



120
121
122
# File 'app/models/ragdoll/unified_content.rb', line 120

# Stores +value+ under the "conversion_method" key by assigning a merged
# copy of +metadata+ back to the attribute.
#
# @param value [Object] value to store
def conversion_method=(value)
  self.metadata = metadata.merge("conversion_method" => value)
end

#embedding_count ⇒ Object



78
79
80
# File 'app/models/ragdoll/unified_content.rb', line 78

# @return [Object] whatever +count+ returns for the +embeddings+ collection
def embedding_count
  stored = embeddings
  stored.count
end

#file_size ⇒ Object



108
109
110
# File 'app/models/ragdoll/unified_content.rb', line 108

# Reads the "file_size" entry from +metadata+.
#
# @return [Object] the stored value, or 0 when the key is absent or nil
def file_size
  metadata.dig("file_size") || 0
end

#file_size=(value) ⇒ Object



112
113
114
# File 'app/models/ragdoll/unified_content.rb', line 112

# Stores +value+ under the "file_size" key by assigning a merged copy of
# +metadata+ back to the attribute.
#
# @param value [Object] value to store
def file_size=(value)
  self.metadata = metadata.merge("file_size" => value)
end

#generate_embeddings! ⇒ Object

Generate embeddings for this content



34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# File 'app/models/ragdoll/unified_content.rb', line 34

# Splits +content+ into chunks, generates an embedding per chunk, and stores
# one embedding record per chunk. Returns early when
# +should_generate_embeddings?+ is false.
#
# A chunk whose embedding generation or save raises StandardError is
# reported with +puts+ and skipped; the remaining chunks are still
# processed. After the loop, "embeddings_generated_at" is merged into
# +metadata+ and saved, regardless of how many chunks succeeded.
#
# @return [Object, nil] nil when skipped, otherwise the result of +update!+
def generate_embeddings!
  return unless should_generate_embeddings?

  # Clear existing embeddings
  embeddings.destroy_all

  # Use TextChunker to split content into chunks
  chunks = Ragdoll::TextChunker.chunk(content)

  # Generate embeddings for each chunk
  embedding_service = Ragdoll::EmbeddingService.new

  chunks.each_with_index do |chunk_text, index|
    begin
      vector = embedding_service.generate_embedding(chunk_text)

      embeddings.create!(
        content: chunk_text,
        embedding_vector: vector,
        chunk_index: index
      )
    rescue StandardError => e
      # Best effort: one failing chunk must not abort the rest.
      puts "Failed to generate embedding for chunk #{index}: #{e.message}"
    end
  end

  # Record when generation ran alongside the existing metadata entries.
  update!(metadata: metadata.merge("embeddings_generated_at" => Time.current))
end

#image_content? ⇒ Boolean



87
88
89
# File 'app/models/ragdoll/unified_content.rb', line 87

# @return [Boolean] true when +original_media_type+ equals "image"
def image_content?
  "image" == original_media_type
end

#image_dimensions ⇒ Object



133
134
135
136
137
138
139
# File 'app/models/ragdoll/unified_content.rb', line 133

# Combines +image_width+ and +image_height+ into a single hash.
#
# @return [Hash, nil] +{ width:, height: }+, or nil unless both values are truthy
def image_dimensions
  w = image_width
  h = image_height

  { width: w, height: h } if w && h
end

#image_height ⇒ Object



129
130
131
# File 'app/models/ragdoll/unified_content.rb', line 129

# Reads the "height" entry from +metadata+.
#
# @return [Object, nil] the stored value, or nil when the key is absent
def image_height
  metadata.dig("height")
end

#image_width ⇒ Object

Image-specific metadata (for backward compatibility)



125
126
127
# File 'app/models/ragdoll/unified_content.rb', line 125

# Reads the "width" entry from +metadata+.
#
# @return [Object, nil] the stored value, or nil when the key is absent
def image_width
  metadata.dig("width")
end

#original_filename ⇒ Object

Original media metadata



100
101
102
# File 'app/models/ragdoll/unified_content.rb', line 100

# Reads the "original_filename" entry from +metadata+.
#
# @return [Object, nil] the stored value, or nil when the key is absent
def original_filename
  metadata.dig("original_filename")
end

#original_filename=(value) ⇒ Object



104
105
106
# File 'app/models/ragdoll/unified_content.rb', line 104

# Stores +value+ under the "original_filename" key by assigning a merged
# copy of +metadata+ back to the attribute.
#
# @param value [Object] value to store
def original_filename=(value)
  self.metadata = metadata.merge("original_filename" => value)
end

#should_generate_embeddings? ⇒ Boolean

Whether this content should generate embeddings



64
65
66
# File 'app/models/ragdoll/unified_content.rb', line 64

# Embeddings are generated only for records that have content and no
# embeddings yet.
#
# @return [Boolean] false when +content+ is not present, otherwise
#   whether +embeddings+ is empty
def should_generate_embeddings?
  return false unless content.present?

  embeddings.empty?
end

#text_content? ⇒ Boolean

Media type specific accessors for backward compatibility



83
84
85
# File 'app/models/ragdoll/unified_content.rb', line 83

# @return [Boolean] true when +original_media_type+ is one of
#   "text", "markdown", "html", "pdf" or "docx"
def text_content?
  case original_media_type
  when "text", "markdown", "html", "pdf", "docx" then true
  else false
  end
end

#video_content? ⇒ Boolean



95
96
97
# File 'app/models/ragdoll/unified_content.rb', line 95

# @return [Boolean] true when +original_media_type+ equals "video"
def video_content?
  "video" == original_media_type
end

#word_count ⇒ Object

Statistics



69
70
71
72
# File 'app/models/ragdoll/unified_content.rb', line 69

# Counts whitespace-separated words in +content+.
#
# @return [Integer] number of words; 0 when +content+ is not present
def word_count
  return 0 unless content.present?

  # Argument-less split ignores leading whitespace, so content such as
  # " hello" counts as one word; splitting on /\s+/ would yield a leading
  # empty token and over-count by one.
  content.split.size
end