Class: Ragdoll::UnifiedContent

Inherits:

ActiveRecord::Base

Object
ActiveRecord::Base
Ragdoll::UnifiedContent

show all

Defined in: app/models/ragdoll/unified_content.rb

Overview

Unified content model for a text-based RAG system. All content types (text, image, audio, video) are converted to text and stored in a single content field for unified embedding generation.

Class Method Summary collapse

.search_content(query, **options) ⇒ Object

Search within this content type.
.stats ⇒ Object

Get statistics for all content.

Instance Method Summary collapse

#audio_content? ⇒ Boolean
#audio_duration ⇒ Object

Audio-specific metadata.
#audio_duration=(value) ⇒ Object
#character_count ⇒ Object
#content_quality_score ⇒ Object

Content quality scoring.
#conversion_method ⇒ Object
#conversion_method=(value) ⇒ Object
#embedding_count ⇒ Object
#file_size ⇒ Object
#file_size=(value) ⇒ Object
#generate_embeddings! ⇒ Object

Generate embeddings for this content.
#image_content? ⇒ Boolean
#image_dimensions ⇒ Object
#image_height ⇒ Object
#image_width ⇒ Object

Image-specific metadata (for backward compatibility).
#original_filename ⇒ Object

Original media metadata.
#original_filename=(value) ⇒ Object
#should_generate_embeddings? ⇒ Boolean

Whether this content should generate embeddings.
#text_content? ⇒ Boolean

Media type specific accessors for backward compatibility.
#video_content? ⇒ Boolean
#word_count ⇒ Object

Statistics.

Class Method Details

.search_content(query, **options) ⇒ `Object`

Search within this content type

# File 'app/models/ragdoll/unified_content.rb', line 188

# Full-text search over the +content+ column.
#
# Rows are matched with the SQL +to_tsvector+ / +plainto_tsquery+ functions
# using the 'english' configuration; a NULL content is coalesced to ''.
#
# @param query [String] search phrase; a blank query short-circuits to +none+
# @param options [Hash] only +:limit+ is read (20 is used when it is absent or nil)
# @return [Object] the result of +where(...).limit(...)+, or +none+ for a blank query
def self.search_content(query, **options)
  return none if query.blank?

  max_rows = options[:limit] || 20
  match_clause = "to_tsvector('english', COALESCE(content, '')) @@ plainto_tsquery('english', ?)"

  where(match_clause, query).limit(max_rows)
end

.stats ⇒ `Object`

Get statistics for all content

# File 'app/models/ragdoll/unified_content.rb', line 198

# Collects aggregate figures about all content rows.
#
# Each entry is computed by its own query, issued in the order the keys are
# listed below. The word-count average is an SQL approximation (number of
# space characters plus one), and the "quality" buckets are based purely on
# character length of +content+.
#
# @return [Hash] keys: :total_contents, :by_media_type, :by_model,
#   :total_embeddings, :with_embeddings, :without_embeddings,
#   :average_word_count, :average_character_count, :content_quality_distribution
def self.stats
  words_sql = "LENGTH(content) - LENGTH(REPLACE(content, ' ', '')) + 1"
  chars_sql = "LENGTH(content)"

  summary = {}
  summary[:total_contents] = count
  summary[:by_media_type] = group(:original_media_type).count
  summary[:by_model] = group(:embedding_model).count
  summary[:total_embeddings] = joins(:embeddings).count
  summary[:with_embeddings] = with_embeddings.count
  summary[:without_embeddings] = without_embeddings.count
  summary[:average_word_count] = average(words_sql)
  summary[:average_character_count] = average(chars_sql)
  summary[:content_quality_distribution] = {
    high: where("LENGTH(content) > 1000").count,
    medium: where("LENGTH(content) BETWEEN 100 AND 1000").count,
    low: where("LENGTH(content) < 100").count
  }
  summary
end

Instance Method Details

#audio_content? ⇒ `Boolean`



91
92
93

# File 'app/models/ragdoll/unified_content.rb', line 91

# Tells whether this record's +original_media_type+ is "audio".
#
# @return [Boolean]
def audio_content?
  original_media_type.eql?("audio")
end

#audio_duration ⇒ `Object`

Audio-specific metadata



142
143
144

# File 'app/models/ragdoll/unified_content.rb', line 142

# Reads the "duration" entry from the +metadata+ hash.
#
# @return [Object, nil] the stored value, or nil when the key is absent
def audio_duration
  metadata["duration"]
end

#audio_duration=(value) ⇒ `Object`



146
147
148

# File 'app/models/ragdoll/unified_content.rb', line 146

# Stores +value+ under the "duration" key of +metadata+.
#
# A merged copy of the hash is assigned back through +metadata=+ rather than
# mutating the existing hash in place.
#
# @param value [Object] value to record
# @return [Hash] the merged metadata hash that was assigned
def audio_duration=(value)
  merged = metadata.merge("duration" => value)
  self.metadata = merged
end

#character_count ⇒ `Object`



74
75
76

# File 'app/models/ragdoll/unified_content.rb', line 74

# Length of +content+ in characters.
#
# @return [Integer] 0 when content is nil
def character_count
  text = content
  text.nil? ? 0 : text.length
end

#content_quality_score ⇒ `Object`

Content quality scoring

# File 'app/models/ragdoll/unified_content.rb', line 151

# Heuristic quality score for this content, in the range 0.0..1.0.
#
# The score is the sum of three parts:
# * a fixed 0.3 for having any non-blank content,
# * up to 0.4 based on +word_count+ (51-500 words scores highest),
# * up to 0.3 based on +original_media_type+; image/audio/video content that
#   contains an "Image file:" / "Audio file:" / "Video file:" marker scores
#   lower than content without the marker.
#
# @return [Float] 0.0 for blank content, otherwise a value capped at 1.0
def content_quality_score
  return 0.0 if content.blank?

  # Baseline awarded to every non-blank record.
  total = 0.3

  # Length component: look up the weight of the band the word count falls in.
  words = word_count
  if words > 0
    bands = [
      [0..10, 0.1],
      [11..50, 0.5],
      [51..500, 1.0],
      [501..2000, 0.9],
      [2001..5000, 0.7]
    ]
    matched = bands.find { |range, _weight| range === words }
    length_component = matched ? matched.last : 0.5
    total += length_component * 0.4
  end

  # Media-type component.
  media = original_media_type
  marker_scores = {
    "image" => ["Image file:", 0.8],
    "audio" => ["Audio file:", 0.8],
    "video" => ["Video file:", 0.7]
  }
  type_component =
    if %w[text markdown].include?(media)
      1.0
    elsif %w[pdf docx html].include?(media)
      0.9
    elsif marker_scores.key?(media)
      marker, without_marker = marker_scores[media]
      content.include?(marker) ? 0.3 : without_marker
    else
      0.5
    end
  total += type_component * 0.3

  # Never report more than 1.0.
  total > 1.0 ? 1.0 : total
end

#conversion_method ⇒ `Object`



116
117
118

# File 'app/models/ragdoll/unified_content.rb', line 116

# Reads the "conversion_method" entry from the +metadata+ hash.
#
# @return [Object, nil] the stored value, or nil when the key is absent
def conversion_method
  metadata["conversion_method"]
end

#conversion_method=(value) ⇒ `Object`



120
121
122

# File 'app/models/ragdoll/unified_content.rb', line 120

# Stores +value+ under the "conversion_method" key of +metadata+.
#
# A merged copy of the hash is assigned back through +metadata=+ rather than
# mutating the existing hash in place.
#
# @param value [Object] value to record
# @return [Hash] the merged metadata hash that was assigned
def conversion_method=(value)
  merged = metadata.merge("conversion_method" => value)
  self.metadata = merged
end

#embedding_count ⇒ `Object`



78
79
80

# File 'app/models/ragdoll/unified_content.rb', line 78

# Number of embeddings attached to this content, obtained by calling
# +count+ on the +embeddings+ collection.
#
# @return [Integer]
def embedding_count
  attached = embeddings
  attached.count
end

#file_size ⇒ `Object`



108
109
110

# File 'app/models/ragdoll/unified_content.rb', line 108

# Reads the "file_size" entry from the +metadata+ hash.
#
# @return [Object] the stored value, or 0 when the key is absent or its value is nil/false
def file_size
  recorded = metadata["file_size"]
  recorded || 0
end

#file_size=(value) ⇒ `Object`



112
113
114

# File 'app/models/ragdoll/unified_content.rb', line 112

# Stores +value+ under the "file_size" key of +metadata+.
#
# A merged copy of the hash is assigned back through +metadata=+ rather than
# mutating the existing hash in place.
#
# @param value [Object] value to record
# @return [Hash] the merged metadata hash that was assigned
def file_size=(value)
  merged = metadata.merge("file_size" => value)
  self.metadata = merged
end

#generate_embeddings! ⇒ `Object`

Generate embeddings for this content

# File 'app/models/ragdoll/unified_content.rb', line 34

# Splits +content+ into chunks and stores one embedding record per chunk.
#
# Does nothing unless +should_generate_embeddings?+ is true. Existing
# embeddings are destroyed first, then each chunk returned by
# +Ragdoll::TextChunker.chunk+ is embedded via +Ragdoll::EmbeddingService+
# and persisted with its position as +chunk_index+.
#
# A failure on an individual chunk is rescued and printed with +puts+; the
# remaining chunks are still processed. The "embeddings_generated_at"
# metadata timestamp is written after the loop regardless of how many chunks
# succeeded.
#
# @return [Object, nil] nil when generation is skipped, otherwise the result of +update!+
# @raise whatever +update!+ or the calls made before the per-chunk loop raise
def generate_embeddings!
  return unless should_generate_embeddings?

  # Start from a clean slate.
  embeddings.destroy_all

  pieces = Ragdoll::TextChunker.chunk(content)
  embedder = Ragdoll::EmbeddingService.new

  pieces.each_with_index do |piece, index|
    embeddings.create!(
      content: piece,
      embedding_vector: embedder.generate_embedding(piece),
      chunk_index: index
    )
  rescue StandardError => e
    # Best effort: report the failed chunk and carry on with the next one.
    puts "Failed to generate embedding for chunk #{index}: #{e.message}"
  end

  stamped = metadata.merge("embeddings_generated_at" => Time.current)
  update!(metadata: stamped)
end

#image_content? ⇒ `Boolean`



87
88
89

# File 'app/models/ragdoll/unified_content.rb', line 87

# Tells whether this record's +original_media_type+ is "image".
#
# @return [Boolean]
def image_content?
  original_media_type.eql?("image")
end

#image_dimensions ⇒ `Object`

# File 'app/models/ragdoll/unified_content.rb', line 133

# Combines +image_width+ and +image_height+ into a single hash.
#
# @return [Hash, nil] +{ width:, height: }+ when both values are present,
#   nil when either one is missing
def image_dimensions
  dims = { width: image_width, height: image_height }
  dims if dims.values.all?
end

#image_height ⇒ `Object`



129
130
131

# File 'app/models/ragdoll/unified_content.rb', line 129

# Reads the "height" entry from the +metadata+ hash.
#
# @return [Object, nil] the stored value, or nil when the key is absent
def image_height
  metadata["height"]
end

#image_width ⇒ `Object`

Image-specific metadata (for backward compatibility)



125
126
127

# File 'app/models/ragdoll/unified_content.rb', line 125

# Reads the "width" entry from the +metadata+ hash.
#
# @return [Object, nil] the stored value, or nil when the key is absent
def image_width
  metadata["width"]
end

#original_filename ⇒ `Object`

Original media metadata



100
101
102

# File 'app/models/ragdoll/unified_content.rb', line 100

# Reads the "original_filename" entry from the +metadata+ hash.
#
# @return [Object, nil] the stored value, or nil when the key is absent
def original_filename
  metadata["original_filename"]
end

#original_filename=(value) ⇒ `Object`



104
105
106

# File 'app/models/ragdoll/unified_content.rb', line 104

# Stores +value+ under the "original_filename" key of +metadata+.
#
# A merged copy of the hash is assigned back through +metadata=+ rather than
# mutating the existing hash in place.
#
# @param value [Object] value to record
# @return [Hash] the merged metadata hash that was assigned
def original_filename=(value)
  merged = metadata.merge("original_filename" => value)
  self.metadata = merged
end

#should_generate_embeddings? ⇒ `Boolean`

Whether this content should generate embeddings



64
65
66

# File 'app/models/ragdoll/unified_content.rb', line 64

# Decides whether embeddings need to be generated for this record.
#
# @return [Boolean] true only when +content+ is present and the
#   +embeddings+ collection is empty
def should_generate_embeddings?
  return false unless content.present?

  embeddings.empty?
end

#text_content? ⇒ `Boolean`

Media type specific accessors for backward compatibility



83
84
85

# File 'app/models/ragdoll/unified_content.rb', line 83

# Tells whether +original_media_type+ is one of the text-like types:
# "text", "markdown", "html", "pdf" or "docx".
#
# @return [Boolean]
def text_content?
  case original_media_type
  when "text", "markdown", "html", "pdf", "docx" then true
  else false
  end
end

#video_content? ⇒ `Boolean`



95
96
97

# File 'app/models/ragdoll/unified_content.rb', line 95

# Tells whether this record's +original_media_type+ is "video".
#
# @return [Boolean]
def video_content?
  original_media_type.eql?("video")
end

#word_count ⇒ `Object`

Statistics

# File 'app/models/ragdoll/unified_content.rb', line 69

# Number of whitespace-separated words in +content+.
#
# Splitting uses the awk-style +split(" ")+ form, which ignores leading
# whitespace. Splitting on /\s+/ instead would produce an empty first token
# for content that starts with whitespace and over-count by one.
#
# @return [Integer] 0 when content is nil, empty, or whitespace only
def word_count
  return 0 if content.nil?

  content.split(" ").length
end

Class: Ragdoll::UnifiedContent

Overview

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.search_content(query, **options) ⇒ Object

.stats ⇒ Object

Instance Method Details

#audio_content? ⇒ Boolean

#audio_duration ⇒ Object

#audio_duration=(value) ⇒ Object

#character_count ⇒ Object

#content_quality_score ⇒ Object

#conversion_method ⇒ Object

#conversion_method=(value) ⇒ Object

#embedding_count ⇒ Object

#file_size ⇒ Object

#file_size=(value) ⇒ Object

#generate_embeddings! ⇒ Object

#image_content? ⇒ Boolean

#image_dimensions ⇒ Object

#image_height ⇒ Object

#image_width ⇒ Object

#original_filename ⇒ Object

#original_filename=(value) ⇒ Object

#should_generate_embeddings? ⇒ Boolean

#text_content? ⇒ Boolean

#video_content? ⇒ Boolean

#word_count ⇒ Object