Class: Ragdoll::DocumentProcessor

Inherits:

Object

Object
Ragdoll::DocumentProcessor

show all

Defined in: app/services/ragdoll/document_processor.rb

Defined Under Namespace

Classes: ParseError, UnsupportedFormatError

Class Method Summary collapse

.create_document_from_file(file_path, **options) ⇒ Object

Create document from file path.
.create_document_from_upload(uploaded_file, **options) ⇒ Object

Create document from uploaded file (Shrine compatible).
.determine_content_type(file_path) ⇒ Object
.determine_document_type(file_path) ⇒ Object

Determine the document type for a file path by delegating to Ragdoll::DocumentConverter (first of the document-type helper methods).
.determine_document_type_from_content_type(content_type) ⇒ Object
.parse(file_path) ⇒ Object
.parse_attachment(attached_file) ⇒ Object

Parse from Shrine attached file.

Instance Method Summary collapse

#initialize(file_path, attached_file = nil) ⇒ DocumentProcessor constructor

A new instance of DocumentProcessor.
#parse ⇒ Object

Constructor Details

#initialize(file_path, attached_file = nil) ⇒ `DocumentProcessor`

Returns a new instance of DocumentProcessor.

# File 'app/services/ragdoll/document_processor.rb', line 80

# Builds a processor bound to a single file.
#
# @param file_path [String] path of the file to parse; its extension is
#   remembered lower-cased in @file_extension
# @param attached_file [Object, nil] attachment the path was obtained from,
#   stored untouched (nil when a plain path is being parsed)
def initialize(file_path, attached_file = nil)
  @file_path, @attached_file = file_path, attached_file
  @file_extension = File.extname(@file_path).downcase
end

Class Method Details

.create_document_from_file(file_path, **options) ⇒ `Object`

Create document from file path

# File 'app/services/ragdoll/document_processor.rb', line 26

# Parses the file at +file_path+ and creates a Ragdoll::Document from the result.
#
# Parsed values take precedence; the file name without its extension and
# determine_document_type serve as fall-backs for the title and document type.
# +options+ is splatted last, so its entries override the attributes built here.
#
# @param file_path [String] path of the file to import
# @param options [Hash] extra attributes forwarded to Ragdoll::Document.create!
# @return [Object] the created document, with the file assigned to #file
# @raise [ParseError] propagated from parse (for example when the file is missing)
def self.create_document_from_file(file_path, **options)
  parsed = parse(file_path)

  # Fall back to "now" when the file disappeared after parsing.
  file_modified_at = File.exist?(file_path) ? File.mtime(file_path) : Time.current

  document = Ragdoll::Document.create!(
    location: File.expand_path(file_path),
    title: parsed[:title] || File.basename(file_path, File.extname(file_path)),
    content: parsed[:content],
    document_type: parsed[:document_type] || determine_document_type(file_path),
    metadata: parsed[:metadata] || {},
    status: "processed",
    file_modified_at: file_modified_at,
    **options
  )

  # Attach the file if it still exists. The block form closes the handle once
  # the assignment returns (the bare File.open used before leaked it), and
  # binary mode keeps PDFs, images, etc. byte-exact on every platform.
  # NOTE(review): assumes the attachment setter consumes the IO during
  # assignment (Shrine uploads on assign) — confirm for custom attachers.
  # NOTE(review): the attachment is assigned after create! and not saved here —
  # confirm callers persist the document afterwards.
  if File.exist?(file_path)
    File.open(file_path, "rb") { |io| document.file = io }
  end

  document
end

.create_document_from_upload(uploaded_file, **options) ⇒ `Object`

Create document from uploaded file (Shrine compatible)

# File 'app/services/ragdoll/document_processor.rb', line 50

# Creates a Ragdoll::Document for an uploaded file (Shrine compatible), attaches
# the upload and then fills in the content parsed from the attachment.
#
# The record is first created with empty content and status "processing"; once
# the attachment is present it is updated with the parsed content, title and
# merged metadata and marked "processed".
#
# @param uploaded_file [#original_filename, #mime_type] the uploaded file
# @param options [Hash] only :title and :metadata are read
# @return [Object] the created document
def self.create_document_from_upload(uploaded_file, **options)
  original_name = uploaded_file.original_filename
  location = original_name || "uploaded_file"

  document = Ragdoll::Document.create!(
    location: location,
    title: options[:title] || File.basename(location, File.extname(original_name || "")),
    content: "", # Replaced once the attachment has been parsed
    document_type: determine_document_type_from_content_type(uploaded_file.mime_type),
    status: "processing",
    metadata: options[:metadata] || {},
    file_modified_at: Time.current
  )

  document.file = uploaded_file
  return document unless document.file.present?

  # Pull text, title and metadata out of the stored attachment.
  extracted = parse_attachment(document.file)
  document.update!(
    content: extracted[:content],
    title: extracted[:title] || document.title,
    metadata: document.metadata.merge(extracted[:metadata] || {}),
    status: "processed"
  )

  document
end

.determine_content_type(file_path) ⇒ `Object`

# File 'app/services/ragdoll/document_processor.rb', line 141

# Maps a file's extension (case-insensitive) to a MIME content type.
#
# @param file_path [String] path or file name whose extension is inspected
# @return [String] the matching content type, or "application/octet-stream"
#   when the extension is not in the table
def self.determine_content_type(file_path)
  content_types = {
    # documents
    ".pdf" => "application/pdf",
    ".docx" => "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    ".txt" => "text/plain",
    ".md" => "text/markdown",
    ".markdown" => "text/markdown",
    ".html" => "text/html",
    ".htm" => "text/html",
    # images
    ".jpg" => "image/jpeg",
    ".jpeg" => "image/jpeg",
    ".png" => "image/png",
    ".gif" => "image/gif",
    ".webp" => "image/webp",
    ".bmp" => "image/bmp",
    ".svg" => "image/svg+xml",
    ".ico" => "image/x-icon",
    ".tiff" => "image/tiff",
    ".tif" => "image/tiff",
    # audio
    ".mp3" => "audio/mpeg",
    ".wav" => "audio/wav",
    ".m4a" => "audio/mp4",
    ".flac" => "audio/flac",
    ".ogg" => "audio/ogg",
    # video
    ".mp4" => "video/mp4",
    ".mov" => "video/quicktime",
    ".avi" => "video/x-msvideo",
    ".webm" => "video/webm"
  }

  extension = File.extname(file_path).downcase
  content_types.fetch(extension, "application/octet-stream")
end

.determine_document_type(file_path) ⇒ `Object`

Determine the document type for a file path by delegating to Ragdoll::DocumentConverter (first of the document-type helper methods).



123
124
125

# File 'app/services/ragdoll/document_processor.rb', line 123

# Determines the document type of a file by asking a fresh
# Ragdoll::DocumentConverter instance.
#
# @param file_path [String] path of the file to classify
# @return [Object] whatever DocumentConverter#determine_document_type returns
def self.determine_document_type(file_path)
  converter = Ragdoll::DocumentConverter.new
  converter.determine_document_type(file_path)
end

.determine_document_type_from_content_type(content_type) ⇒ `Object`

# File 'app/services/ragdoll/document_processor.rb', line 127

# Maps a MIME content type to the document type name used for documents.
#
# The value is normalized before matching: parameters after ";" are dropped
# (e.g. "text/html; charset=utf-8"), surrounding whitespace is stripped and the
# media type is lower-cased, since MIME type names are case-insensitive.
# Without this, such values fell through to the "text" default.
#
# @param content_type [String, nil] MIME content type, optionally with parameters
# @return [String] one of "pdf", "docx", "text", "markdown", "html", "image",
#   "audio" or "video"; "text" for nil, blank or unrecognized types
def self.determine_document_type_from_content_type(content_type)
  media_type = content_type.to_s.split(";", 2).first.to_s.strip.downcase

  case media_type
  when "application/pdf" then "pdf"
  when "application/vnd.openxmlformats-officedocument.wordprocessingml.document" then "docx"
  when "text/plain" then "text"
  when "text/markdown" then "markdown"
  when "text/html" then "html"
  # \A anchors at the start of the string; ^ would match after any newline.
  when %r{\Aimage/} then "image"
  when %r{\Aaudio/} then "audio"
  when %r{\Avideo/} then "video"
  else "text"
  end
end

.parse(file_path) ⇒ `Object`



14
15
16

# File 'app/services/ragdoll/document_processor.rb', line 14

# Convenience entry point: builds a processor for +file_path+ and parses it.
#
# @param file_path [String] path of the file to parse
# @return [Object] the result of the instance-level #parse
def self.parse(file_path)
  processor = new(file_path)
  processor.parse
end

.parse_attachment(attached_file) ⇒ `Object`

Parse from Shrine attached file

# File 'app/services/ragdoll/document_processor.rb', line 19

# Parses the content of a Shrine attached file.
#
# The attachment is opened and, when the yielded IO exposes a filesystem path,
# parsed straight from that path. Per Shrine's documentation, #open on
# non-filesystem storages (e.g. S3) yields a streaming IO that has no #path;
# in that case the attachment is copied to a temporary file via #download and
# parsed from there instead of failing with NoMethodError.
#
# @param attached_file [#open, #download] the attachment to parse
# @return [Object] the result of #parse for the attachment's content
def self.parse_attachment(attached_file)
  attached_file.open do |io|
    local_path = io.path if io.respond_to?(:path)

    if local_path
      new(local_path, attached_file).parse
    else
      # No on-disk path available: materialize the content as a tempfile.
      attached_file.download do |tempfile|
        new(tempfile.path, attached_file).parse
      end
    end
  end
end

Instance Method Details

#parse ⇒ `Object`

# File 'app/services/ragdoll/document_processor.rb', line 86

# Converts the file at @file_path to text and gathers its metadata.
#
# For "text", "markdown" and "html" documents the detected encoding (or
# "UTF-8" when none is detected) is recorded under metadata[:encoding].
#
# @return [Hash] with keys :content, :metadata, :title and :document_type
# @raise [ParseError] when the file does not exist, or wrapping any
#   StandardError raised during conversion or metadata extraction
def parse
  raise ParseError, "File does not exist: #{@file_path}" unless File.exist?(@file_path)

  # Determined outside the rescue below, so failures here are not wrapped.
  kind = determine_document_type(@file_path)

  begin
    text = Ragdoll::DocumentConverter.convert_to_text(@file_path, kind)
    details = extract_metadata_for_type(kind)

    if %w[text markdown html].include?(kind)
      details[:encoding] = detect_file_encoding(@file_path) || "UTF-8"
    end

    {
      content: text,
      metadata: details,
      # Metadata title wins; otherwise derive one from the file path.
      title: details[:title] || extract_title_from_filepath,
      document_type: kind
    }
  rescue StandardError => error
    raise ParseError, "Failed to parse document: #{error.message}"
  end
end

Class: Ragdoll::DocumentProcessor

Defined Under Namespace

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(file_path, attached_file = nil) ⇒ DocumentProcessor

Class Method Details

.create_document_from_file(file_path, **options) ⇒ Object

.create_document_from_upload(uploaded_file, **options) ⇒ Object

.determine_content_type(file_path) ⇒ Object

.determine_document_type(file_path) ⇒ Object

.determine_document_type_from_content_type(content_type) ⇒ Object

.parse(file_path) ⇒ Object

.parse_attachment(attached_file) ⇒ Object