Class: Ragdoll::DocumentProcessor

Inherits:
Object
  • Object
show all
Defined in:
app/services/ragdoll/document_processor.rb

Defined Under Namespace

Classes: ParseError, UnsupportedFormatError

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(file_path, attached_file = nil) ⇒ DocumentProcessor

Returns a new instance of DocumentProcessor.



80
81
82
83
84
# File 'app/services/ragdoll/document_processor.rb', line 80

# Builds a processor bound to a single file.
#
# @param file_path [String] path of the file to be parsed
# @param attached_file [Object, nil] optional attachment the path belongs to;
#   stored as-is
def initialize(file_path, attached_file = nil)
  @file_path, @attached_file = file_path, attached_file

  # Lower-cased extension including the leading dot ("" when there is none).
  @file_extension = File.extname(@file_path).downcase
end

Class Method Details

.create_document_from_file(file_path, **options) ⇒ Object

Create document from file path



26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# File 'app/services/ragdoll/document_processor.rb', line 26

# Parses the file at +file_path+, creates a Ragdoll::Document from the parsed
# result and attaches the file to it.
#
# @param file_path [String] path of the file to import
# @param options [Hash] extra attributes passed to Ragdoll::Document.create!;
#   they are splatted last, so they override the derived attributes
# @return [Ragdoll::Document] the created document
def self.create_document_from_file(file_path, **options)
  parsed = parse(file_path)

  # Fall back to "now" when the file cannot be stat'ed at this point.
  file_modified_at = File.exist?(file_path) ? File.mtime(file_path) : Time.current

  document = Ragdoll::Document.create!(
    location: File.expand_path(file_path),
    title: parsed[:title] || File.basename(file_path, File.extname(file_path)),
    content: parsed[:content],
    document_type: parsed[:document_type] || determine_document_type(file_path),
    metadata: parsed[:metadata] || {},
    status: "processed",
    file_modified_at: file_modified_at,
    **options
  )

  # Attach the file if it exists. The block form closes the handle once the
  # assignment returns, so no file descriptor is leaked per import.
  # NOTE(review): assumes the attachment setter consumes the IO during
  # assignment (Shrine copies to cache storage on assign) — confirm.
  File.open(file_path) { |io| document.file = io } if File.exist?(file_path)

  document
end

.create_document_from_upload(uploaded_file, **options) ⇒ Object

Create document from uploaded file (Shrine compatible)



50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# File 'app/services/ragdoll/document_processor.rb', line 50

# Creates a document from an uploaded file, attaches the upload, then fills in
# content, title and metadata parsed from the attachment.
#
# @param uploaded_file [Object] upload responding to +original_filename+ and
#   +mime_type+
# @param options [Hash] only +:title+ and +:metadata+ are read
# @return [Ragdoll::Document] the created document
def self.create_document_from_upload(uploaded_file, **options)
  # Uploads without a client-side name fall back to a fixed placeholder.
  filename = uploaded_file.original_filename || "uploaded_file"

  # Create the record first; content is extracted after the file is attached.
  document = Ragdoll::Document.create!(
    location: filename,
    title: options[:title] || File.basename(filename, File.extname(filename)),
    content: "",
    document_type: determine_document_type_from_content_type(uploaded_file.mime_type),
    status: "processing",
    metadata: options[:metadata] || {},
    file_modified_at: Time.current
  )

  document.file = uploaded_file

  if document.file.present?
    parsed = parse_attachment(document.file)
    document.update!(
      content: parsed[:content],
      title: parsed[:title] || document.title,
      # Parsed metadata is layered over the caller-supplied metadata.
      metadata: (document.metadata || {}).merge(parsed[:metadata] || {}),
      status: "processed"
    )
  end

  document
end

.determine_content_type(file_path) ⇒ Object



141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
# File 'app/services/ragdoll/document_processor.rb', line 141

# Maps the extension of +file_path+ (case-insensitive) to a MIME type string.
#
# @param file_path [String] file name or path
# @return [String] the MIME type, or "application/octet-stream" for an
#   unrecognized or missing extension
def self.determine_content_type(file_path)
  extension = File.extname(file_path).downcase

  case extension
  # Documents and text
  when ".pdf"
    "application/pdf"
  when ".docx"
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
  when ".txt"
    "text/plain"
  when ".md", ".markdown"
    "text/markdown"
  when ".html", ".htm"
    "text/html"
  # Images
  when ".jpg", ".jpeg"
    "image/jpeg"
  when ".png"
    "image/png"
  when ".gif"
    "image/gif"
  when ".webp"
    "image/webp"
  when ".bmp"
    "image/bmp"
  when ".svg"
    "image/svg+xml"
  when ".ico"
    "image/x-icon"
  when ".tiff", ".tif"
    "image/tiff"
  # Audio
  when ".mp3"
    "audio/mpeg"
  when ".wav"
    "audio/wav"
  when ".m4a"
    "audio/mp4"
  when ".flac"
    "audio/flac"
  when ".ogg"
    "audio/ogg"
  # Video
  when ".mp4"
    "video/mp4"
  when ".mov"
    "video/quicktime"
  when ".avi"
    "video/x-msvideo"
  when ".webm"
    "video/webm"
  else
    "application/octet-stream"
  end
end

.determine_document_type(file_path) ⇒ Object

Helper methods for document type determination



123
124
125
# File 'app/services/ragdoll/document_processor.rb', line 123

# Delegates document-type detection to a fresh Ragdoll::DocumentConverter.
#
# @param file_path [String] path of the file to classify
# @return [Object] whatever the converter's +determine_document_type+ returns
def self.determine_document_type(file_path)
  converter = Ragdoll::DocumentConverter.new
  converter.determine_document_type(file_path)
end

.determine_document_type_from_content_type(content_type) ⇒ Object



127
128
129
130
131
132
133
134
135
136
137
138
139
# File 'app/services/ragdoll/document_processor.rb', line 127

# Maps a MIME content type to the document type name used for documents.
#
# The value is normalized before matching: parameters after ";" are dropped,
# surrounding whitespace is stripped and the type is lower-cased, so
# "Application/PDF; charset=binary" is treated as "application/pdf".
#
# @param content_type [String, nil] MIME type, optionally with parameters
# @return [String] one of "pdf", "docx", "text", "markdown", "html", "image",
#   "audio" or "video"; "text" for nil, blank or unrecognized input
def self.determine_document_type_from_content_type(content_type)
  media_type = content_type.to_s.split(";", 2).first.to_s.strip.downcase

  case media_type
  when "application/pdf" then "pdf"
  when "application/vnd.openxmlformats-officedocument.wordprocessingml.document" then "docx"
  when "text/plain" then "text"
  when "text/markdown" then "markdown"
  when "text/html" then "html"
  # \A anchors the whole string; ^ would also match after an embedded newline.
  when %r{\Aimage/} then "image"
  when %r{\Aaudio/} then "audio"
  when %r{\Avideo/} then "video"
  else "text"
  end
end

.parse(file_path) ⇒ Object



14
15
16
# File 'app/services/ragdoll/document_processor.rb', line 14

# Convenience entry point: builds a processor for +file_path+ and parses it.
#
# @param file_path [String] path of the file to parse
# @return [Object] the result of the instance-level +parse+
def self.parse(file_path)
  processor = new(file_path)
  processor.parse
end

.parse_attachment(attached_file) ⇒ Object

Parse from Shrine attached file



19
20
21
22
23
# File 'app/services/ragdoll/document_processor.rb', line 19

# Parses an attached file by opening it and running a processor on the path
# of the object yielded by +open+.
#
# @param attached_file [Object] attachment responding to +open+ with a block
# @return [Object] whatever +attached_file.open+ returns for the block, whose
#   value is the instance-level +parse+ result
def self.parse_attachment(attached_file)
  attached_file.open do |local_copy|
    processor = new(local_copy.path, attached_file)
    processor.parse
  end
end

Instance Method Details

#parse ⇒ Object



86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# File 'app/services/ragdoll/document_processor.rb', line 86

# Parses the file at @file_path and returns a hash with the keys :content,
# :metadata, :title and :document_type.
#
# NOTE(review): this listing looks damaged by text extraction. The lines
# flagged below are missing identifiers (presumably a local holding the
# metadata hash, and the helper that builds it); confirm against
# app/services/ragdoll/document_processor.rb before relying on this text.
#
# @return [Hash] parsed document data
# @raise [ParseError] when the file does not exist, or when any StandardError
#   is raised while converting the file or assembling the result
def parse
  # Check if file exists first. This raise sits outside the begin/rescue
  # below, so its message is not re-wrapped.
  unless File.exist?(@file_path)
    raise ParseError, "File does not exist: #{@file_path}"
  end

  # Use the new unified document converter.
  # NOTE(review): called without a receiver on an instance; the instance-level
  # helper is not shown in this listing — confirm it exists.
  document_type = determine_document_type(@file_path)

  begin
    # Convert to text using the unified pipeline
    text_content = Ragdoll::DocumentConverter.convert_to_text(@file_path, document_type)

    # Extract metadata based on document type
    # NOTE(review): assignment target and callee are missing on the next line.
     = (document_type)

    # Add encoding information for text files; "UTF-8" is recorded when
    # detection returns nil/false.
    # NOTE(review): the receiver of [:encoding] is missing.
    if %w[text markdown html].include?(document_type)
      encoding = detect_file_encoding(@file_path) || "UTF-8"
      [:encoding] = encoding
    end

    # Get title from metadata or filename
    # NOTE(review): the receiver of [:title] is missing.
    title = [:title] || extract_title_from_filepath

    # NOTE(review): the value for the metadata: key is missing.
    {
      content: text_content,
      metadata: ,
      title: title,
      document_type: document_type
    }
  rescue StandardError => e
    # Any failure in the block above surfaces as ParseError with the original message.
    raise ParseError, "Failed to parse document: #{e.message}"
  end
end