Class: Ragdoll::DocumentProcessor
- Inherits: Object
- Inheritance chain: Object → Ragdoll::DocumentProcessor
- Defined in:
- app/services/ragdoll/document_processor.rb
Defined Under Namespace
Classes: ParseError, UnsupportedFormatError
Class Method Summary
-
.create_document_from_file(file_path, **options) ⇒ Object
Create document from file path.
-
.create_document_from_upload(uploaded_file, **options) ⇒ Object
Create document from uploaded file (Shrine compatible).
- .determine_content_type(file_path) ⇒ Object
-
.determine_document_type(file_path) ⇒ Object
Helper methods for document type determination.
- .determine_document_type_from_content_type(content_type) ⇒ Object
- .parse(file_path) ⇒ Object
-
.parse_attachment(attached_file) ⇒ Object
Parse from Shrine attached file.
Instance Method Summary
-
#initialize(file_path, attached_file = nil) ⇒ DocumentProcessor
constructor
A new instance of DocumentProcessor.
- #parse ⇒ Object
Constructor Details
#initialize(file_path, attached_file = nil) ⇒ DocumentProcessor
Returns a new instance of DocumentProcessor.
80 81 82 83 84 |
# File 'app/services/ragdoll/document_processor.rb', line 80

# Builds a processor bound to one file on disk.
#
# @param file_path [String] path of the file to parse; its extension is
#   cached in lower case
# @param attached_file [Object, nil] optional attachment the path was
#   obtained from; stored as given
def initialize(file_path, attached_file = nil)
  @file_path, @attached_file = file_path, attached_file

  extension = File.extname(file_path)
  @file_extension = extension.downcase
end
Class Method Details
.create_document_from_file(file_path, **options) ⇒ Object
Create document from file path
26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 |
# File 'app/services/ragdoll/document_processor.rb', line 26

# Parses the file at +file_path+ and creates a Ragdoll::Document from the
# parsed result, then attaches the file itself when it is still on disk.
#
# Errors raised by .parse or by Ragdoll::Document.create! propagate to the
# caller. The attachment is assigned after create! and the document is not
# saved again here.
#
# @param file_path [String] path of the file to ingest
# @param options [Hash] extra attributes forwarded verbatim to
#   Ragdoll::Document.create!
# @return [Ragdoll::Document] the created document
def self.create_document_from_file(file_path, **options)
  parsed = parse(file_path)

  # Fall back to "now" when the file vanished between parsing and this check.
  file_modified_at = File.exist?(file_path) ? File.mtime(file_path) : Time.current

  document = Ragdoll::Document.create!(
    location: File.expand_path(file_path),
    title: parsed[:title] || File.basename(file_path, File.extname(file_path)),
    content: parsed[:content],
    document_type: parsed[:document_type] || determine_document_type(file_path),
    metadata: parsed[:metadata] || {},
    status: "processed",
    file_modified_at: file_modified_at,
    **options
  )

  # Attach the file if it exists. The block form closes the handle once the
  # assignment returns, instead of leaving it open until garbage collection.
  # NOTE(review): relies on `file=` reading the IO during assignment (Shrine's
  # attacher uploads to cache storage on assign) — confirm for this model.
  if File.exist?(file_path)
    File.open(file_path) { |io| document.file = io }
  end

  document
end
.create_document_from_upload(uploaded_file, **options) ⇒ Object
Create document from uploaded file (Shrine compatible)
50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 |
# File 'app/services/ragdoll/document_processor.rb', line 50

# Creates a Ragdoll::Document for an uploaded file, attaches the upload, and
# then fills in content, title and metadata parsed from the attachment.
#
# The document is first created with empty content and status "processing".
# It is switched to "processed" only after the attachment has been parsed;
# when no attachment is present it is returned still in "processing".
#
# @param uploaded_file [Object] upload responding to #original_filename and
#   #mime_type
# @param options [Hash] only :title and :metadata are read
# @return [Ragdoll::Document] the created (and possibly updated) document
def self.create_document_from_upload(uploaded_file, **options)
  # Uploads may arrive without a filename; use a fixed placeholder then.
  filename = uploaded_file.original_filename || "uploaded_file"

  # Create document first
  document = Ragdoll::Document.create!(
    location: filename,
    title: options[:title] || File.basename(filename, File.extname(filename)),
    content: "", # Will be extracted after file attachment
    document_type: determine_document_type_from_content_type(uploaded_file.mime_type),
    status: "processing",
    metadata: options[:metadata] || {},
    file_modified_at: Time.current
  )

  # Attach the file
  document.file = uploaded_file

  # Extract content from attached file
  if document.file.present?
    parsed = parse_attachment(document.file)

    document.update!(
      content: parsed[:content],
      title: parsed[:title] || document.title,
      # Parsed metadata wins over caller-supplied metadata on key clashes.
      metadata: document.metadata.merge(parsed[:metadata] || {}),
      status: "processed"
    )
  end

  document
end
.determine_content_type(file_path) ⇒ Object
141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 |
# File 'app/services/ragdoll/document_processor.rb', line 141

# Maps a file path to a MIME type using only its (case-insensitive)
# extension.
#
# @param file_path [String, #to_path, nil] path whose extension is inspected
# @return [String] the MIME type, or "application/octet-stream" when the
#   extension is unknown, absent, or +file_path+ is nil
def self.determine_content_type(file_path)
  # File.extname raises TypeError on nil; treat a missing path like an
  # unknown extension instead.
  return "application/octet-stream" if file_path.nil?

  case File.extname(file_path).downcase
  when ".pdf" then "application/pdf"
  when ".docx" then "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
  when ".txt" then "text/plain"
  when ".md", ".markdown" then "text/markdown"
  when ".html", ".htm" then "text/html"
  when ".jpg", ".jpeg" then "image/jpeg"
  when ".png" then "image/png"
  when ".gif" then "image/gif"
  when ".webp" then "image/webp"
  when ".bmp" then "image/bmp"
  when ".svg" then "image/svg+xml"
  when ".ico" then "image/x-icon"
  when ".tiff", ".tif" then "image/tiff"
  when ".mp3" then "audio/mpeg"
  when ".wav" then "audio/wav"
  when ".m4a" then "audio/mp4"
  when ".flac" then "audio/flac"
  when ".ogg" then "audio/ogg"
  when ".mp4" then "video/mp4"
  when ".mov" then "video/quicktime"
  when ".avi" then "video/x-msvideo"
  when ".webm" then "video/webm"
  else "application/octet-stream"
  end
end
.determine_document_type(file_path) ⇒ Object
Helper methods for document type determination
123 124 125 |
# File 'app/services/ragdoll/document_processor.rb', line 123

# Resolves the document type for a path by handing it to a freshly built
# Ragdoll::DocumentConverter.
#
# @param file_path [String] path passed through unchanged to the converter
# @return [Object] whatever DocumentConverter#determine_document_type returns
def self.determine_document_type(file_path)
  converter = Ragdoll::DocumentConverter.new
  converter.determine_document_type(file_path)
end
.determine_document_type_from_content_type(content_type) ⇒ Object
127 128 129 130 131 132 133 134 135 136 137 138 139 |
# File 'app/services/ragdoll/document_processor.rb', line 127

# Maps a MIME content type to a document type label.
#
# The input is normalised before matching: parameters after ";" (for
# example "; charset=utf-8") are dropped, surrounding whitespace is removed
# and the value is lower-cased. Anything unrecognised, including nil or an
# empty string, maps to "text".
#
# @param content_type [String, nil] MIME type, optionally with parameters
# @return [String] one of "pdf", "docx", "text", "markdown", "html",
#   "image", "audio" or "video"
def self.determine_document_type_from_content_type(content_type)
  media_type = content_type.to_s.split(";", 2).first.to_s.strip.downcase

  case media_type
  when "application/pdf" then "pdf"
  when "application/vnd.openxmlformats-officedocument.wordprocessingml.document" then "docx"
  when "text/plain" then "text"
  when "text/markdown" then "markdown"
  when "text/html" then "html"
  # \A anchors to the start of the string; ^ would also match after any
  # embedded newline.
  when %r{\Aimage/} then "image"
  when %r{\Aaudio/} then "audio"
  when %r{\Avideo/} then "video"
  else "text"
  end
end
.parse(file_path) ⇒ Object
14 15 16 |
# File 'app/services/ragdoll/document_processor.rb', line 14

# Convenience entry point: builds a processor for +file_path+ and returns
# the result of its #parse.
#
# @param file_path [String] path of the file to parse
# @return [Object] whatever the instance-level #parse returns
def self.parse(file_path)
  processor = new(file_path)
  processor.parse
end
.parse_attachment(attached_file) ⇒ Object
Parse from Shrine attached file
19 20 21 22 23 |
# File 'app/services/ragdoll/document_processor.rb', line 19

# Parse from Shrine attached file.
#
# Opens the attachment, builds a processor for the path of the yielded
# object, and returns the result of parsing it. The processor is only used
# inside the block passed to #open.
#
# @param attached_file [#open] attachment whose #open yields an object
#   responding to #path
# @return [Object] the value returned by #parse, passed back through #open
def self.parse_attachment(attached_file)
  attached_file.open do |tempfile|
    new(tempfile.path, attached_file).parse
  end
end
Instance Method Details
#parse ⇒ Object
86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 |
# File 'app/services/ragdoll/document_processor.rb', line 86

# Converts the file at @file_path to text and gathers its metadata.
#
# @return [Hash] with keys :content (converted text), :metadata,
#   :title (metadata title, else one derived from the file path) and
#   :document_type
# @raise [ParseError] when the file does not exist, or when any
#   StandardError is raised during conversion or metadata extraction (the
#   original message is embedded in the ParseError message)
def parse
  # Check if file exists first
  raise ParseError, "File does not exist: #{@file_path}" unless File.exist?(@file_path)

  # Use the new unified document converter
  document_type = determine_document_type(@file_path)

  begin
    # Convert to text using the unified pipeline
    text_content = Ragdoll::DocumentConverter.convert_to_text(@file_path, document_type)

    # Extract metadata based on document type
    # NOTE(review): the helper name was lost from this listing and is
    # reconstructed here — confirm it against the class's private methods.
    metadata = extract_metadata_for_type(document_type)

    # Add encoding information for text files
    if %w[text markdown html].include?(document_type)
      encoding = detect_file_encoding(@file_path) || "UTF-8"
      metadata[:encoding] = encoding
    end

    # Get title from metadata or filename
    title = metadata[:title] || extract_title_from_filepath

    {
      content: text_content,
      metadata: metadata,
      title: title,
      document_type: document_type
    }
  rescue StandardError => e
    raise ParseError, "Failed to parse document: #{e.message}"
  end
end