Class: OcrFile::Document
- Inherits:
-
Object
- Object
- OcrFile::Document
- Defined in:
- lib/ocr-file/document.rb
Constant Summary collapse
- ACCEPTED_IMAGE_TYPES =
TODO: Skewness / text orientation detection
TODO: Better handwriting analysis
['png', 'jpeg', 'jpg', 'tiff', 'bmp']
- PAGE_BREAK =
TODO: Make configurable
"\n\r\n"
- DEFAULT_CONFIG =
{
  # Images from PDF
  filetype: 'png',
  quality: 100,
  dpi: 300,

  # Text to PDF
  font: 'Helvetica',
  font_size: 5, # 8 # 12
  text_x: 20,
  text_y: 800,
  minimum_word: 5,

  # Cloud-Vision OCR
  image_annotator: nil, # Needed for Cloud-Vision
  type_of_ocr: OcrFile::OcrEngines::CloudVision::DOCUMENT_TEXT_DETECTION,
  ocr_engine: 'tesseract', # 'cloud-vision'

  # Image Pre-Processing
  image_preprocess: true,
  effects: %w[despeckle deskew enhance sharpen remove_shadow bw],

  # PDF to Image Processing
  optimise_pdf: true,
  extract_pdf_images: true, # if false will screenshot each PDF page
  temp_filename_prefix: 'image',

  # Console Output
  verbose: true,
}
Instance Attribute Summary collapse
-
#config ⇒ Object
readonly
Returns the value of attribute config.
-
#filename ⇒ Object
readonly
Returns the value of attribute filename.
-
#final_save_file ⇒ Object
readonly
Returns the value of attribute final_save_file.
-
#ocr_engine ⇒ Object
readonly
Returns the value of attribute ocr_engine.
-
#original_file_path ⇒ Object
readonly
Returns the value of attribute original_file_path.
-
#save_file_path ⇒ Object
readonly
Returns the value of attribute save_file_path.
Instance Method Summary collapse
- #close ⇒ Object
- #image? ⇒ Boolean
-
#initialize(original_file_path:, save_file_path:, config: DEFAULT_CONFIG) ⇒ Document
constructor
save_file_path will also generate a tmp path for tmp files.
- #pdf? ⇒ Boolean
-
#text? ⇒ Boolean
Treat anything which isn't a PDF or image as text.
-
#to_pdf ⇒ Object
Trigger OCR pipeline.
- #to_s ⇒ Object
- #to_text ⇒ Object
Constructor Details
#initialize(original_file_path:, save_file_path:, config: DEFAULT_CONFIG) ⇒ Document
save_file_path will also generate a tmp path for tmp files. Expected: a folder path. TODO: Add in more input validation.
43 44 45 46 47 48 49 50 51 52 53 54 |
# File 'lib/ocr-file/document.rb', line 43
# Records the input/output locations, derives the output file stem and
# resolves the OCR engine named in the config.
#
# @param original_file_path [String] path of the file to convert
# @param save_file_path [String] folder path the results are written under
# @param config [Hash] options hash; +config[:ocr_engine]+ selects the engine
def initialize(original_file_path:, save_file_path:, config: DEFAULT_CONFIG)
  @original_file_path = original_file_path
  # Strip the directory and only the *last* extension, so "report.v2.pdf"
  # keeps "report.v2" and a dotfile such as ".profile" keeps its name.
  @filename = File.basename(original_file_path, '.*')
  @save_file_path = save_file_path

  # Read the clock once so the date and epoch parts always agree.
  now = Time.now
  @final_save_file = "#{@save_file_path}/#{@filename}-#{now.strftime('%Y-%m-%d')}-#{now.to_i}"

  @config = config
  @ocr_engine = find_ocr_engine(config[:ocr_engine])
end
Instance Attribute Details
#config ⇒ Object (readonly)
Returns the value of attribute config.
34 35 36 |
# File 'lib/ocr-file/document.rb', line 34
# Reader for +@config+.
#
# @return [Object] the options hash stored by #initialize from its +config:+ argument
def config
  @config
end
#filename ⇒ Object (readonly)
Returns the value of attribute filename.
34 35 36 |
# File 'lib/ocr-file/document.rb', line 34
# Reader for +@filename+.
#
# @return [Object] the name #initialize derived from the input path
#   (directory and extension removed)
def filename
  @filename
end
#final_save_file ⇒ Object (readonly)
Returns the value of attribute final_save_file.
34 35 36 |
# File 'lib/ocr-file/document.rb', line 34
# Reader for +@final_save_file+.
#
# @return [Object] the extension-less output path built in #initialize as
#   "<save_file_path>/<filename>-<date>-<epoch seconds>"
def final_save_file
  @final_save_file
end
#ocr_engine ⇒ Object (readonly)
Returns the value of attribute ocr_engine.
34 35 36 |
# File 'lib/ocr-file/document.rb', line 34
# Reader for +@ocr_engine+.
#
# @return [Object] the engine #initialize obtained from
#   +find_ocr_engine(config[:ocr_engine])+
def ocr_engine
  @ocr_engine
end
#original_file_path ⇒ Object (readonly)
Returns the value of attribute original_file_path.
34 35 36 |
# File 'lib/ocr-file/document.rb', line 34
# Reader for +@original_file_path+.
#
# @return [Object] the +original_file_path:+ argument given to #initialize
def original_file_path
  @original_file_path
end
#save_file_path ⇒ Object (readonly)
Returns the value of attribute save_file_path.
34 35 36 |
# File 'lib/ocr-file/document.rb', line 34
# Reader for +@save_file_path+.
#
# @return [Object] the +save_file_path:+ argument given to #initialize
def save_file_path
  @save_file_path
end
Instance Method Details
#close ⇒ Object
140 141 142 |
# File 'lib/ocr-file/document.rb', line 140
# Clears the temp folder recorded in +@temp_folder_path+.
#
# The text branches of #to_pdf and #to_text reach this method without calling
# create_temp_folder first, so +@temp_folder_path+ can still be nil here; in
# that case there is nothing to clear and the helper is not invoked.
#
# @return [Object, nil] the result of FileHelpers.clear_folder, or nil when no
#   temp folder has been recorded
def close
  return if @temp_folder_path.nil?

  ::OcrFile::FileHelpers.clear_folder(@temp_folder_path)
end
#image? ⇒ Boolean
60 61 62 63 |
# File 'lib/ocr-file/document.rb', line 60
# Whether the input file's extension is one of ACCEPTED_IMAGE_TYPES.
#
# Only the real (final) extension is compared, case-insensitively, so a
# directory called "exports.png" or a name like "photo.png.txt" no longer
# counts as an image.
#
# @return [Boolean]
def image?
  # PDFs always win, whatever else the path looks like.
  return false if pdf?

  extension = File.extname(@original_file_path).downcase.delete_prefix('.')
  ACCEPTED_IMAGE_TYPES.include?(extension)
end
#pdf? ⇒ Boolean
56 57 58 |
# File 'lib/ocr-file/document.rb', line 56
# Whether the input file's extension is ".pdf" (case-insensitive).
#
# Only the real (final) extension is compared, so "report.pdf.txt" or a file
# inside a directory named "archive.pdfs" is not mistaken for a PDF.
#
# @return [Boolean]
def pdf?
  File.extname(@original_file_path).downcase == '.pdf'
end
#text? ⇒ Boolean
Treat anything which isn't a PDF or image as text
66 67 68 |
# File 'lib/ocr-file/document.rb', line 66
# Text is the fallback type: true for any input that is neither a PDF nor
# an image.
#
# @return [Boolean]
def text?
  !(pdf? || image?)
end
#to_pdf ⇒ Object
Trigger OCR pipeline
71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 |
# File 'lib/ocr-file/document.rb', line 71
# Trigger OCR pipeline and write the result to "#{final_save_file}.pdf".
#
# * PDF input   - each image path returned by extract_image_paths_from_pdf is
#   run through process_image and the OCR engine's +ocr_to_pdf+; the results
#   are merged in order and saved.
# * Text input  - the file's text is turned into a PDF with
#   PdfEngine.pdf_from_text and saved; no OCR is run.
# * Image input - delegated to ocr_image_to_pdf.
#
# @return [Object] whatever #close returns
# @raise [StandardError] any failure in the PDF branch is re-raised after
#   #close has been called
def to_pdf
  if pdf?
    create_temp_folder
    begin
      page_pdfs = extract_image_paths_from_pdf(@original_file_path).map do |image_path|
        @ocr_engine.ocr_to_pdf(process_image(image_path), options: @config)
      end

      merged_pdf = OcrFile::ImageEngines::PdfEngine.merge(page_pdfs)
      OcrFile::ImageEngines::PdfEngine
        .save_pdf(merged_pdf, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
    rescue StandardError
      # create_temp_folder has already run; clear up before propagating so a
      # failed extraction/OCR/merge/save does not leave temp files behind.
      close
      raise
    end
  elsif text?
    text = ::OcrFile::FileHelpers.open_text_file(@original_file_path)
    pdf_file = OcrFile::ImageEngines::PdfEngine.pdf_from_text(text, @config)
    OcrFile::ImageEngines::PdfEngine
      .save_pdf(pdf_file, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
  else # is an image
    ocr_image_to_pdf
  end

  close
end
#to_s ⇒ Object
117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
# File 'lib/ocr-file/document.rb', line 117 def to_s if pdf? create_temp_folder image_paths = extract_image_paths_from_pdf(@original_file_path) text = '' image_paths.each do |image_path| text = "#{text}#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(process_image(image_path), options: @config)}" end close text elsif text? ::OcrFile::FileHelpers.open_text_file(@original_file_path) else # is an image text = ocr_image_to_text(save: false) close text end end |
#to_text ⇒ Object
99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 |
# File 'lib/ocr-file/document.rb', line 99
# Runs OCR and saves the result as text.
#
# * PDF input   - each image path returned by extract_image_paths_from_pdf is
#   run through process_image and the OCR engine's +ocr_to_text+; every page's
#   text followed by PAGE_BREAK is appended to "#{final_save_file}.txt".
# * Text input  - the file is opened via FileHelpers.open_text_file.
# * Image input - delegated to +ocr_image_to_text(save: true)+.
#
# @return [Object] whatever #close returns
# @raise [StandardError] any failure in the PDF branch is re-raised after
#   #close has been called
def to_text
  if pdf?
    create_temp_folder
    begin
      output_path = "#{@final_save_file}.txt"
      extract_image_paths_from_pdf(@original_file_path).each do |image_path|
        page_text = @ocr_engine.ocr_to_text(process_image(image_path), options: @config)
        ::OcrFile::FileHelpers.append_file(output_path, "#{page_text}#{PAGE_BREAK}")
      end
    rescue StandardError
      # create_temp_folder has already run; clear up before propagating so a
      # failed extraction/OCR/append does not leave temp files behind.
      close
      raise
    end
  elsif text?
    # NOTE(review): the content read here is neither written nor returned by
    # this method - confirm whether text input is meant to be a no-op.
    ::OcrFile::FileHelpers.open_text_file(@original_file_path)
  else # is an image
    ocr_image_to_text(save: true)
  end

  close
end