Class: OcrFile::Document
- Inherits:
-
Object
- Object
- OcrFile::Document
- Defined in:
- lib/ocr-file/document.rb
Constant Summary collapse
- ACCEPTED_IMAGE_TYPES =
['png', 'jpeg', 'jpg', 'tiff', 'bmp']
- PAGE_BREAK =
TODO: Make configurable
"\n\r\n"
- DEFAULT_CONFIG =
{
  # Images from PDF
  filetype: 'png',
  quality: 100,
  dpi: 300,
  # Text to PDF
  font: 'Helvetica',
  font_size: 5, #8 # 12
  text_x: 20,
  text_y: 800,
  minimum_word: 5,
  # Cloud-Vision OCR
  image_annotator: nil, # Needed for Cloud-Vision
  type_of_ocr: OcrFile::OcrEngines::CloudVision::DOCUMENT_TEXT_DETECTION,
  ocr_engine: 'tesseract', # 'cloud-vision'
  # Image Pre-Processing
  image_pre_preprocess: true,
  effects: ['bw', 'norm'],
  threshold: 0.25,
  # PDF to Image Processing
  optimise_pdf: true,
  extract_pdf_images: true, # if false will screenshot each PDF page
  temp_filename_prefix: 'image',
  # Console Output
  verbose: true,
}
Instance Attribute Summary collapse
-
#config ⇒ Object
readonly
Returns the value of attribute config.
-
#filename ⇒ Object
readonly
Returns the value of attribute filename.
-
#final_save_file ⇒ Object
readonly
Returns the value of attribute final_save_file.
-
#ocr_engine ⇒ Object
readonly
Returns the value of attribute ocr_engine.
-
#original_file_path ⇒ Object
readonly
Returns the value of attribute original_file_path.
-
#save_file_path ⇒ Object
readonly
Returns the value of attribute save_file_path.
Instance Method Summary collapse
- #close ⇒ Object
- #image? ⇒ Boolean
-
#initialize(original_file_path:, save_file_path:, config: DEFAULT_CONFIG) ⇒ Document
constructor
save_file_path will also generate a tmp path for tmp files.
- #pdf? ⇒ Boolean
-
#text? ⇒ Boolean
Treat anything which isn't a PDF or image as text.
- #to_pdf ⇒ Object
- #to_s ⇒ Object
- #to_text ⇒ Object
Constructor Details
#initialize(original_file_path:, save_file_path:, config: DEFAULT_CONFIG) ⇒ Document
save_file_path is also used to generate a tmp path for tmp files; it is expected to be a folder path. TODO: Add in more input validation.
41 42 43 44 45 46 47 48 49 50 51 52 |
# File 'lib/ocr-file/document.rb', line 41
#
# Wraps a source file and works out where its converted output will go.
#
# @param original_file_path [String] path of the file to convert
# @param save_file_path [String] folder the output is written into
# @param config [Hash] options; config[:ocr_engine] is handed to find_ocr_engine
def initialize(original_file_path:, save_file_path:, config: DEFAULT_CONFIG)
  @original_file_path = original_file_path
  # Strip the directory and only the final extension, so a name such as
  # "report.v2.pdf" keeps "report.v2" instead of being cut at the first dot.
  @filename = File.basename(original_file_path, '.*')

  # Read the clock once so the date and epoch parts of the name always agree.
  now = Time.now
  @save_file_path = save_file_path
  @final_save_file = "#{@save_file_path}/#{@filename}-#{now.strftime('%Y-%m-%d')}-#{now.to_i}"

  @config = config
  @ocr_engine = find_ocr_engine(config[:ocr_engine])
end
Instance Attribute Details
#config ⇒ Object (readonly)
Returns the value of attribute config.
32 33 34 |
# File 'lib/ocr-file/document.rb', line 32
#
# @return [Object] the configuration object given to the constructor
def config
  @config
end
#filename ⇒ Object (readonly)
Returns the value of attribute filename.
32 33 34 |
# File 'lib/ocr-file/document.rb', line 32
#
# @return [Object] the name derived from the original file path by the
#   constructor (directory and extension stripped)
def filename
  @filename
end
#final_save_file ⇒ Object (readonly)
Returns the value of attribute final_save_file.
32 33 34 |
# File 'lib/ocr-file/document.rb', line 32
#
# @return [Object] the output path stem built by the constructor; the
#   conversion methods append ".pdf" or ".txt" to it
def final_save_file
  @final_save_file
end
#ocr_engine ⇒ Object (readonly)
Returns the value of attribute ocr_engine.
32 33 34 |
# File 'lib/ocr-file/document.rb', line 32
#
# @return [Object] the engine the constructor obtained from find_ocr_engine
def ocr_engine
  @ocr_engine
end
#original_file_path ⇒ Object (readonly)
Returns the value of attribute original_file_path.
32 33 34 |
# File 'lib/ocr-file/document.rb', line 32
#
# @return [Object] the source file path exactly as given to the constructor
def original_file_path
  @original_file_path
end
#save_file_path ⇒ Object (readonly)
Returns the value of attribute save_file_path.
32 33 34 |
# File 'lib/ocr-file/document.rb', line 32
#
# @return [Object] the output folder path exactly as given to the constructor
def save_file_path
  @save_file_path
end
Instance Method Details
#close ⇒ Object
134 135 136 |
# File 'lib/ocr-file/document.rb', line 134
#
# Hands the document's temp folder path to FileHelpers.clear_folder.
#
# @return [Object] whatever FileHelpers.clear_folder returns
def close
  file_helpers = ::OcrFile::FileHelpers
  file_helpers.clear_folder(@temp_folder_path)
end
#image? ⇒ Boolean
58 59 60 61 |
# File 'lib/ocr-file/document.rb', line 58
#
# Whether the original file is one of the ACCEPTED_IMAGE_TYPES.
#
# The decision is made on the file's real extension, compared
# case-insensitively, so "PAGE.JPG" counts as an image while
# "notes.png.txt" or "my.bmp.dir/readme" do not.
#
# @return [Boolean] false for anything pdf? accepts, otherwise whether the
#   extension is an accepted image type
def image?
  return false if pdf?

  extension = File.extname(@original_file_path).downcase
  ACCEPTED_IMAGE_TYPES.any? { |type| extension == ".#{type}" }
end
#pdf? ⇒ Boolean
54 55 56 |
# File 'lib/ocr-file/document.rb', line 54
#
# Whether the original file is a PDF.
#
# Checks the file's real extension, case-insensitively, rather than looking
# for ".pdf" anywhere in the path; "REPORT.PDF" is a PDF, while
# "docs.pdfs/readme.txt" and "report.pdf.txt" are not.
#
# @return [Boolean]
def pdf?
  File.extname(@original_file_path).downcase == '.pdf'
end
#text? ⇒ Boolean
Treat anything which isn't a PDF or image as text
64 65 66 |
# File 'lib/ocr-file/document.rb', line 64
#
# Anything that is neither a PDF nor an image is handled as text.
#
# @return [Boolean]
def text?
  return false if pdf?

  !image?
end
#to_pdf ⇒ Object
68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 |
# File 'lib/ocr-file/document.rb', line 68
#
# Converts the document and saves the result as "#{@final_save_file}.pdf".
#
# * PDF input: each image extracted from the PDF is OCR'd to its own PDF, the
#   results are merged and the merged file is saved. The temp folder is
#   cleared via #close even when OCR, merging or saving raises.
# * Text input: the file's text is turned into a PDF and saved.
# * Image input: delegated to ocr_image_to_pdf.
#
# @return [Object] the result of the last step of the branch taken (the save
#   call for PDF and text input; previously the PDF branch returned the
#   value of #close)
def to_pdf
  if pdf?
    create_temp_folder
    begin
      image_paths = extract_image_paths_from_pdf(@original_file_path)
      pdfs_to_merge = image_paths.map do |image_path|
        @ocr_engine.ocr_to_pdf(image_path, options: @config)
      end

      merged_pdf = OcrFile::ImageEngines::PdfEngine.merge(pdfs_to_merge)
      OcrFile::ImageEngines::PdfEngine
        .save_pdf(merged_pdf, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
    ensure
      # Always remove the extracted page images, even if a step above failed.
      close
    end
  elsif text?
    text = ::OcrFile::FileHelpers.open_text_file(@original_file_path)
    pdf_file = OcrFile::ImageEngines::PdfEngine.pdf_from_text(text, @config)

    OcrFile::ImageEngines::PdfEngine
      .save_pdf(pdf_file, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
  else # is an image
    ocr_image_to_pdf
  end
end
#to_s ⇒ Object
114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 |
# File 'lib/ocr-file/document.rb', line 114
#
# Returns the document's text without writing an output file here.
#
# * PDF input: each image extracted from the PDF is OCR'd and the page texts
#   are concatenated, every page preceded by PAGE_BREAK. The temp folder is
#   cleared via #close even when extraction or OCR raises.
# * Text input: the file's contents are returned.
# * Image input: delegated to ocr_image_to_text(save: false).
#
# @return [Object] the text of the document (a String for PDF input)
def to_s
  if pdf?
    create_temp_folder
    begin
      image_paths = extract_image_paths_from_pdf(@original_file_path)

      # Append into one buffer rather than rebuilding the whole string per page.
      image_paths.each_with_object(+'') do |image_path, text|
        text << "#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(image_path, options: @config)}"
      end
    ensure
      # Always remove the extracted page images, even if a step above failed.
      close
    end
  elsif text?
    ::OcrFile::FileHelpers.open_text_file(@original_file_path)
  else # is an image
    ocr_image_to_text(save: false)
  end
end
#to_text ⇒ Object
96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 |
# File 'lib/ocr-file/document.rb', line 96
#
# Produces the text form of the document.
#
# * PDF input: each image extracted from the PDF is OCR'd and its text,
#   followed by PAGE_BREAK, is appended to "#{@final_save_file}.txt". The
#   temp folder is cleared via #close even when extraction, OCR or the
#   append raises.
# * Text input: the file's contents are read and returned; nothing is
#   written in this method.
# * Image input: delegated to ocr_image_to_text(save: true).
#
# @return [Object] the result of the last step of the branch taken
#   (previously the PDF branch returned the value of #close)
def to_text
  if pdf?
    create_temp_folder
    begin
      output_path = "#{@final_save_file}.txt"

      extract_image_paths_from_pdf(@original_file_path).each do |image_path|
        text = @ocr_engine.ocr_to_text(image_path, options: @config)
        ::OcrFile::FileHelpers.append_file(output_path, "#{text}#{PAGE_BREAK}")
      end
    ensure
      # Always remove the extracted page images, even if a step above failed.
      close
    end
  elsif text?
    ::OcrFile::FileHelpers.open_text_file(@original_file_path)
  else # is an image
    ocr_image_to_text(save: true)
  end
end