Class: OcrFile::Document
- Inherits:
-
Object
- Object
- OcrFile::Document
- Defined in:
- lib/ocr-file/document.rb
Constant Summary collapse
- ACCEPTED_IMAGE_TYPES =
TODO: Skewness / text orientation detection. TODO: Better handwriting analysis.
['png', 'jpeg', 'jpg', 'tiff', 'bmp']
- PAGE_BREAK =
TODO: Make configurable
"\n\r\n"
- EFFECTS_TO_REMOVE =
['', 'norm', 'remove_shadow', 'bw']
- DEFAULT_CONFIG =
{
  # Images from PDF
  filetype: 'png',
  quality: 100,
  dpi: 300,
  # Text to PDF
  font: 'Helvetica',
  font_size: 5, #8 # 12
  text_x: 20,
  text_y: 800,
  minimum_word: 5,
  # Cloud-Vision OCR
  image_annotator: nil, # Needed for Cloud-Vision
  type_of_ocr: OcrFile::OcrEngines::CloudVision::DOCUMENT_TEXT_DETECTION,
  ocr_engine: 'tesseract', # 'cloud-vision'
  # Image Pre-Processing
  image_preprocess: true,
  effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'remove_shadow', 'bw'],
  automatic_reprocess: true,
  # PDF to Image Processing
  optimise_pdf: true,
  extract_pdf_images: true, # if false will screenshot each PDF page
  temp_filename_prefix: 'image',
  spelling_correction: true,
  keep_files: false,
  # Console Output
  verbose: true,
  timing: true
}
Instance Attribute Summary collapse
-
#config ⇒ Object
readonly
Returns the value of attribute config.
-
#end_time ⇒ Object
readonly
Returns the value of attribute end_time.
-
#filename ⇒ Object
readonly
Returns the value of attribute filename.
-
#final_save_file ⇒ Object
readonly
Returns the value of attribute final_save_file.
-
#ocr_engine ⇒ Object
readonly
Returns the value of attribute ocr_engine.
-
#original_file_path ⇒ Object
readonly
Returns the value of attribute original_file_path.
-
#save_file_path ⇒ Object
readonly
Returns the value of attribute save_file_path.
-
#start_time ⇒ Object
readonly
Returns the value of attribute start_time.
Instance Method Summary collapse
- #close ⇒ Object
- #image? ⇒ Boolean
-
#initialize(original_file_path:, save_file_path:, config: DEFAULT_CONFIG) ⇒ Document
constructor
save_file_path will also generate a tmp path for tmp files.
- #pdf? ⇒ Boolean
-
#text? ⇒ Boolean
Treat anything which isn't a PDF or image as text.
-
#to_pdf ⇒ Object
Trigger OCR pipeline.
- #to_s ⇒ Object
- #to_text ⇒ Object
Constructor Details
#initialize(original_file_path:, save_file_path:, config: DEFAULT_CONFIG) ⇒ Document
save_file_path is expected to be a folder path; it will also be used to generate a tmp path for tmp files. TODO: Add more input validation.
50 51 52 53 54 55 56 57 58 59 60 61 |
# File 'lib/ocr-file/document.rb', line 50

# Records the source path, derives the output file name and selects the OCR engine.
#
# The output name has the form "<save_file_path>/<filename>-<YYYY-MM-DD>-<epoch seconds>",
# where <filename> is the part of the source file's base name before its first dot.
#
# @param original_file_path [String] path of the file to process
# @param save_file_path [String] folder path the results are written under
# @param config [Hash] settings; any key not supplied falls back to DEFAULT_CONFIG
def initialize(original_file_path:, save_file_path:, config: DEFAULT_CONFIG)
  @original_file_path = original_file_path
  # File.basename copes with trailing separators and empty paths, where the
  # previous manual split('/') raised NoMethodError on nil.
  @filename = File.basename(original_file_path).split('.').first.to_s

  # Sample the clock once so the date and epoch parts of the name always agree.
  now = Time.now
  @save_file_path = save_file_path
  @final_save_file = "#{@save_file_path}/#{@filename}-#{now.strftime('%Y-%m-%d')}-#{now.to_i}"

  # Merge over the defaults so a partial config still has every key, and so the
  # shared DEFAULT_CONFIG constant is never aliased by an instance.
  @config = DEFAULT_CONFIG.merge(config)
  @ocr_engine = find_ocr_engine(@config[:ocr_engine])
end
Instance Attribute Details
#config ⇒ Object (readonly)
Returns the value of attribute config.
39 40 41 |
# File 'lib/ocr-file/document.rb', line 39 def config @config end |
#end_time ⇒ Object (readonly)
Returns the value of attribute end_time.
39 40 41 |
# File 'lib/ocr-file/document.rb', line 39 def end_time @end_time end |
#filename ⇒ Object (readonly)
Returns the value of attribute filename.
39 40 41 |
# File 'lib/ocr-file/document.rb', line 39 def filename @filename end |
#final_save_file ⇒ Object (readonly)
Returns the value of attribute final_save_file.
39 40 41 |
# File 'lib/ocr-file/document.rb', line 39 def final_save_file @final_save_file end |
#ocr_engine ⇒ Object (readonly)
Returns the value of attribute ocr_engine.
39 40 41 |
# File 'lib/ocr-file/document.rb', line 39 def ocr_engine @ocr_engine end |
#original_file_path ⇒ Object (readonly)
Returns the value of attribute original_file_path.
39 40 41 |
# File 'lib/ocr-file/document.rb', line 39 def original_file_path @original_file_path end |
#save_file_path ⇒ Object (readonly)
Returns the value of attribute save_file_path.
39 40 41 |
# File 'lib/ocr-file/document.rb', line 39 def save_file_path @save_file_path end |
#start_time ⇒ Object (readonly)
Returns the value of attribute start_time.
39 40 41 |
# File 'lib/ocr-file/document.rb', line 39 def start_time @start_time end |
Instance Method Details
#close ⇒ Object
121 122 123 124 |
# File 'lib/ocr-file/document.rb', line 121

# Clears the folder at @temp_folder_path, unless keep_files? reports that the
# files should be kept.
def close
  ::OcrFile::FileHelpers.clear_folder(@temp_folder_path) unless keep_files?
end
#image? ⇒ Boolean
67 68 69 70 |
# File 'lib/ocr-file/document.rb', line 67

# Whether the source file is one of the ACCEPTED_IMAGE_TYPES.
#
# Only the final file extension is compared (case-insensitively). A substring
# match would also accept names such as "photo.png.txt" or any file inside a
# directory like "my.png.files/".
#
# @return [Boolean] false for PDFs; otherwise true when the extension is an accepted image type
def image?
  return false if pdf?

  extension = File.extname(@original_file_path).downcase
  ACCEPTED_IMAGE_TYPES.any? { |type| extension == ".#{type}" }
end
#pdf? ⇒ Boolean
63 64 65 |
# File 'lib/ocr-file/document.rb', line 63

# Whether the source file is a PDF, judged by its final file extension.
#
# The comparison is case-insensitive. Only the real extension is inspected, so
# "report.pdf.png" or a file inside a directory such as "my.pdfs/" is not
# mistaken for a PDF, as it would be with a substring match on the whole path.
#
# @return [Boolean] true when the path's extension is ".pdf"
def pdf?
  File.extname(@original_file_path).downcase == '.pdf'
end
#text? ⇒ Boolean
Treat anything which isn't a PDF or image as text
73 74 75 |
# File 'lib/ocr-file/document.rb', line 73

# Fallback classification: a file that is neither a PDF nor an image is
# handled as plain text.
#
# @return [Boolean] true when both pdf? and image? are false
def text?
  !(pdf? || image?)
end
#to_pdf ⇒ Object
Trigger OCR pipeline
78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 |
# File 'lib/ocr-file/document.rb', line 78

# Triggers the OCR pipeline and produces a PDF from the source file.
#
# When config[:automatic_reprocess] is set and the file is not text,
# find_best_image_processing is run first. The file is then dispatched by type:
# PDFs to ocr_pdf_to_searchable_pdf, text to text_to_pdf, and everything else
# (images) to ocr_image_to_pdf.
#
# @return [Object] whatever print_time returns
def to_pdf
  @start_time = Time.now

  begin
    find_best_image_processing(save: false) if config[:automatic_reprocess] && !text?

    if pdf?
      ocr_pdf_to_searchable_pdf
    elsif text?
      text_to_pdf
    else # is an image
      ocr_image_to_pdf
    end
  ensure
    # Run cleanup and stamp the end time even when a pipeline step raises, so
    # a failed run does not skip the close step.
    close
    @end_time = Time.now
  end

  print_time
end
#to_s ⇒ Object
107 108 109 110 111 112 113 114 115 116 117 118 119 |
# File 'lib/ocr-file/document.rb', line 107

# Returns the document's text without saving it.
#
# Text files are read directly and returned as-is; no cleanup or timing output
# happens on that path. Everything else goes through
# find_best_image_processing(save: false).
#
# @return [Object] the opened text file's contents, or the result of find_best_image_processing
def to_s
  @start_time = Time.now
  return ::OcrFile::FileHelpers.open_text_file(@original_file_path) if text?

  begin
    text = find_best_image_processing(save: false)
  ensure
    # Run cleanup and stamp the end time even when processing raises.
    close
    @end_time = Time.now
  end

  print_time
  text
end
#to_text ⇒ Object
96 97 98 99 100 101 102 103 104 105 |
# File 'lib/ocr-file/document.rb', line 96

# Runs the document through find_best_image_processing(save: true).
#
# Text files are read directly and returned as-is; no cleanup or timing output
# happens on that path.
#
# @return [Object] the opened text file's contents for text input, otherwise whatever print_time returns
def to_text
  @start_time = Time.now
  return ::OcrFile::FileHelpers.open_text_file(@original_file_path) if text?

  begin
    find_best_image_processing(save: true)
  ensure
    # Run cleanup and stamp the end time even when processing raises.
    close
    @end_time = Time.now
  end

  print_time
end