Class: OcrFile::Document
- Inherits:
-
Object
- Object
- OcrFile::Document
- Defined in:
- lib/ocr-file/document.rb
Constant Summary collapse
- ACCEPTED_IMAGE_TYPES =
TODO: Skewness / text orientation detection. TODO: Better handwriting analysis.
['png', 'jpeg', 'jpg', 'tiff', 'bmp']
- PAGE_BREAK =
TODO: Make configurable
"\n\r\n"
- EFFECTS_TO_REMOVE =
['', 'norm', 'remove_shadow', 'bw']
- DEFAULT_CONFIG =
{
  # Images from PDF
  filetype: 'png',
  quality: 100,
  dpi: 300,
  # Text to PDF
  font: 'Helvetica',
  font_size: 5, #8 # 12
  text_x: 20,
  text_y: 800,
  minimum_word: 5,
  # Cloud-Vision OCR
  image_annotator: nil, # Needed for Cloud-Vision
  type_of_ocr: OcrFile::OcrEngines::CloudVision::DOCUMENT_TEXT_DETECTION,
  ocr_engine: 'tesseract', # 'cloud-vision'
  # Image Pre-Processing
  image_preprocess: true,
  effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'remove_shadow', 'bw'],
  automatic_reprocess: true,
  dimensions: nil, # width, height. Will lock images to these dimensions
  # PDF to Image Processing
  optimise_pdf: true,
  extract_pdf_images: true, # if false will screenshot each PDF page
  temp_filename_prefix: 'image',
  spelling_correction: true,
  keep_files: false,
  # Console Output
  verbose: true,
  timing: true
}
Instance Attribute Summary collapse
-
#config ⇒ Object
readonly
Returns the value of attribute config.
-
#end_time ⇒ Object
readonly
Returns the value of attribute end_time.
-
#filename ⇒ Object
readonly
Returns the value of attribute filename.
-
#final_save_file ⇒ Object
readonly
Returns the value of attribute final_save_file.
-
#ocr_engine ⇒ Object
readonly
Returns the value of attribute ocr_engine.
-
#original_file_path ⇒ Object
readonly
Returns the value of attribute original_file_path.
-
#save_file_path ⇒ Object
readonly
Returns the value of attribute save_file_path.
-
#start_time ⇒ Object
readonly
Returns the value of attribute start_time.
Instance Method Summary collapse
- #close ⇒ Object
- #image? ⇒ Boolean
-
#initialize(original_file_path:, save_file_path:, config: DEFAULT_CONFIG) ⇒ Document
constructor
save_file_path will also generate a tmp path for tmp files.
- #pdf? ⇒ Boolean
-
#text? ⇒ Boolean
Treat anything which isn't a PDF or image as text.
-
#to_pdf ⇒ Object
Trigger OCR pipeline.
- #to_s ⇒ Object
- #to_text ⇒ Object
Constructor Details
#initialize(original_file_path:, save_file_path:, config: DEFAULT_CONFIG) ⇒ Document
save_file_path is expected to be a folder path and will also be used to generate a tmp path for tmp files. TODO: Add more input validation.
51 52 53 54 55 56 57 58 59 60 61 62 |
# File 'lib/ocr-file/document.rb', line 51
#
# Records the input and output locations, derives the output file name and
# resolves the OCR engine named in the config.
#
# @param original_file_path [String] path of the file to process
# @param save_file_path [String] folder path the results are written under
# @param config [Hash] processing options; DEFAULT_CONFIG when omitted
def initialize(original_file_path:, save_file_path:, config: DEFAULT_CONFIG)
  @original_file_path = original_file_path
  # Strip only the final extension so dotted names ("scan.v1.pdf",
  # "scan.v2.pdf") keep distinct output names instead of both becoming "scan".
  @filename = File.basename(original_file_path, '.*')
  # Read the clock once so the date and epoch parts describe the same instant.
  now = Time.now
  @save_file_path = save_file_path
  @final_save_file = "#{@save_file_path}/#{@filename}-#{now.strftime('%Y-%m-%d')}-#{now.to_i}"
  @config = config
  @ocr_engine = find_ocr_engine(config[:ocr_engine])
end
Instance Attribute Details
#config ⇒ Object (readonly)
Returns the value of attribute config.
40 41 42 |
# File 'lib/ocr-file/document.rb', line 40
#
# Returns the value of attribute config.
#
# @return [Object] the current value of @config
def config
  @config
end
#end_time ⇒ Object (readonly)
Returns the value of attribute end_time.
40 41 42 |
# File 'lib/ocr-file/document.rb', line 40
#
# Returns the value of attribute end_time.
#
# @return [Object] the current value of @end_time
def end_time
  @end_time
end
#filename ⇒ Object (readonly)
Returns the value of attribute filename.
40 41 42 |
# File 'lib/ocr-file/document.rb', line 40
#
# Returns the value of attribute filename.
#
# @return [Object] the current value of @filename
def filename
  @filename
end
#final_save_file ⇒ Object (readonly)
Returns the value of attribute final_save_file.
40 41 42 |
# File 'lib/ocr-file/document.rb', line 40
#
# Returns the value of attribute final_save_file.
#
# @return [Object] the current value of @final_save_file
def final_save_file
  @final_save_file
end
#ocr_engine ⇒ Object (readonly)
Returns the value of attribute ocr_engine.
40 41 42 |
# File 'lib/ocr-file/document.rb', line 40
#
# Returns the value of attribute ocr_engine.
#
# @return [Object] the current value of @ocr_engine
def ocr_engine
  @ocr_engine
end
#original_file_path ⇒ Object (readonly)
Returns the value of attribute original_file_path.
40 41 42 |
# File 'lib/ocr-file/document.rb', line 40
#
# Returns the value of attribute original_file_path.
#
# @return [Object] the current value of @original_file_path
def original_file_path
  @original_file_path
end
#save_file_path ⇒ Object (readonly)
Returns the value of attribute save_file_path.
40 41 42 |
# File 'lib/ocr-file/document.rb', line 40
#
# Returns the value of attribute save_file_path.
#
# @return [Object] the current value of @save_file_path
def save_file_path
  @save_file_path
end
#start_time ⇒ Object (readonly)
Returns the value of attribute start_time.
40 41 42 |
# File 'lib/ocr-file/document.rb', line 40
#
# Returns the value of attribute start_time.
#
# @return [Object] the current value of @start_time
def start_time
  @start_time
end
Instance Method Details
#close ⇒ Object
122 123 124 125 |
# File 'lib/ocr-file/document.rb', line 122
#
# Clears the folder at @temp_folder_path via OcrFile::FileHelpers.clear_folder,
# unless keep_files? is truthy, in which case nothing is done.
#
# @return [Object, nil] nil when files are kept, otherwise whatever
#   clear_folder returns
def close
  return if keep_files?

  ::OcrFile::FileHelpers.clear_folder(@temp_folder_path)
end
#image? ⇒ Boolean
68 69 70 71 |
# File 'lib/ocr-file/document.rb', line 68
#
# Whether the original file is one of the ACCEPTED_IMAGE_TYPES.
#
# The decision is made on the file's actual extension (case-insensitive)
# rather than on a substring of the whole path, so names such as
# "notes.png.txt" or folders such as "/scans.bmp/" are not mistaken for images.
#
# @return [Boolean] false for PDFs; otherwise true when the extension is accepted
def image?
  return false if pdf?

  extension = File.extname(@original_file_path).downcase.delete_prefix('.')
  ACCEPTED_IMAGE_TYPES.include?(extension)
end
#pdf? ⇒ Boolean
64 65 66 |
# File 'lib/ocr-file/document.rb', line 64
#
# Whether the original file is a PDF.
#
# Compares the file's actual extension (case-insensitive) instead of searching
# the whole path for ".pdf", so "/archive.pdf.d/scan.png" or "report.pdf.txt"
# are not treated as PDFs.
#
# @return [Boolean] true when the extension of @original_file_path is ".pdf"
def pdf?
  File.extname(@original_file_path).downcase == '.pdf'
end
#text? ⇒ Boolean
Treat anything which isn't a PDF or image as text.
74 75 76 |
# File 'lib/ocr-file/document.rb', line 74
#
# Treats anything which is neither a PDF nor an image as text.
#
# @return [Boolean] true when both pdf? and image? are falsy
def text?
  !pdf? && !image?
end
#to_pdf ⇒ Object
Trigger OCR pipeline
79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
# File 'lib/ocr-file/document.rb', line 79
#
# Triggers the OCR pipeline and produces a PDF.
#
# Records @start_time, optionally runs find_best_image_processing (only when
# config[:automatic_reprocess] is set and the input is not text), then
# dispatches on the input type, calls close, records @end_time and finally
# calls print_time.
#
# @return [Object] whatever print_time returns
def to_pdf
  @start_time = Time.now
  find_best_image_processing(save: false) if config[:automatic_reprocess] && !text?

  if pdf?
    ocr_pdf_to_searchable_pdf
  elsif text?
    text_to_pdf
  else # is an image
    ocr_image_to_pdf
  end

  close
  @end_time = Time.now
  print_time
end
#to_s ⇒ Object
108 109 110 111 112 113 114 115 116 117 118 119 120 |
# File 'lib/ocr-file/document.rb', line 108 def to_s @start_time = Time.now return ::OcrFile::FileHelpers.open_text_file(@original_file_path) if text? text = find_best_image_processing(save: false) close @end_time = Time.now print_time text end |
#to_text ⇒ Object
97 98 99 100 101 102 103 104 105 106 |
# File 'lib/ocr-file/document.rb', line 97
#
# Runs the pipeline with saving enabled.
#
# For text inputs the file is read with OcrFile::FileHelpers.open_text_file
# and returned immediately; in that case close, @end_time and print_time are
# skipped. Otherwise find_best_image_processing(save: true) is run, followed
# by close, @end_time and print_time.
#
# @return [Object] the text file contents for text inputs, otherwise whatever
#   print_time returns
def to_text
  @start_time = Time.now
  return ::OcrFile::FileHelpers.open_text_file(@original_file_path) if text?

  find_best_image_processing(save: true)
  close
  @end_time = Time.now
  print_time
end