Class: OcrFile::Document

Inherits:

Object

Object
OcrFile::Document

show all

Defined in: lib/ocr-file/document.rb

Constant Summary collapse

ACCEPTED_IMAGE_TYPES = # TODO: Skewness / text orientation detection. TODO: Better handwriting analysis.

['png', 'jpeg', 'jpg', 'tiff', 'bmp']

PAGE_BREAK = # TODO: Make configurable

"\n\r\n"

# NOTE(review): not referenced by any code visible on this page; presumably the
# effect names that automatic reprocessing tries dropping — confirm against the
# private helpers before relying on this.
EFFECTS_TO_REMOVE =

['', 'norm', 'remove_shadow', 'bw']

# Settings used when no `config:` is passed to Document#initialize. The
# constructor stores the given hash as-is (it is not merged with these
# defaults), so a caller-supplied config has to carry every key it needs.
# NOTE(review): this hash is not frozen and is handed out by reference as the
# default argument — mutating one document's config would change the defaults
# seen by later documents; confirm whether that is intended.
DEFAULT_CONFIG =

{
  # Images from PDF
  filetype: 'png',
  quality: 100,
  dpi: 300,
  # Text to PDF
  font: 'Helvetica',
  font_size: 5, #8 # 12
  text_x: 20,
  text_y: 800,
  minimum_word: 5,
  # Cloud-Vision OCR
  image_annotator: nil, # Needed for Cloud-Vision
  type_of_ocr: OcrFile::OcrEngines::CloudVision::DOCUMENT_TEXT_DETECTION,
  # Passed to find_ocr_engine by the constructor
  ocr_engine: 'tesseract', # 'cloud-vision'
  # Image Pre-Processing
  image_preprocess: true,
  effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'remove_shadow', 'bw'],
  # When truthy, to_pdf runs find_best_image_processing first for non-text sources
  automatic_reprocess: true,
  dimensions: nil, # width, height. Will lock images to these dimensions
  # PDF to Image Processing
  optimise_pdf: true,
  extract_pdf_images: true, # if false will screenshot each PDF page
  temp_filename_prefix: 'image',
  spelling_correction: true,
  keep_files: false,
  # Console Output
  verbose: true,
  timing: true
}

Instance Attribute Summary collapse

#config ⇒ Object readonly

Returns the value of attribute config.
#end_time ⇒ Object readonly

Returns the value of attribute end_time.
#filename ⇒ Object readonly

Returns the value of attribute filename.
#final_save_file ⇒ Object readonly

Returns the value of attribute final_save_file.
#ocr_engine ⇒ Object readonly

Returns the value of attribute ocr_engine.
#original_file_path ⇒ Object readonly

Returns the value of attribute original_file_path.
#save_file_path ⇒ Object readonly

Returns the value of attribute save_file_path.
#start_time ⇒ Object readonly

Returns the value of attribute start_time.

Instance Method Summary collapse

#close ⇒ Object
#image? ⇒ Boolean
#initialize(original_file_path:, save_file_path:, config: DEFAULT_CONFIG) ⇒ Document constructor

save_file_path will also generate a tmp path for tmp files.
#pdf? ⇒ Boolean
#text? ⇒ Boolean

Treat anything which isn't a PDF or image as text.
#to_pdf ⇒ Object

Trigger OCR pipeline.
#to_s ⇒ Object
#to_text ⇒ Object

Constructor Details

#initialize(original_file_path:, save_file_path:, config: DEFAULT_CONFIG) ⇒ `Document`

save_file_path is expected to be a folder path; a tmp path for temporary files will also be generated from it. TODO: Add more input validation.

# File 'lib/ocr-file/document.rb', line 51

# Wraps a source file and works out where its converted output will be written.
#
# @param original_file_path [String] path of the file to convert; the text
#   before the first '.' of its last path component becomes #filename
# @param save_file_path [String] folder the output is written under
# @param config [Hash] settings hash, stored as given (it is not merged with
#   DEFAULT_CONFIG); its :ocr_engine entry is handed to find_ocr_engine
def initialize(original_file_path:, save_file_path:, config: DEFAULT_CONFIG)
  @original_file_path = original_file_path
  @filename = File.basename(original_file_path).split('.').first

  # Sample the clock once so the date and the epoch suffix of the output name
  # always describe the same instant (two separate reads can straddle midnight).
  created_at = Time.now

  @save_file_path = save_file_path
  @final_save_file = "#{@save_file_path}/#{@filename}-#{created_at.strftime('%Y-%m-%d')}-#{created_at.to_i}"

  @config = config
  @ocr_engine = find_ocr_engine(config[:ocr_engine])
end

Instance Attribute Details

#config ⇒ `Object` (readonly)

Returns the value of attribute config.



40
41
42

# File 'lib/ocr-file/document.rb', line 40

# Reader for the settings hash exactly as it was passed to (or defaulted by)
# the constructor.
def config
  @config
end

#end_time ⇒ `Object` (readonly)

Returns the value of attribute end_time.



40
41
42

# File 'lib/ocr-file/document.rb', line 40

# Time recorded when the latest to_pdf / to_s / to_text run finished.
# Not assigned by the constructor, and the text-file shortcut in to_s and
# to_text returns before it is set.
def end_time
  @end_time
end

#filename ⇒ `Object` (readonly)

Returns the value of attribute filename.



40
41
42

# File 'lib/ocr-file/document.rb', line 40

# Name stem derived by the constructor from original_file_path: the last path
# component, cut at its first '.'.
def filename
  @filename
end

#final_save_file ⇒ `Object` (readonly)

Returns the value of attribute final_save_file.



40
41
42

# File 'lib/ocr-file/document.rb', line 40

# Output path stem built by the constructor in the form
# "<save_file_path>/<filename>-<date>-<epoch seconds>" (no extension appended).
def final_save_file
  @final_save_file
end

#ocr_engine ⇒ `Object` (readonly)

Returns the value of attribute ocr_engine.



40
41
42

# File 'lib/ocr-file/document.rb', line 40

# Whatever find_ocr_engine returned for config[:ocr_engine] at construction time.
def ocr_engine
  @ocr_engine
end

#original_file_path ⇒ `Object` (readonly)

Returns the value of attribute original_file_path.



40
41
42

# File 'lib/ocr-file/document.rb', line 40

# Source file path exactly as given to the constructor; pdf? and image? inspect
# it to classify the document.
def original_file_path
  @original_file_path
end

#save_file_path ⇒ `Object` (readonly)

Returns the value of attribute save_file_path.



40
41
42

# File 'lib/ocr-file/document.rb', line 40

# Output folder exactly as given to the constructor; it prefixes final_save_file.
def save_file_path
  @save_file_path
end

#start_time ⇒ `Object` (readonly)

Returns the value of attribute start_time.



40
41
42

# File 'lib/ocr-file/document.rb', line 40

# Time recorded at the very start of the latest to_pdf / to_s / to_text call.
# Not assigned by the constructor.
def start_time
  @start_time
end

Instance Method Details

#close ⇒ `Object`

# File 'lib/ocr-file/document.rb', line 122

# Clears the temporary working folder, unless keep_files? says to keep it.
#
# @return [Object, nil] nil when files are kept, otherwise whatever
#   FileHelpers.clear_folder returns
def close
  ::OcrFile::FileHelpers.clear_folder(@temp_folder_path) unless keep_files?
end

#image? ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/ocr-file/document.rb', line 68

# Whether the source file is an accepted image type, judged by its extension.
#
# Only the final extension of the path is compared (case-insensitively) with
# ACCEPTED_IMAGE_TYPES, so an image extension that merely appears in a folder
# name or in the middle of the file name no longer counts.
#
# @return [Boolean] always false for anything pdf? already claims
def image?
  return false if pdf?

  extension = File.extname(@original_file_path).downcase
  ACCEPTED_IMAGE_TYPES.any? { |type| extension == ".#{type}" }
end

#pdf? ⇒ `Boolean`

Returns:

(Boolean)



64
65
66

# File 'lib/ocr-file/document.rb', line 64

# Whether the source file is a PDF, judged by its extension.
#
# Only the final extension of the path is compared (case-insensitively), so a
# '.pdf' that merely appears in a folder name or in the middle of the file
# name no longer makes the document count as a PDF.
#
# @return [Boolean]
def pdf?
  File.extname(@original_file_path).downcase == '.pdf'
end

#text? ⇒ `Boolean`

Treat anything which isn't a PDF or image as text.

Returns:

(Boolean)



74
75
76

# File 'lib/ocr-file/document.rb', line 74

# Fallback classification: a document that is neither a PDF nor an accepted
# image is treated as text.
#
# @return [Boolean]
def text?
  !(pdf? || image?)
end

#to_pdf ⇒ `Object`

Trigger OCR pipeline

# File 'lib/ocr-file/document.rb', line 79

# Runs the conversion pipeline that matches the source type and records timing.
#
# Steps, in order: stamp start_time; when config[:automatic_reprocess] is set
# and the source is not text, call find_best_image_processing(save: false);
# dispatch to the PDF, text or image converter; call close; stamp end_time;
# call print_time.
#
# @return [Object] whatever print_time returns
def to_pdf
  @start_time = Time.now

  if config[:automatic_reprocess]
    find_best_image_processing(save: false) unless text?
  end

  if pdf?
    ocr_pdf_to_searchable_pdf
  else
    # Not a PDF: either plain text or, failing that, an image
    text? ? text_to_pdf : ocr_image_to_pdf
  end

  close
  @end_time = Time.now
  print_time
end

#to_s ⇒ `Object`

# File 'lib/ocr-file/document.rb', line 108

# Returns the document's text without asking for it to be saved.
#
# Text sources are read straight from original_file_path; for them no cleanup,
# end_time stamp or timing output happens. Everything else goes through
# find_best_image_processing(save: false), followed by close, the end_time
# stamp and print_time.
#
# @return [Object] the text-file contents, or the result of
#   find_best_image_processing
def to_s
  @start_time = Time.now

  if text?
    ::OcrFile::FileHelpers.open_text_file(@original_file_path)
  else
    recognised = find_best_image_processing(save: false)

    close
    @end_time = Time.now
    print_time

    recognised
  end
end

#to_text ⇒ `Object`

# File 'lib/ocr-file/document.rb', line 97

# Runs text extraction, asking find_best_image_processing to save its result
# (save: true), unless the source already is text.
#
# Text sources are read straight from original_file_path and returned; for
# them no cleanup, end_time stamp or timing output happens.
#
# @return [Object] the text-file contents for text sources, otherwise
#   whatever print_time returns
def to_text
  @start_time = Time.now

  if text?
    ::OcrFile::FileHelpers.open_text_file(@original_file_path)
  else
    find_best_image_processing(save: true)
    close

    @end_time = Time.now
    print_time
  end
end

Class: OcrFile::Document

Constant Summary collapse

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(original_file_path:, save_file_path:, config: DEFAULT_CONFIG) ⇒ Document

Instance Attribute Details

#config ⇒ Object (readonly)

#end_time ⇒ Object (readonly)

#filename ⇒ Object (readonly)

#final_save_file ⇒ Object (readonly)

#ocr_engine ⇒ Object (readonly)

#original_file_path ⇒ Object (readonly)

#save_file_path ⇒ Object (readonly)

#start_time ⇒ Object (readonly)

Instance Method Details

#close ⇒ Object

#image? ⇒ Boolean

#pdf? ⇒ Boolean

#text? ⇒ Boolean

#to_pdf ⇒ Object

#to_s ⇒ Object

#to_text ⇒ Object

#initialize(original_file_path:, save_file_path:, config: DEFAULT_CONFIG) ⇒ `Document`

#config ⇒ `Object` (readonly)

#end_time ⇒ `Object` (readonly)

#filename ⇒ `Object` (readonly)

#final_save_file ⇒ `Object` (readonly)

#ocr_engine ⇒ `Object` (readonly)

#original_file_path ⇒ `Object` (readonly)

#save_file_path ⇒ `Object` (readonly)

#start_time ⇒ `Object` (readonly)

#close ⇒ `Object`

#image? ⇒ `Boolean`

#pdf? ⇒ `Boolean`

#text? ⇒ `Boolean`

#to_pdf ⇒ `Object`

#to_s ⇒ `Object`

#to_text ⇒ `Object`