Class: OcrFile::Document

Inherits:
Object
  • Object
show all
Defined in:
lib/ocr-file/document.rb

Constant Summary collapse

ACCEPTED_IMAGE_TYPES =

TODO: Skewness / text orientation detection. TODO: Better handwriting analysis.

['png', 'jpeg', 'jpg', 'tiff', 'bmp']
PAGE_BREAK =

TODO: Make configurable

"\n\r\n"
# Four effect-name strings, including the empty string. 'remove_shadow' and 'bw'
# are also the last two entries of DEFAULT_CONFIG[:effects].
# NOTE(review): the code that consumes this list is not shown here; presumably
# automatic reprocessing retries OCR with these effects dropped - confirm.
EFFECTS_TO_REMOVE =
['', 'norm', 'remove_shadow', 'bw']
# Settings used when #initialize is called without a +config:+ argument (this
# constant is that keyword's default value).
#
# NOTE(review): the hash is not frozen and #initialize stores whatever object it
# is given without copying it, so any in-place change made through
# Document#config while the default is in use also changes this constant for
# later documents - confirm whether that sharing is intended.
DEFAULT_CONFIG =
{
  # Images from PDF
  filetype: 'png',
  quality: 100,
  dpi: 300,
  # Text to PDF
  font: 'Helvetica',
  font_size: 5, #8 # 12
  text_x: 20,
  text_y: 800,
  minimum_word: 5,
  # Cloud-Vision OCR
  image_annotator: nil, # Needed for Cloud-Vision
  type_of_ocr: OcrFile::OcrEngines::CloudVision::DOCUMENT_TEXT_DETECTION,
  ocr_engine: 'tesseract', # 'cloud-vision'
  # Image Pre-Processing
  image_preprocess: true,
  effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'remove_shadow', 'bw'],
  automatic_reprocess: true,
  dimensions: nil, # width, height. Will lock images to these dimensions
  # PDF to Image Processing
  optimise_pdf: true,
  extract_pdf_images: true, # if false will screenshot each PDF page
  temp_filename_prefix: 'image',
  spelling_correction: true,
  keep_files: false,
  # Console Output
  verbose: true,
  timing: true
}

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(original_file_path:, save_file_path:, config: DEFAULT_CONFIG) ⇒ Document

save_file_path is expected to be a folder path; it will also be used to generate a tmp path for tmp files. TODO: Add in more input validation.



51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/ocr-file/document.rb', line 51

# Wraps a source file and works out the name its output will be saved under.
#
# @param original_file_path [String] path to the PDF, image or text file to process
# @param save_file_path [String] folder the output is written under
# @param config [Hash] settings; DEFAULT_CONFIG when omitted. The object is
#   stored as given - it is neither copied nor merged with the defaults.
def initialize(original_file_path:, save_file_path:, config: DEFAULT_CONFIG)
  @original_file_path = original_file_path
  # Strip the directory and only the final extension, so "report.v2.pdf" keeps
  # its "report.v2" stem rather than being cut at the first dot.
  @filename = File.basename(original_file_path, '.*')

  # Sample the clock once so the date and epoch parts of the name always agree.
  created_at = Time.now

  @save_file_path = save_file_path
  @final_save_file = "#{@save_file_path}/#{@filename}-#{created_at.strftime('%Y-%m-%d')}-#{created_at.to_i}"

  @config = config
  @ocr_engine = find_ocr_engine(config[:ocr_engine])
end

Instance Attribute Details

#config ⇒ Object (readonly)

Returns the value of attribute config.



40
41
42
# File 'lib/ocr-file/document.rb', line 40

# Reader for the settings object stored by #initialize.
#
# @return [Object] whatever was passed as +config:+ (DEFAULT_CONFIG, a Hash,
#   when the argument was omitted)
def config
  @config
end

#end_time ⇒ Object (readonly)

Returns the value of attribute end_time.



40
41
42
# File 'lib/ocr-file/document.rb', line 40

# Reader for the completion timestamp of the last run.
#
# @return [Time, nil] assigned Time.now near the end of #to_pdf, #to_s and
#   #to_text; nil until one of them has reached that point. #to_s and #to_text
#   return early for text inputs without assigning it.
def end_time
  @end_time
end

#filename ⇒ Object (readonly)

Returns the value of attribute filename.



40
41
42
# File 'lib/ocr-file/document.rb', line 40

# Reader for the name that #initialize derives from original_file_path
# (directory part and extension removed).
#
# @return [String]
def filename
  @filename
end

#final_save_file ⇒ Object (readonly)

Returns the value of attribute final_save_file.



40
41
42
# File 'lib/ocr-file/document.rb', line 40

# Reader for the output path stem built in #initialize:
# "<save_file_path>/<filename>-<date>-<epoch seconds>", with no file extension.
#
# @return [String]
def final_save_file
  @final_save_file
end

#ocr_engine ⇒ Object (readonly)

Returns the value of attribute ocr_engine.



40
41
42
# File 'lib/ocr-file/document.rb', line 40

# Reader for the engine selected in #initialize, i.e. the result of
# find_ocr_engine(config[:ocr_engine]).
#
# @return [Object] concrete type depends on find_ocr_engine, which is not
#   shown on this page
def ocr_engine
  @ocr_engine
end

#original_file_path ⇒ Object (readonly)

Returns the value of attribute original_file_path.



40
41
42
# File 'lib/ocr-file/document.rb', line 40

# Reader for the source path exactly as passed to #initialize; #pdf? and
# #image? inspect it to classify the input.
#
# @return [String]
def original_file_path
  @original_file_path
end

#save_file_path ⇒ Object (readonly)

Returns the value of attribute save_file_path.



40
41
42
# File 'lib/ocr-file/document.rb', line 40

# Reader for the output folder exactly as passed to #initialize; it is the
# leading component of #final_save_file.
#
# @return [String]
def save_file_path
  @save_file_path
end

#start_time ⇒ Object (readonly)

Returns the value of attribute start_time.



40
41
42
# File 'lib/ocr-file/document.rb', line 40

# Reader for the start timestamp of the last run.
#
# @return [Time, nil] assigned Time.now as the first step of #to_pdf, #to_s
#   and #to_text; nil before any of them has been called
def start_time
  @start_time
end

Instance Method Details

#close ⇒ Object



122
123
124
125
# File 'lib/ocr-file/document.rb', line 122

# Empties the temporary working folder unless keep_files? says to retain it.
#
# @return [Object, nil] the result of FileHelpers.clear_folder, or nil when
#   files are kept
def close
  ::OcrFile::FileHelpers.clear_folder(@temp_folder_path) unless keep_files?
end

#image? ⇒ Boolean

Returns:

  • (Boolean)


68
69
70
71
# File 'lib/ocr-file/document.rb', line 68

# Whether the original file is one of the accepted image formats.
#
# @return [Boolean] false for PDFs; otherwise true when the file's extension
#   (any letter case) is listed in ACCEPTED_IMAGE_TYPES
def image?
  return false if pdf?

  # Compare the actual extension: a substring search would also accept paths
  # that merely contain ".png" etc. in a directory or in the middle of a name.
  extension = File.extname(@original_file_path).downcase
  ACCEPTED_IMAGE_TYPES.any? { |type| extension == ".#{type}" }
end

#pdf? ⇒ Boolean

Returns:

  • (Boolean)


64
65
66
# File 'lib/ocr-file/document.rb', line 64

# Whether the original file is a PDF, judged by its extension.
#
# @return [Boolean] true when the extension is ".pdf" in any letter case
def pdf?
  # Check the real extension rather than searching the whole path, so ".pdf"
  # inside a directory name or mid-filename is not mistaken for a PDF.
  File.extname(@original_file_path).downcase == '.pdf'
end

#text? ⇒ Boolean

Treat anything which isn't a PDF or an image as text.

Returns:

  • (Boolean)


74
75
76
# File 'lib/ocr-file/document.rb', line 74

# Fallback classification: an input counts as text when it is neither a PDF
# nor an accepted image.
#
# @return [Boolean]
def text?
  !(pdf? || image?)
end

#to_pdf ⇒ Object

Trigger OCR pipeline



79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# File 'lib/ocr-file/document.rb', line 79

# Runs the OCR pipeline and produces a PDF, choosing the conversion by input
# type, then cleans up and reports timing.
#
# Sets @start_time first and @end_time after cleanup.
#
# @return [Object] whatever print_time returns
def to_pdf
  @start_time = Time.now

  # Optional pre-pass for non-text inputs; its return value is not used here.
  if config[:automatic_reprocess]
    find_best_image_processing(save: false) unless text?
  end

  case
  when pdf?  then ocr_pdf_to_searchable_pdf
  when text? then text_to_pdf
  else            ocr_image_to_pdf # anything left is an image
  end

  close

  @end_time = Time.now
  print_time
end

#to_s ⇒ Object



108
109
110
111
112
113
114
115
116
117
118
119
120
# File 'lib/ocr-file/document.rb', line 108

def to_s
  @start_time = Time.now
  return ::OcrFile::FileHelpers.open_text_file(@original_file_path) if text?

  text = find_best_image_processing(save: false)

  close

  @end_time = Time.now
  print_time

  text
end

#to_text ⇒ Object



97
98
99
100
101
102
103
104
105
106
# File 'lib/ocr-file/document.rb', line 97

# Extracts the document's text with saving enabled.
#
# Text inputs are read straight from disk and returned immediately (no
# cleanup, no @end_time, no timing output). Other inputs go through
# find_best_image_processing(save: true), followed by cleanup and timing.
#
# @return [Object] the file contents for text inputs, otherwise whatever
#   print_time returns
def to_text
  @start_time = Time.now

  if text?
    ::OcrFile::FileHelpers.open_text_file(@original_file_path)
  else
    find_best_image_processing(save: true)
    close

    @end_time = Time.now
    print_time
  end
end