Class: OcrFile::Document

Inherits:

Object

Object
OcrFile::Document

show all

Defined in: lib/ocr-file/document.rb

Constant Summary collapse

ACCEPTED_IMAGE_TYPES =

# TODO: Skewness / text orientation detection
# TODO: Better handwriting analysis
['png', 'jpeg', 'jpg', 'tiff', 'bmp']

PAGE_BREAK =

# TODO: Make configurable
"\n\r\n"

# Options used when a Document is constructed without an explicit `config:`.
# The whole hash is handed to the OCR engine (`options: @config`) and to
# PdfEngine.pdf_from_text, so keys are grouped by the stage that reads them.
DEFAULT_CONFIG =

{
  # Images from PDF
  filetype: 'png',
  quality: 100,
  dpi: 300,
  # Text to PDF
  font: 'Helvetica',
  font_size: 5, # other sizes noted by the author: 8, 12
  text_x: 20,
  text_y: 800,
  minimum_word: 5,
  # Cloud-Vision OCR
  image_annotator: nil, # Needed for Cloud-Vision
  type_of_ocr: OcrFile::OcrEngines::CloudVision::DOCUMENT_TEXT_DETECTION,
  ocr_engine: 'tesseract', # resolved via find_ocr_engine; 'cloud-vision' is the other value mentioned here
  # Image Pre-Processing
  image_preprocess: true,
  effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'remove_shadow', 'bw'],
  # PDF to Image Processing
  optimise_pdf: true, # forwarded as `optimise:` when the resulting PDF is saved
  extract_pdf_images: true, # if false will screenshot each PDF page
  temp_filename_prefix: 'image',
  # Console Output
  verbose: true,
}

Instance Attribute Summary collapse

#config ⇒ Object readonly

Returns the value of attribute config.
#filename ⇒ Object readonly

Returns the value of attribute filename.
#final_save_file ⇒ Object readonly

Returns the value of attribute final_save_file.
#ocr_engine ⇒ Object readonly

Returns the value of attribute ocr_engine.
#original_file_path ⇒ Object readonly

Returns the value of attribute original_file_path.
#save_file_path ⇒ Object readonly

Returns the value of attribute save_file_path.

Instance Method Summary collapse

#close ⇒ Object
#image? ⇒ Boolean
#initialize(original_file_path:, save_file_path:, config: DEFAULT_CONFIG) ⇒ Document constructor

save_file_path will also generate a tmp path for tmp files.
#pdf? ⇒ Boolean
#text? ⇒ Boolean

Treat anything which isn't a PDF or image as text.
#to_pdf ⇒ Object

Trigger OCR pipeline.
#to_s ⇒ Object
#to_text ⇒ Object

Constructor Details

#initialize(original_file_path:, save_file_path:, config: DEFAULT_CONFIG) ⇒ `Document`

save_file_path is expected to be a folder path; it is also used to generate a tmp path for temporary files. TODO: Add more input validation.

# File 'lib/ocr-file/document.rb', line 43

# Wraps a source file and works out where its converted output will be saved.
#
# @param original_file_path [String] path of the file to convert
# @param save_file_path [String] folder under which the output file is created
# @param config [Hash] conversion options; DEFAULT_CONFIG when omitted
def initialize(original_file_path:, save_file_path:, config: DEFAULT_CONFIG)
  @original_file_path = original_file_path
  # File.basename strips the directory and only the final extension, so
  # "my.report.pdf" keeps its full stem ("my.report") and dot-files or an
  # empty path no longer yield an empty name or raise.
  @filename = File.basename(original_file_path, '.*')

  # Read the clock once so the date and epoch parts of the name always agree.
  now = Time.now

  @save_file_path = save_file_path
  @final_save_file = "#{@save_file_path}/#{@filename}-#{now.strftime('%Y-%m-%d')}-#{now.to_i}"

  @config = config
  @ocr_engine = find_ocr_engine(config[:ocr_engine])
end

Instance Attribute Details

#config ⇒ `Object` (readonly)

Returns the value of attribute config.



34
35
36

# File 'lib/ocr-file/document.rb', line 34

# Returns the options hash given to the constructor (DEFAULT_CONFIG when none was passed).
def config
  @config
end

#filename ⇒ `Object` (readonly)

Returns the value of attribute filename.



34
35
36

# File 'lib/ocr-file/document.rb', line 34

# Returns the name the constructor derived from original_file_path
# (directory and extension removed); it prefixes the output file name.
def filename
  @filename
end

#final_save_file ⇒ `Object` (readonly)

Returns the value of attribute final_save_file.



34
35
36

# File 'lib/ocr-file/document.rb', line 34

# Returns the extension-less output path built by the constructor:
# "<save_file_path>/<filename>-<date>-<epoch seconds>". Callers append
# ".pdf" or ".txt" when writing.
def final_save_file
  @final_save_file
end

#ocr_engine ⇒ `Object` (readonly)

Returns the value of attribute ocr_engine.



34
35
36

# File 'lib/ocr-file/document.rb', line 34

# Returns the engine object the constructor obtained from
# find_ocr_engine(config[:ocr_engine]); it receives the ocr_to_pdf / ocr_to_text calls.
def ocr_engine
  @ocr_engine
end

#original_file_path ⇒ `Object` (readonly)

Returns the value of attribute original_file_path.



34
35
36

# File 'lib/ocr-file/document.rb', line 34

# Returns the source file path exactly as it was passed to the constructor.
def original_file_path
  @original_file_path
end

#save_file_path ⇒ `Object` (readonly)

Returns the value of attribute save_file_path.



34
35
36

# File 'lib/ocr-file/document.rb', line 34

# Returns the output folder path exactly as it was passed to the constructor.
def save_file_path
  @save_file_path
end

Instance Method Details

#close ⇒ `Object`



140
141
142

# File 'lib/ocr-file/document.rb', line 140

# Clears the temporary working folder used during conversion.
#
# Does nothing (and returns nil) when no temp folder path has been recorded,
# e.g. on code paths that never call create_temp_folder, so the helper is
# never invoked with nil.
#
# @return [Object, nil] whatever FileHelpers.clear_folder returns, or nil when skipped
def close
  return if @temp_folder_path.nil?

  ::OcrFile::FileHelpers.clear_folder(@temp_folder_path)
end

#image? ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/ocr-file/document.rb', line 60

# Whether the source file is one of the accepted image types.
#
# Only the file's actual extension is compared (case-insensitively) against
# ACCEPTED_IMAGE_TYPES, so ".png" appearing in a directory name or in the
# middle of a file name ("scan.png.txt") no longer counts as an image.
#
# @return [Boolean]
def image?
  return false if pdf?

  extension = File.extname(@original_file_path).delete_prefix('.').downcase
  ACCEPTED_IMAGE_TYPES.include?(extension)
end

#pdf? ⇒ `Boolean`

Returns:

(Boolean)



56
57
58

# File 'lib/ocr-file/document.rb', line 56

# Whether the source file is a PDF.
#
# Compares the file's actual extension (case-insensitively) rather than
# searching the whole path, so ".pdf" in a directory name or in the middle of
# a file name ("report.pdf.txt") is not mistaken for a PDF.
#
# @return [Boolean]
def pdf?
  File.extname(@original_file_path).downcase == '.pdf'
end

#text? ⇒ `Boolean`

Treat anything which isn't a PDF or image as text.

Returns:

(Boolean)



66
67
68

# File 'lib/ocr-file/document.rb', line 66

# Fallback classification: a file counts as text when it is neither a PDF
# nor an accepted image.
#
# @return [Boolean]
def text?
  return false if pdf?

  !image?
end

#to_pdf ⇒ `Object`

Trigger OCR pipeline

# File 'lib/ocr-file/document.rb', line 71

# Runs the conversion pipeline and writes "<final_save_file>.pdf".
#
# * PDF input: every page image is processed and OCR'd to a PDF, the page PDFs
#   are merged, and the merged document is saved.
# * Text input: the file content is rendered to a PDF and saved.
# * Image input: delegated to ocr_image_to_pdf.
#
# `close` is always called, including when a step raises, so temporary files
# are not left behind after a failed conversion.
#
# @return [Object] the result of `close`
def to_pdf
  finished = false

  if pdf?
    create_temp_folder

    page_pdfs = extract_image_paths_from_pdf(@original_file_path).map do |image_path|
      @ocr_engine.ocr_to_pdf(process_image(image_path), options: @config)
    end

    merged_pdf = OcrFile::ImageEngines::PdfEngine.merge(page_pdfs)

    OcrFile::ImageEngines::PdfEngine
      .save_pdf(merged_pdf, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
  elsif text?
    text = ::OcrFile::FileHelpers.open_text_file(@original_file_path)
    pdf_file = OcrFile::ImageEngines::PdfEngine.pdf_from_text(text, @config)

    OcrFile::ImageEngines::PdfEngine
      .save_pdf(pdf_file, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
  else # is an image
    ocr_image_to_pdf
  end

  # Mark completion before the normal cleanup so the ensure clause does not
  # run `close` a second time, and so the method still returns its result.
  finished = true
  close
ensure
  close unless finished
end

#to_s ⇒ `Object`

# File 'lib/ocr-file/document.rb', line 117

# Returns the document's text without writing an output file.
#
# * Text input: the file content is returned as-is; no cleanup is performed.
# * PDF input: every page image is processed and OCR'd; each page's text is
#   preceded by PAGE_BREAK in the result.
# * Image input: delegated to ocr_image_to_text(save: false).
#
# For PDF and image input `close` always runs, including when OCR raises, so
# temporary files are not left behind.
#
# @return [String] the text (for image input, whatever ocr_image_to_text returns)
def to_s
  return ::OcrFile::FileHelpers.open_text_file(@original_file_path) if text?

  begin
    if pdf?
      create_temp_folder

      # Build every page chunk first and join once instead of re-copying the
      # accumulated string for each page.
      page_texts = extract_image_paths_from_pdf(@original_file_path).map do |image_path|
        "#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(process_image(image_path), options: @config)}"
      end

      page_texts.join
    else # is an image
      ocr_image_to_text(save: false)
    end
  ensure
    close
  end
end

#to_text ⇒ `Object`

# File 'lib/ocr-file/document.rb', line 99

# Runs OCR and writes the recognised text to "<final_save_file>.txt".
#
# * PDF input: every page image is processed and OCR'd; each page's text
#   followed by PAGE_BREAK is appended to the output file.
# * Text input: the source file is read (see the review note below).
# * Image input: delegated to ocr_image_to_text(save: true).
#
# `close` is always called, including when a step raises, so temporary files
# are not left behind after a failed conversion.
#
# @return [Object] the result of `close`
def to_text
  finished = false

  if pdf?
    create_temp_folder
    output_path = "#{@final_save_file}.txt"

    extract_image_paths_from_pdf(@original_file_path).each do |image_path|
      page_text = @ocr_engine.ocr_to_text(process_image(image_path), options: @config)
      ::OcrFile::FileHelpers.append_file(output_path, "#{page_text}#{PAGE_BREAK}")
    end
  elsif text?
    # NOTE(review): the content is read and then discarded -- nothing is
    # written for text input. Behaviour kept as-is; confirm whether the text
    # should be saved to "<final_save_file>.txt".
    ::OcrFile::FileHelpers.open_text_file(@original_file_path)
  else # is an image
    ocr_image_to_text(save: true)
  end

  # Mark completion before the normal cleanup so the ensure clause does not
  # run `close` a second time, and so the method still returns its result.
  finished = true
  close
ensure
  close unless finished
end

Class: OcrFile::Document

Constant Summary collapse

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(original_file_path:, save_file_path:, config: DEFAULT_CONFIG) ⇒ Document

Instance Attribute Details

#config ⇒ Object (readonly)

#filename ⇒ Object (readonly)

#final_save_file ⇒ Object (readonly)

#ocr_engine ⇒ Object (readonly)

#original_file_path ⇒ Object (readonly)

#save_file_path ⇒ Object (readonly)

Instance Method Details

#close ⇒ Object

#image? ⇒ Boolean

#pdf? ⇒ Boolean

#text? ⇒ Boolean

#to_pdf ⇒ Object

#to_s ⇒ Object

#to_text ⇒ Object

#initialize(original_file_path:, save_file_path:, config: DEFAULT_CONFIG) ⇒ `Document`

#config ⇒ `Object` (readonly)

#filename ⇒ `Object` (readonly)

#final_save_file ⇒ `Object` (readonly)

#ocr_engine ⇒ `Object` (readonly)

#original_file_path ⇒ `Object` (readonly)

#save_file_path ⇒ `Object` (readonly)

#close ⇒ `Object`

#image? ⇒ `Boolean`

#pdf? ⇒ `Boolean`

#text? ⇒ `Boolean`

#to_pdf ⇒ `Object`

#to_s ⇒ `Object`

#to_text ⇒ `Object`