Class: OcrFile::Document

Inherits:
Object
  • Object
show all
Defined in:
lib/ocr-file/document.rb

Constant Summary collapse

ACCEPTED_IMAGE_TYPES =

TODO: Add skewness / text-orientation detection. TODO: Improve handwriting analysis.

['png', 'jpeg', 'jpg', 'tiff', 'bmp']
PAGE_BREAK =

TODO: Make configurable

"\n\r\n"
# Options used when no `config:` is passed to #initialize. The hash is handed
# on as `options:` to the OCR engine and as the second argument to
# PdfEngine.pdf_from_text.
# NOTE(review): the hash is not frozen and is used directly as the default
# argument, so mutating Document#config on a default-configured instance also
# mutates this constant.
DEFAULT_CONFIG =
{
  # Images from PDF
  filetype: 'png',
  quality: 100,
  dpi: 300,
  # Text to PDF
  font: 'Helvetica',
  font_size: 5, # 8 and 12 are noted as alternatives (presumably values tried earlier)
  text_x: 20, # presumably the text origin on the page — TODO confirm units
  text_y: 800,
  minimum_word: 5, # NOTE(review): meaning not visible here — confirm against PdfEngine
  # Cloud-Vision OCR
  image_annotator: nil, # Needed for Cloud-Vision
  type_of_ocr: OcrFile::OcrEngines::CloudVision::DOCUMENT_TEXT_DETECTION,
  ocr_engine: 'tesseract', # alternative: 'cloud-vision'; resolved via find_ocr_engine in #initialize
  # Image Pre-Processing
  image_preprocess: true,
  effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'remove_shadow', 'bw'],
  # PDF to Image Processing
  optimise_pdf: true, # forwarded as `optimise:` to PdfEngine.save_pdf
  extract_pdf_images: true, # if false will screenshot each PDF page
  temp_filename_prefix: 'image',
  # Console Output
  verbose: true,
}

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(original_file_path:, save_file_path:, config: DEFAULT_CONFIG) ⇒ Document

save_file_path is expected to be a folder path; it is also used to generate a temporary path for temp files. TODO: Add more input validation.



43
44
45
46
47
48
49
50
51
52
53
54
# File 'lib/ocr-file/document.rb', line 43

# Wraps a source file and prepares the path stem under which results are saved.
#
# @param original_file_path [String] path of the PDF, image or text file to process
# @param save_file_path [String] folder in which output files are written
# @param config [Hash] processing options; DEFAULT_CONFIG is used when omitted
def initialize(original_file_path:, save_file_path:, config: DEFAULT_CONFIG)
  @original_file_path = original_file_path
  # File.basename drops the directory part and only the final extension, so
  # 'scans/my.report.pdf' yields 'my.report' instead of being cut at the first
  # dot, and an empty path yields '' rather than raising.
  @filename = File.basename(original_file_path, '.*')

  # Read the clock once so the date and epoch components always describe the
  # same instant (two separate reads could straddle midnight).
  now = Time.now

  @save_file_path = save_file_path
  @final_save_file = "#{@save_file_path}/#{@filename}-#{now.strftime('%Y-%m-%d')}-#{now.to_i}"

  @config = config
  @ocr_engine = find_ocr_engine(config[:ocr_engine])
end

Instance Attribute Details

#configObject (readonly)

Returns the value of attribute config.



34
35
36
# File 'lib/ocr-file/document.rb', line 34

# Returns the options hash this document was constructed with
# (DEFAULT_CONFIG unless a custom `config:` was passed to #initialize).
def config
  @config
end

#filenameObject (readonly)

Returns the value of attribute filename.



34
35
36
# File 'lib/ocr-file/document.rb', line 34

# Returns the source file's name as derived from original_file_path in
# #initialize (directory and extension removed); it forms part of
# final_save_file.
def filename
  @filename
end

#final_save_fileObject (readonly)

Returns the value of attribute final_save_file.



34
35
36
# File 'lib/ocr-file/document.rb', line 34

# Returns the output path stem built in #initialize as
# "<save_file_path>/<filename>-<date>-<epoch seconds>". It carries no
# extension; to_pdf and to_text append ".pdf" / ".txt" when writing.
def final_save_file
  @final_save_file
end

#ocr_engineObject (readonly)

Returns the value of attribute ocr_engine.



34
35
36
# File 'lib/ocr-file/document.rb', line 34

# Returns the engine resolved in #initialize via
# find_ocr_engine(config[:ocr_engine]); it receives the ocr_to_pdf and
# ocr_to_text calls made by the conversion methods.
def ocr_engine
  @ocr_engine
end

#original_file_pathObject (readonly)

Returns the value of attribute original_file_path.



34
35
36
# File 'lib/ocr-file/document.rb', line 34

# Returns the input path exactly as given to #initialize; pdf? and image?
# inspect it to classify the file.
def original_file_path
  @original_file_path
end

#save_file_pathObject (readonly)

Returns the value of attribute save_file_path.



34
35
36
# File 'lib/ocr-file/document.rb', line 34

# Returns the output folder path given to #initialize; final_save_file is
# built beneath it.
def save_file_path
  @save_file_path
end

Instance Method Details

#closeObject



140
141
142
# File 'lib/ocr-file/document.rb', line 140

# Clears the temporary working folder, if one has been set up.
#
# @temp_folder_path is not assigned in #initialize, and the text branches of
# the conversion methods reach #close without calling create_temp_folder, so
# it may still be nil here. In that case there is nothing to clean and the
# helper is not invoked with a nil path.
#
# @return [Object, nil] whatever FileHelpers.clear_folder returns, or nil when
#   no temp folder exists
def close
  return if @temp_folder_path.nil?

  ::OcrFile::FileHelpers.clear_folder(@temp_folder_path)
end

#image?Boolean

Returns:

  • (Boolean)


60
61
62
63
# File 'lib/ocr-file/document.rb', line 60

# Reports whether the source file is an image of a supported type.
#
# The decision is made on the path's actual extension (case-insensitive)
# against ACCEPTED_IMAGE_TYPES. Searching the whole path for ".png" etc. would
# also match names such as "notes.png.txt" or folders such as "my.jpgs/".
#
# @return [Boolean] false for PDFs; otherwise true when the extension is accepted
def image?
  return false if pdf?

  extension = File.extname(@original_file_path).downcase
  ACCEPTED_IMAGE_TYPES.any? { |type| extension == ".#{type}" }
end

#pdf?Boolean

Returns:

  • (Boolean)


56
57
58
# File 'lib/ocr-file/document.rb', line 56

# Reports whether the source file is a PDF, judged by its file extension.
#
# Only the path's real extension is compared (case-insensitive). A substring
# search for ".pdf" would also match "report.pdf.txt" or a file stored inside
# a folder such as "my.pdfs/".
#
# @return [Boolean]
def pdf?
  File.extname(@original_file_path).downcase == '.pdf'
end

#text?Boolean

Treats anything that isn't a PDF or an image as text.

Returns:

  • (Boolean)


66
67
68
# File 'lib/ocr-file/document.rb', line 66

# Fallback classification: a file counts as text when it is neither a PDF
# nor an image. image? is only consulted when pdf? is false.
#
# @return [Boolean]
def text?
  return false if pdf?

  !image?
end

#to_pdfObject

Trigger OCR pipeline



71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# File 'lib/ocr-file/document.rb', line 71

# Runs the conversion pipeline and saves the result as "<final_save_file>.pdf".
#
# * PDF input: every image path extracted from the PDF is passed through
#   process_image and the OCR engine's ocr_to_pdf; the resulting PDFs are
#   merged and saved.
# * Text input: the file contents are turned into a PDF via
#   PdfEngine.pdf_from_text and saved.
# * Image input: delegated to ocr_image_to_pdf.
#
# #close is called afterwards in every case, and its result is returned.
def to_pdf
  if pdf?
    create_temp_folder

    page_pdfs = extract_image_paths_from_pdf(@original_file_path).map do |page_image|
      @ocr_engine.ocr_to_pdf(process_image(page_image), options: @config)
    end
    combined_pdf = OcrFile::ImageEngines::PdfEngine.merge(page_pdfs)

    OcrFile::ImageEngines::PdfEngine.save_pdf(
      combined_pdf, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf]
    )
  elsif text?
    contents = ::OcrFile::FileHelpers.open_text_file(@original_file_path)
    rendered_pdf = OcrFile::ImageEngines::PdfEngine.pdf_from_text(contents, @config)

    OcrFile::ImageEngines::PdfEngine.save_pdf(
      rendered_pdf, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf]
    )
  else # is an image
    ocr_image_to_pdf
  end

  close
end

#to_sObject



117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# File 'lib/ocr-file/document.rb', line 117

# Returns the document's text without writing an output file.
#
# * PDF input: each extracted page image is passed through process_image and
#   the OCR engine's ocr_to_text; every page's text is prefixed with
#   PAGE_BREAK (so the result starts with a page break) and concatenated.
# * Text input: the file contents are returned as read; #close is not called.
# * Image input: ocr_image_to_text(save: false).
#
# For PDF and image input #close is called before the text is returned.
def to_s
  if pdf?
    create_temp_folder
    page_images = extract_image_paths_from_pdf(@original_file_path)

    # Fold the pages into one string, keeping the leading PAGE_BREAK per page.
    page_images.inject('') do |collected, page_image|
      "#{collected}#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(process_image(page_image), options: @config)}"
    end.tap { close }
  elsif text?
    ::OcrFile::FileHelpers.open_text_file(@original_file_path)
  else # is an image
    ocr_image_to_text(save: false).tap { close }
  end
end

#to_textObject



99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# File 'lib/ocr-file/document.rb', line 99

# Runs the conversion pipeline and writes the result to "<final_save_file>.txt".
#
# * PDF input: each extracted page image is passed through process_image and
#   the OCR engine's ocr_to_text; every page's text is appended to the output
#   file followed by PAGE_BREAK.
# * Text input: the file contents are appended to the output file unchanged.
# * Image input: delegated to ocr_image_to_text(save: true).
#
# #close is called afterwards in every case, and its result is returned.
def to_text
  output_path = "#{@final_save_file}.txt"

  if pdf?
    create_temp_folder
    image_paths = extract_image_paths_from_pdf(@original_file_path)

    image_paths.each do |image_path|
      text = @ocr_engine.ocr_to_text(process_image(image_path), options: @config)
      ::OcrFile::FileHelpers.append_file(output_path, "#{text}#{PAGE_BREAK}")
    end
  elsif text?
    # The contents used to be read and then discarded, so text input produced
    # no output at all; persist them like the other branches do.
    text = ::OcrFile::FileHelpers.open_text_file(@original_file_path)
    ::OcrFile::FileHelpers.append_file(output_path, text)
  else # is an image
    ocr_image_to_text(save: true)
  end

  close
end