Class: OcrFile::Document

Inherits:
Object
  • Object
show all
Defined in:
lib/ocr-file/document.rb

Constant Summary collapse

# File extensions (without the leading dot) that #image? accepts as images.
# Frozen so this shared constant cannot be mutated at runtime.
ACCEPTED_IMAGE_TYPES =
%w[png jpeg jpg tiff bmp].freeze
PAGE_BREAK =

TODO: Make configurable

"\n\r\n"
# Default options for a Document; used when no `config:` is passed to #initialize.
# The same hash is also handed to the OCR/PDF engines as their options.
# NOTE(review): the hash is not frozen and is shared by every Document built
# with the default — confirm nothing downstream mutates it.
DEFAULT_CONFIG =
{
  # Images from PDF
  filetype: 'png',
  quality: 100,
  dpi: 300,
  # Text to PDF
  font: 'Helvetica',
  font_size: 5, # other values noted by the author: 8, 12
  text_x: 20,
  text_y: 800,
  minimum_word: 5,
  # Cloud-Vision OCR
  image_annotator: nil, # Needed for Cloud-Vision
  type_of_ocr: OcrFile::OcrEngines::CloudVision::DOCUMENT_TEXT_DETECTION,
  ocr_engine: 'tesseract', # alternative noted by the author: 'cloud-vision'; resolved via find_ocr_engine in #initialize
  # Image Pre-Processing
  # NOTE(review): the key below has a doubled "pre" — confirm consumers read this exact spelling
  image_pre_preprocess: true,
  effects: ['bw', 'norm'],
  threshold: 0.25,
  # PDF to Image Processing
  optimise_pdf: true, # forwarded as `optimise:` to PdfEngine.save_pdf
  extract_pdf_images: true, # if false will screenshot each PDF page
  temp_filename_prefix: 'image',
  # Console Output
  verbose: true,
}

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(original_file_path:, save_file_path:, config: DEFAULT_CONFIG) ⇒ Document

save_file_path is expected to be a folder path; a temporary path for temp files will also be generated from it. TODO: Add more input validation.



41
42
43
44
45
46
47
48
49
50
51
52
# File 'lib/ocr-file/document.rb', line 41

# Builds a document wrapper around the file to be converted.
#
# @param original_file_path [String] path of the PDF, image or text file to read
# @param save_file_path [String] folder path under which output files are written
# @param config [Hash] options; see DEFAULT_CONFIG for the keys used by default
def initialize(original_file_path:, save_file_path:, config: DEFAULT_CONFIG)
  @original_file_path = original_file_path
  # Drop the directory and only the final extension, so "report.v2.pdf"
  # yields "report.v2" instead of being cut at the first dot.
  @filename = File.basename(original_file_path, '.*')

  # Sample the clock once so the date and epoch parts of the name always agree.
  now = Time.now
  date = now.strftime('%Y-%m-%d')

  @save_file_path = save_file_path
  # Extension-less output path; callers append ".pdf" / ".txt".
  @final_save_file = "#{@save_file_path}/#{@filename}-#{date}-#{now.to_i}"

  @config = config
  @ocr_engine = find_ocr_engine(config[:ocr_engine])
end

Instance Attribute Details

#config ⇒ Object (readonly)

Returns the value of attribute config.



32
33
34
# File 'lib/ocr-file/document.rb', line 32

# Options hash given to the constructor (DEFAULT_CONFIG when none was passed).
def config
  @config
end

#filename ⇒ Object (readonly)

Returns the value of attribute filename.



32
33
34
# File 'lib/ocr-file/document.rb', line 32

# Name of the original file with its directory and extension removed,
# as computed in the constructor.
def filename
  @filename
end

#final_save_file ⇒ Object (readonly)

Returns the value of attribute final_save_file.



32
33
34
# File 'lib/ocr-file/document.rb', line 32

# Extension-less output path built in the constructor from save_file_path,
# filename, the current date and the epoch seconds; ".pdf" / ".txt" is
# appended where the output is written.
def final_save_file
  @final_save_file
end

#ocr_engine ⇒ Object (readonly)

Returns the value of attribute ocr_engine.



32
33
34
# File 'lib/ocr-file/document.rb', line 32

# OCR engine chosen in the constructor via find_ocr_engine(config[:ocr_engine]).
def ocr_engine
  @ocr_engine
end

#original_file_path ⇒ Object (readonly)

Returns the value of attribute original_file_path.



32
33
34
# File 'lib/ocr-file/document.rb', line 32

# Path of the input file, exactly as passed to the constructor.
def original_file_path
  @original_file_path
end

#save_file_path ⇒ Object (readonly)

Returns the value of attribute save_file_path.



32
33
34
# File 'lib/ocr-file/document.rb', line 32

# Output folder path, exactly as passed to the constructor.
def save_file_path
  @save_file_path
end

Instance Method Details

#close ⇒ Object



134
135
136
# File 'lib/ocr-file/document.rb', line 134

# Hands the temp folder path to FileHelpers.clear_folder and returns its result.
# NOTE(review): @temp_folder_path is assigned outside this view (presumably by
# create_temp_folder) — confirm it is set before this is called.
def close
  helpers = ::OcrFile::FileHelpers
  helpers.clear_folder(@temp_folder_path)
end

#image? ⇒ Boolean

Returns:

  • (Boolean)


58
59
60
61
# File 'lib/ocr-file/document.rb', line 58

# Whether the original file is an accepted image type, judged by its extension.
#
# Only the final extension is compared, case-insensitively, so "photo.PNG"
# counts as an image while "notes.png.txt" or a ".png" inside a directory
# name does not.
#
# @return [Boolean] always false for PDFs
def image?
  return false if pdf?

  extension = File.extname(@original_file_path).downcase
  ACCEPTED_IMAGE_TYPES.any? { |type| extension == ".#{type}" }
end

#pdf? ⇒ Boolean

Returns:

  • (Boolean)


54
55
56
# File 'lib/ocr-file/document.rb', line 54

# Whether the original file is a PDF, judged by its extension.
#
# Only the final extension is compared, case-insensitively, so "scan.PDF"
# is recognised and "notes.pdf.txt" is not.
#
# @return [Boolean]
def pdf?
  File.extname(@original_file_path).downcase == '.pdf'
end

#text? ⇒ Boolean

Treat anything which isn't a PDF or an image as text.

Returns:

  • (Boolean)


64
65
66
# File 'lib/ocr-file/document.rb', line 64

# Whether the original file should be handled as plain text: true exactly
# when it is neither a PDF nor an image.
#
# @return [Boolean]
def text?
  return false if pdf?

  !image?
end

#to_pdf ⇒ Object



68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# File 'lib/ocr-file/document.rb', line 68

# Converts the original file to a PDF saved at "#{final_save_file}.pdf".
#
# * PDF input: page images are extracted into a temp folder, each one is run
#   through the OCR engine's ocr_to_pdf, and the results are merged and saved.
# * Text input: the file's text is rendered with PdfEngine.pdf_from_text and saved.
# * Anything else is treated as an image and delegated to ocr_image_to_pdf.
#
# @return [Object] for PDFs whatever #close returns; otherwise the result of
#   the save / image helper
def to_pdf
  if pdf?
    create_temp_folder

    begin
      image_paths = extract_image_paths_from_pdf(@original_file_path)

      pdfs_to_merge = image_paths.map do |image_path|
        @ocr_engine.ocr_to_pdf(image_path, options: @config)
      end

      merged_pdf = OcrFile::ImageEngines::PdfEngine.merge(pdfs_to_merge)

      OcrFile::ImageEngines::PdfEngine
        .save_pdf(merged_pdf, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
    ensure
      # Clear the temp folder even when OCR, merging or saving raises,
      # so a failed run does not leave extracted page images behind.
      cleanup_result = close
    end

    # Keep returning #close's result on success, as before.
    cleanup_result
  elsif text?
    text = ::OcrFile::FileHelpers.open_text_file(@original_file_path)
    pdf_file = OcrFile::ImageEngines::PdfEngine.pdf_from_text(text, @config)

    OcrFile::ImageEngines::PdfEngine
      .save_pdf(pdf_file, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
  else # is an image
    ocr_image_to_pdf
  end
end

#to_s ⇒ Object



114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# File 'lib/ocr-file/document.rb', line 114

def to_s
  if pdf?
    create_temp_folder
    image_paths = extract_image_paths_from_pdf(@original_file_path)

    text = ''

    image_paths.each do |image_path|
      text = "#{text}#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(image_path, options: @config)}"
    end

    close
    text
  elsif text?
    ::OcrFile::FileHelpers.open_text_file(@original_file_path)
  else # is an image
    ocr_image_to_text(save: false)
  end
end

#to_text ⇒ Object



96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# File 'lib/ocr-file/document.rb', line 96

# Extracts the document's text.
#
# * PDF input: page images are extracted into a temp folder, each one is
#   OCR'd, and its text followed by PAGE_BREAK is appended to
#   "#{final_save_file}.txt".
# * Text input: the file's contents are read and returned.
#   NOTE(review): nothing is written in this branch, unlike the PDF branch —
#   confirm that is intended.
# * Anything else is treated as an image and delegated to
#   ocr_image_to_text(save: true).
#
# @return [Object] for PDFs whatever #close returns; otherwise the helper's result
def to_text
  if pdf?
    create_temp_folder

    begin
      image_paths = extract_image_paths_from_pdf(@original_file_path)

      image_paths.each do |image_path|
        text = @ocr_engine.ocr_to_text(image_path, options: @config)
        ::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", "#{text}#{PAGE_BREAK}")
      end
    ensure
      # Clear the temp folder even when extraction, OCR or the file append raises.
      cleanup_result = close
    end

    # Keep returning #close's result on success, as before.
    cleanup_result
  elsif text?
    ::OcrFile::FileHelpers.open_text_file(@original_file_path)
  else # is an image
    ocr_image_to_text(save: true)
  end
end