Module: OcrFile::ImageEngines::PdfEngine

Extended by:
PdfEngine
Included in:
PdfEngine
Defined in:
lib/ocr-file/image_engines/pdf_engine.rb

Constant Summary

# Separator between pages in the text handed to pdf_from_text, which splits
# its input on this exact sequence.
PAGE_BREAK = "\n\r\n".freeze

# Default options for rendering text pages. Frozen because this same hash
# object is the default argument of pdf_from_text, so an accidental mutation
# would otherwise leak into every later call.
DEFAULT_PAGE_OPTIONS = {
  font: 'Helvetica',  # font name given to canvas.font in add_page
  font_size: 5,       # other sizes noted here by the original author: 8, 12
  text_x: 20,         # first element of the `at:` position used by add_page
  text_y: 800,        # second element of the `at:` position used by add_page
  minimum_word: 5     # pdf_from_text drops page texts shorter than this many characters
}.freeze

Instance Method Summary

Instance Method Details

#add_page(document, text, options) ⇒ Object



27
28
29
30
31
32
33
# File 'lib/ocr-file/image_engines/pdf_engine.rb', line 27

# Adds one new page to +document+ and draws +text+ on that page's canvas.
#
# @param document [Object] PDF document; `document.pages.add` must return an
#   object exposing `canvas`
# @param text [String] text drawn on the new page
# @param options [Hash] read for :font, :font_size, :text_x and :text_y
# @return [Object] the same +document+ that was passed in
def add_page(document, text, options)
  page_canvas = document.pages.add.canvas

  typeface = options[:font]
  point_size = options[:font_size]
  origin = [options[:text_x], options[:text_y]]

  page_canvas.font(typeface, size: point_size)
  page_canvas.text(text, at: origin)

  document
end

#extract_images(document, save_path, verbose: false) ⇒ Object



43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/ocr-file/image_engines/pdf_engine.rb', line 43

# Writes every writable image found in +document+ into +save_path+ and
# returns the paths of the files that were written.
#
# Images whose `info.writable` is falsy are skipped, and a warning is emitted
# on stderr for each of them. The earlier version of this branch called
# `command_parser`, `style` and `RED`, none of which are defined in this
# module, so it failed with NameError instead of warning.
#
# @param document [Object] document handed to HexaPDF's image iterator
# @param save_path [String] directory prefix for the written files
# @param verbose [Boolean] when true, prints a progress line per image to stdout
# @return [Array<String>] paths of the images written, in iteration order
def extract_images(document, save_path, verbose: false)
  image_paths = []

  # each_image is reached through #send, as before — presumably because it is
  # not part of the public interface of HexaPDF::CLI::Images (confirm).
  ::HexaPDF::CLI::Images.new.send(:each_image, document) do |image, index, pindex, _ppi|
    puts "Processing page: #{pindex} ..." if verbose
    info = image.info

    unless info.writable
      warn "Warning (image #{index}, page #{pindex}): PDF image format not supported for writing"
      next
    end

    image_path = "#{save_path}/#{index}.#{info.extension}"
    image.write(image_path)
    image_paths << image_path
  end

  image_paths
end

#merge(documents) ⇒ Object



64
65
66
67
68
69
70
71
72
# File 'lib/ocr-file/image_engines/pdf_engine.rb', line 64

# Builds a new HexaPDF document containing the pages of all +documents+,
# in the order given.
#
# @param documents [#each] source documents, each exposing `pages`
# @return [HexaPDF::Document] freshly created document holding an imported
#   copy of every source page
def merge(documents)
  merged = ::HexaPDF::Document.new

  documents.each do |source|
    source.pages.each do |page|
      # A page belongs to its source document, so it is imported into the
      # merged document before being appended.
      merged.pages << merged.import(page)
    end
  end

  merged
end

#open_pdf(file, password: '') ⇒ Object



39
40
41
# File 'lib/ocr-file/image_engines/pdf_engine.rb', line 39

# Opens a PDF through HexaPDF, passing +password+ as the decryption password.
#
# @param file [Object] forwarded unchanged to HexaPDF::Document.open
# @param password [String] decryption password; defaults to the empty string
# @return [Object] whatever HexaPDF::Document.open returns
def open_pdf(file, password: '')
  decryption = { password: password }
  ::HexaPDF::Document.open(file, decryption_opts: decryption)
end

#pdf_from_text(text, options = DEFAULT_PAGE_OPTIONS) ⇒ Object



16
17
18
19
20
21
22
23
24
25
# File 'lib/ocr-file/image_engines/pdf_engine.rb', line 16

# Builds a new PDF from plain text, one page per PAGE_BREAK-separated segment.
#
# Segments shorter than the :minimum_word option (measured in characters via
# String#size) are dropped. Caller-supplied options are merged over
# DEFAULT_PAGE_OPTIONS, so a partial hash such as `{ font_size: 12 }` works;
# previously a hash without :minimum_word raised on the `size < nil` comparison.
#
# @param text [String] text whose pages are separated by PAGE_BREAK
# @param options [Hash] overrides for DEFAULT_PAGE_OPTIONS
# @return [HexaPDF::Document] document with one page per kept segment
def pdf_from_text(text, options = DEFAULT_PAGE_OPTIONS)
  page_options = DEFAULT_PAGE_OPTIONS.merge(options)
  document = ::HexaPDF::Document.new

  text
    .split(PAGE_BREAK)
    .reject { |page_text| page_text.size < page_options[:minimum_word] }
    .each { |page_text| document = add_page(document, page_text, page_options) }

  document
end

#save_pdf(document, save_file_path, optimise: true) ⇒ Object



35
36
37
# File 'lib/ocr-file/image_engines/pdf_engine.rb', line 35

# Writes +document+ to +save_file_path+.
#
# @param document [Object] document responding to `write(path, optimize:)`
# @param save_file_path [String] destination path handed to `write`
# @param optimise [Boolean] forwarded as the `optimize:` option of `write`;
#   it used to be ignored because `optimize: true` was hard-coded
# @return [Object] whatever `document.write` returns
def save_pdf(document, save_file_path, optimise: true)
  document.write(save_file_path, optimize: optimise)
end