Module: OcrFile::ImageEngines::PdfEngine

Extended by:
PdfEngine
Included in:
PdfEngine
Defined in:
lib/ocr-file/image_engines/pdf_engine.rb

Constant Summary collapse

PAGE_BREAK =
"\n\r\n"
DEFAULT_PAGE_OPTIONS =
{
  font: 'Helvetica',
  font_size: 5, #8 # 12
  text_x: 20,
  text_y: 800,
  minimum_word: 5,
}

Instance Method Summary collapse

Instance Method Details

#add_page(document, text, options) ⇒ Object



27
28
29
30
31
32
33
# File 'lib/ocr-file/image_engines/pdf_engine.rb', line 27

def add_page(document, text, options)
  canvas = document.pages.add.canvas
  canvas.font(options[:font], size: options[:font_size])
  canvas.text(text, at: [options[:text_x], options[:text_y]])

  document
end

#combine(text, pdf_of_images) ⇒ Object



86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# File 'lib/ocr-file/image_engines/pdf_engine.rb', line 86

def combine(text, pdf_of_images)
  return unless pdf_of_images.is_a?(::HexaPDF::Document)

  if text.is_a?(::HexaPDF::Document)
    pages_of_text = text.pages
  else # Assume raw text with PAGE_BREAK
    pages_of_text = text.split(PAGE_BREAK)
  end

  return unless pages_of_text.size == pdf_of_images.pages.size

  if text.is_a?(::HexaPDF::Document) # Keep the page structure

  else # Just text to embed

  end
end

#extract_images(document, save_path, verbose: false) ⇒ Object



43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/ocr-file/image_engines/pdf_engine.rb', line 43

def extract_images(document, save_path, verbose: false)
  image_paths = []

  ::HexaPDF::CLI::Images.new.send(:each_image, document) do |image, index, pindex, (_x_ppi, _y_ppi)|
    puts "Processing page: #{pindex} ..."
    info = image.info

    if info.writable
      image_filename = "#{index}.#{image.info.extension}"
      image_path = "#{save_path}/#{image_filename}"
      image.write(image_path)

      image_paths << image_path
    elsif command_parser.verbosity_warning?
      puts style("Warning (image #{index}, page #{pindex}): PDF image format not supported for writing", RED)
    end
  end

  image_paths
end

#insert_image(document, image_path, dimensions: nil) ⇒ Object



64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# File 'lib/ocr-file/image_engines/pdf_engine.rb', line 64

def insert_image(document, image_path, dimensions: nil)
  image_processor = OcrFile::ImageEngines::ImageMagick.new(
    image_path: image_path,
    temp_path: @temp_folder_path,
    save_file_path: '',
    config: @config
  )

  if dimensions
    width = dimensions[0]
    height = dimensions[1]
  else
    width = image_processor.width
    height = image_processor.height
  end

  page = document.pages.add([0, 0, width, height])
  page.canvas.image(@image || image_path, at: [0, 0], width: width, height: height)

  document
end

#merge(documents) ⇒ Object



104
105
106
107
108
109
110
111
112
113
114
115
116
# File 'lib/ocr-file/image_engines/pdf_engine.rb', line 104

def merge(documents)
  target = ::HexaPDF::Document.new

  documents.each do |document|
    if document.is_a?(::HexaPDF::Document)
      document.pages.each { |page| target.pages << target.import(page) }
    else # Assume an image
      insert_image(target, document)
    end
  end

  target
end

#open_pdf(file, password: '') ⇒ Object



39
40
41
# File 'lib/ocr-file/image_engines/pdf_engine.rb', line 39

def open_pdf(file, password: '')
  ::HexaPDF::Document.open(file, decryption_opts: { password: password })
end

#pdf_from_text(text, options = DEFAULT_PAGE_OPTIONS) ⇒ Object



16
17
18
19
20
21
22
23
24
25
# File 'lib/ocr-file/image_engines/pdf_engine.rb', line 16

def pdf_from_text(text, options = DEFAULT_PAGE_OPTIONS)
  document = ::HexaPDF::Document.new

  text
    .split(PAGE_BREAK)
    .reject { |line| line.size < options[:minimum_word] }
    .each { |page_text| document = add_page(document, page_text, options) }

  document
end

#save_pdf(document, save_file_path, optimise: true) ⇒ Object



35
36
37
# File 'lib/ocr-file/image_engines/pdf_engine.rb', line 35

def save_pdf(document, save_file_path, optimise: true)
  document.write(save_file_path, optimize: true)
end