Class: Docsplit::ImageExtractor

Inherits:
Object
  • Object
show all
Defined in:
lib/docsplit/image_extractor.rb

Overview

Delegates to GraphicsMagick in order to convert PDF documents into nicely sized images.

Constant Summary collapse

MEMORY_ARGS =
"-limit memory 256MiB -limit map 512MiB"
DEFAULT_FORMAT =
:png
DEFAULT_DENSITY =
'150'

Instance Method Summary collapse

Instance Method Details

#convert(pdf, size, format, previous = nil) ⇒ Object

Convert a single PDF into page images at the specified size and format. If `--rolling` is set, and we have a previous image at a larger size to work with, we simply downsample that image instead of re-rendering the entire PDF. Now we generate one page at a time, a counterintuitive optimization suggested by the GraphicsMagick list that seems to work quite well.



30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# File 'lib/docsplit/image_extractor.rb', line 30

# Convert a single PDF into page images at the given size and format.
#
# pdf      - path of the source PDF; its basename (without extension) prefixes
#            every generated image file name.
# size     - size specification, handed to directory_for and resize_arg.
# format   - image format; used as the output file extension and handed to
#            quality_arg.
# previous - optional size that was already rendered. When given, the images in
#            directory_for(previous) are copied into this size's directory and
#            reprocessed in place with `gm mogrify` instead of rendering the
#            PDF again.
#
# Raises ExtractionFailed, carrying the command's combined stdout/stderr, when
# a GraphicsMagick command does not exit successfully.
def convert(pdf, size, format, previous=nil)
  tempdir     = Dir.mktmpdir
  basename    = File.basename(pdf, File.extname(pdf))
  directory   = directory_for(size)
  # Fall back to the whole document ("1-<page count>") when no page selection is set.
  pages       = @pages || '1-' + Docsplit.extract_length(pdf).to_s
  escaped_pdf = ESCAPE[pdf]
  # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
  FileUtils.mkdir_p(directory) unless File.exist?(directory)
  common      = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
  if previous
    # Rolling mode: reuse the images rendered for the previous size.
    FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
    result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp
    raise ExtractionFailed, result unless $?.success?
  else
    # Render one page per gm invocation; the index handed to gm is page - 1.
    page_list(pages).each do |page|
      out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
      cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1"
      result = `#{cmd}`.chomp
      raise ExtractionFailed, result unless $?.success?
    end
  end
ensure
  # tempdir is still nil if Dir.mktmpdir itself raised; guard so the cleanup
  # does not mask that original error.
  FileUtils.remove_entry_secure(tempdir) if tempdir && File.exist?(tempdir)
end

#extract(pdfs, options) ⇒ Object

Extract a list of PDFs as rasterized page images, according to the configuration in options.



13
14
15
16
17
18
19
20
21
22
23
# File 'lib/docsplit/image_extractor.rb', line 13

# Rasterize one or more PDFs into page images.
#
# pdfs    - a single PDF or a (possibly nested) list of PDFs; it is flattened
#           into @pdfs.
# options - configuration handed to extract_options, which is expected to set
#           up @sizes, @formats and @rolling (defined outside this listing).
#
# For every PDF, each size is converted in every format. When @rolling is
# truthy, the size just processed is passed as +previous+ to the conversions
# of the following size.
def extract(pdfs, options)
  @pdfs = [pdfs].flatten
  extract_options(options)
  @pdfs.each do |document|
    earlier_size = nil
    @sizes.each do |dimensions|
      @formats.each do |image_format|
        convert(document, dimensions, image_format, earlier_size)
      end
      earlier_size = dimensions if @rolling
    end
  end
end