Class: Docsplit::InfoExtractor

Inherits:

Object

Object
Docsplit::InfoExtractor

show all

Defined in: lib/docsplit/info_extractor.rb

Overview

Delegates to pdfinfo in order to extract information about a PDF file.

Constant Summary collapse

MATCHERS = Regex matchers for different bits of information.

# Maps each metadata field name to the pattern that finds its labelled line
# in the text produced by `pdfinfo`. Capture group 1 is the rest of that line
# (everything up to the next newline). `^` anchors at the start of any line,
# so the patterns work against the whole multi-line output at once.
# NOTE(review): `\s+` also matches "\n", so a label whose value is empty could
# end up capturing the following line instead -- confirm against real
# pdfinfo output.
{
  author: /^Author:\s+([^\n]+)/,
  date: /^CreationDate:\s+([^\n]+)/,
  creator: /^Creator:\s+([^\n]+)/,
  keywords: /^Keywords:\s+([^\n]+)/,
  producer: /^Producer:\s+([^\n]+)/,
  subject: /^Subject:\s+([^\n]+)/,
  title: /^Title:\s+([^\n]+)/,
  length: /^Pages:\s+([^\n]+)/ # page count; #extract_all converts this one with to_i
}.freeze

Instance Method Summary collapse

#extract(key, pdfs, opts) ⇒ Object

Pull out a single datum from a pdf.
#extract_all(pdfs, _opts) ⇒ Object

Instance Method Details

#extract(key, pdfs, opts) ⇒ `Object`

Pull out a single datum from a pdf.



17
18
19

# File 'lib/docsplit/info_extractor.rb', line 17

# Looks up one metadata field for a PDF.
#
# Runs the full extraction via #extract_all and returns only the entry
# stored under +key+; the result is nil when that entry is absent.
def extract(key, pdfs, opts)
  metadata = extract_all(pdfs, opts)
  metadata[key]
end

#extract_all(pdfs, _opts) ⇒ `Object`

Raises:

(ExtractionFailed)

# File 'lib/docsplit/info_extractor.rb', line 21

# Runs `pdfinfo` on a PDF and collects every field that MATCHERS recognises.
#
# @param pdfs a single path or a (possibly nested) list of paths; only the
#   first entry after flattening is inspected
# @param _opts unused by this method
# @return [Hash] the matched fields keyed by MATCHERS key; fields with no
#   match are left out, and :length is converted with to_i
# @raise [ExtractionFailed] when pdfinfo does not exit successfully; the
#   message is its combined stdout/stderr
def extract_all(pdfs, _opts)
  pdf = [pdfs].flatten.first
  result = `pdfinfo #{ESCAPE[pdf]} 2>&1`.chomp
  # `success?` is nil (not an error) when the child was killed by a signal;
  # `exitstatus` is nil in that case, so `exitstatus.nonzero?` would blow up
  # with NoMethodError instead of reporting the failed extraction.
  raise ExtractionFailed, result unless $?.success?

  # Output that is not valid in its current encoding is re-read as raw bytes
  # and converted to UTF-8, dropping whatever cannot be converted.
  unless result.valid_encoding?
    result.encode!('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')
  end

  MATCHERS.each_with_object({}) do |(field, pattern), info|
    value = result[pattern, 1]
    next unless value

    info[field] = field == :length ? value.to_i : value
  end
end