Class: RDig::ContentExtractors::PdfContentExtractor

Inherits:
ContentExtractor show all
Includes:
ExternalAppHelper
Defined in:
lib/rdig/content_extractors/pdf.rb

Overview

Extract text from pdf content.

Requires the pdftotext and pdfinfo utilities from the xpdf-utils package (on Debian and friends, run ‘apt-get install xpdf-utils’).

Instance Method Summary collapse

Methods included from ExternalAppHelper

#as_file, #can_do

Methods inherited from ContentExtractor

#can_do, extractor_instances, extractors, inherited, process

Constructor Details

#initialize(config) ⇒ PdfContentExtractor

Returns a new instance of PdfContentExtractor.



12
13
14
15
16
17
18
19
20
21
22
23
24
# File 'lib/rdig/content_extractors/pdf.rb', line 12

# Builds a new extractor and probes for the external command line tools.
#
# config is passed unchanged to the superclass constructor.
#
# Sets @pattern to the regexp matching the application/pdf mime type and
# stores the program names in @pdftotext and @pdfinfo. Each program is then
# run with -h (stderr folded into stdout); @available is true only when the
# output of every program contains "Copyright 1996". Probing stops at the
# first program that fails the check.
def initialize(config)
  super(config)
  @pattern = /^application\/pdf/
  @pdftotext = 'pdftotext'
  @pdfinfo = 'pdfinfo'
  tools_usable = [@pdftotext, @pdfinfo].all? { |tool| %x{#{tool} -h 2>&1} =~ /Copyright 1996/ }
  @available = tools_usable
end

Instance Method Details

#get_content(path_to_tempfile) ⇒ Object



35
36
37
# File 'lib/rdig/content_extractors/pdf.rb', line 35

# Runs the program named in @pdftotext on the file at path_to_tempfile and
# returns everything the command wrote to standard output as a String.
#
# The command is invoked with "-enc UTF-8" and a trailing "-" as the output
# argument (per the pdftotext manual: UTF-8 output, written to stdout).
#
# NOTE: the path is interpolated between single quotes, so a path that itself
# contains a single quote would break the generated shell command.
def get_content(path_to_tempfile)
  command = "#{@pdftotext} -enc UTF-8 '#{path_to_tempfile}' -"
  `#{command}`
end

#get_title(path_to_tempfile) ⇒ Object

Extracts the title from the PDF metadata (requires pdfinfo). Returns the title, or nil if no title was found.



42
43
44
45
# File 'lib/rdig/content_extractors/pdf.rb', line 42

# Extracts the document title from the metadata printed by the program named
# in @pdfinfo for the file at path_to_tempfile.
#
# Returns the title with surrounding whitespace stripped, or nil when the
# output has no "Title:" line, when that line carries no value, or when
# running/parsing the command raises a StandardError (best effort: a missing
# title must not abort content extraction).
def get_title(path_to_tempfile)
  info = %x{#{@pdfinfo} -enc UTF-8 '#{path_to_tempfile}'}
  # Anchor at the start of a line so "title:" inside another field's value is
  # ignored, and allow only blanks (not newlines) after the colon so an empty
  # Title line cannot swallow the following line as its value.
  found = /^title:[ \t]*(.*)$/i.match(info)
  return nil unless found
  title = found[1].strip
  title.empty? ? nil : title
rescue StandardError
  nil
end

#process(content) ⇒ Object



26
27
28
29
30
31
32
33
# File 'lib/rdig/content_extractors/pdf.rb', line 26

# Extracts text and title from the given pdf content.
#
# content is handed to as_file, which yields a file object; the object's path
# is passed to get_content and get_title.
#
# Returns a Hash with two keys:
#   :content - the result of get_content with surrounding whitespace stripped
#   :title   - whatever get_title returned
def process(content)
  extracted = {}
  as_file(content) do |tempfile|
    extracted.update(
      :content => get_content(tempfile.path).strip,
      :title   => get_title(tempfile.path)
    )
  end
  extracted
end