Class: RDig::ContentExtractors::PdfContentExtractor

Inherits:

Object
ContentExtractor
RDig::ContentExtractors::PdfContentExtractor

show all

Includes: ExternalAppHelper

Defined in: lib/rdig/content_extractors/pdf.rb

Overview

Extract text from pdf content.

Requires the pdftotext and pdfinfo utilities from the xpdf-utils package (on Debian and friends, run ‘apt-get install xpdf-utils’).

Instance Method Summary collapse

#get_content(path_to_tempfile) ⇒ Object
#get_title(path_to_tempfile) ⇒ Object

Extracts the title from the PDF metadata (requires pdfinfo). Returns the title, or nil if no title was found.
#initialize(config) ⇒ PdfContentExtractor constructor

A new instance of PdfContentExtractor.
#process(content) ⇒ Object

Methods included from ExternalAppHelper

#as_file, #can_do

Methods inherited from ContentExtractor

#can_do, extractor_instances, extractors, inherited, process

Constructor Details

#initialize(config) ⇒ `PdfContentExtractor`

Returns a new instance of PdfContentExtractor.

# File 'lib/rdig/content_extractors/pdf.rb', line 12

# Sets up the extractor for PDF content.
#
# Registers the mime type pattern this extractor handles, records the names
# of the two external programs it shells out to, and probes each of them
# once to decide whether the extractor is usable on this machine.
#
# config:: configuration object, handed unchanged to the superclass
def initialize(config)
  super(config)
  @pattern = /^application\/pdf/
  @pdftotext = 'pdftotext'
  @pdfinfo = 'pdfinfo'
  # A tool counts as present when its help text (stderr folded into stdout)
  # contains the expected copyright banner. all? stops at the first tool
  # that fails the check, so later tools are not probed needlessly.
  required_tools = [@pdftotext, @pdfinfo]
  @available = required_tools.all? { |tool| `#{tool} -h 2>&1` =~ /Copyright 1996/ }
end

Instance Method Details

#get_content(path_to_tempfile) ⇒ `Object`



35
36
37

# File 'lib/rdig/content_extractors/pdf.rb', line 35

# Converts the PDF stored at +path_to_tempfile+ to text.
#
# Runs the program named by @pdftotext with the arguments
# "-enc UTF-8 <path> -" and captures what it writes to standard output.
#
# The command is given to IO.popen as an argument vector, so no shell is
# involved: the path reaches the program verbatim even when it contains
# quotes, spaces or other shell metacharacters. (Wrapping the path in single
# quotes inside a shell command line broke on any path containing a quote.)
#
# path_to_tempfile:: path of the PDF file to convert
#
# Returns the captured output as a String. When the program cannot be
# started because it does not exist, an empty String is returned, which is
# what the earlier shell-based invocation produced in that situation.
def get_content(path_to_tempfile)
  IO.popen([@pdftotext, '-enc', 'UTF-8', path_to_tempfile.to_s, '-'], &:read)
rescue Errno::ENOENT
  ''
end

#get_title(path_to_tempfile) ⇒ `Object`

Extracts the title from the PDF metadata (requires pdfinfo). Returns the title, or nil if no title was found.

# File 'lib/rdig/content_extractors/pdf.rb', line 42

# Reads the document title from the PDF's metadata.
#
# Runs the program named by @pdfinfo with "-enc UTF-8 <path>" and looks for
# a line of its output that starts with "Title:" (case-insensitive).
#
# * The command is given to IO.popen as an argument vector, so the path is
#   never interpreted by a shell and may contain quotes or spaces.
# * The pattern is anchored to the start of a line, so text such as
#   "Subtitle:" or a "title:" inside another field's value is not mistaken
#   for the title field.
# * Only spaces and tabs are skipped after the colon. Skipping arbitrary
#   whitespace would run past the end of an empty Title line and report the
#   following metadata line as the title.
#
# path_to_tempfile:: path of the PDF file to inspect
#
# Returns the title with surrounding whitespace removed, or nil when there
# is no Title line, when the title is blank, or when anything goes wrong
# while running the program (the title is treated as optional).
def get_title(path_to_tempfile)
  info = IO.popen([@pdfinfo, '-enc', 'UTF-8', path_to_tempfile.to_s], &:read)
  title = info[/^title:[ \t]*(.*)$/i, 1]
  return nil if title.nil?

  title = title.strip
  title.empty? ? nil : title
rescue StandardError
  # Best effort: a failure to obtain the title must not abort extraction.
  nil
end

#process(content) ⇒ `Object`

# File 'lib/rdig/content_extractors/pdf.rb', line 26

# Extracts text and title from raw PDF data.
#
# The data is handed to as_file, which yields a file object; that file's
# path is passed to get_content and get_title.
#
# content:: the PDF data to process
#
# Returns a Hash with the keys :content (extracted text, stripped of
# leading and trailing whitespace) and :title (whatever get_title returned).
# The Hash stays empty if as_file never yields.
def process(content)
  extracted = {}
  as_file(content) do |tempfile|
    extracted.update(
      :content => get_content(tempfile.path).strip,
      :title => get_title(tempfile.path)
    )
  end
  extracted
end