Class: Treat::Workers::Formatters::Readers::PDF

Inherits:
Object
  • Object
show all
Defined in:
lib/treat/workers/formatters/readers/pdf.rb

Overview

A wrapper for the Poppler pdftotext utility, which extracts the text from a PDF file.

Class Method Summary collapse

Class Method Details

.create_temp_file(ext, value = nil, &block) ⇒ Object

Create a temporary file which is deleted after execution of the block.



34
35
36
37
38
39
40
41
42
43
44
45
46
# File 'lib/treat/workers/formatters/readers/pdf.rb', line 34

# Create a temporary file inside Treat.paths.tmp, pass its path to the
# block, and delete the file once the block has finished (also when the
# block raises).
#
# @param ext [#to_s] extension of the temporary file, without the dot
# @param value [String, nil] optional content written before the block runs
# @yieldparam path [String] path of the temporary file
# @return [Object] the value returned by the block
def self.create_temp_file(ext, value = nil, &block)
  tmp_dir = Treat.paths.tmp
  # mkdir_p also creates missing parent directories.
  FileUtils.mkdir_p(tmp_dir) unless File.directory?(tmp_dir)
  # File.join gives the same path whether or not tmp_dir ends with a slash.
  fname = File.join(tmp_dir, "#{Random.rand(10_000_000)}.#{ext}")
  # Write and close the file BEFORE calling the block: otherwise `value`
  # may still sit in the IO buffer when the block reads the path, and the
  # handle would stay open while external tools write to the same file.
  File.open(fname, 'w') { |f| f.write(value) if value }
  block.call(fname)
ensure
  # fname is nil if we failed before a name was chosen, and the file may
  # not exist if creating it failed; never let cleanup mask the real error.
  File.delete(fname) if fname && File.exist?(fname)
end

.read(document, options = {}) ⇒ Object

Read a PDF file using the Poppler pdftotext utility.

Options: none.



11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# File 'lib/treat/workers/formatters/readers/pdf.rb', line 11

# Read a PDF file using the Poppler pdftotext utility: the PDF at
# document.file is converted into a temporary text file, the text is
# normalized, stored as the document's value, and the document's format
# is set to 'pdf'.
#
# Options: none (the options argument is accepted but not used).
#
# Returns the document that was passed in.
def self.read(document, options = {})
  self.create_temp_file(:txt) do |tmp|
    # Pass the arguments as a list so that no shell is involved: file
    # names containing spaces or shell metacharacters are handed to
    # pdftotext verbatim instead of being split or interpreted.
    system('pdftotext', document.file.to_s, tmp)

    f = File.read(tmp)
    # NOTE(review): this removes only the literal sequence tab, carriage
    # return, space -- confirm that a character class was not intended.
    f.gsub!("\t\r ", '')
    f.gsub!('-­‐', '-')
    # Protect paragraph breaks with a placeholder, turn the remaining
    # single line breaks into spaces, then restore the paragraph breaks.
    f.gsub!("\n\n", '#keep#')
    f.gsub!("\n", ' ')
    # Fix for an incompatible space character.
    f.gsub!(" ", ' ')
    f.gsub!('#keep#', "\n\n")

    document.value = f
    document.set :format, 'pdf'
    document
  end
end