Module: Pdftohtml

Defined in:
lib/pdftohtml.rb,
lib/pdftohtml/version.rb

Overview

PDF To HTML Module

Constant Summary collapse

VERSION =

Version

'1.0.0'

Class Method Summary collapse

Class Method Details

.convert(pdf_file) ⇒ Hash

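Convert PDF to HTML: Converts the file pointed to by pdf_file into a Hash that maps each page number (Integer) to that page's HTML, given as an Array of lines with trailing newlines removed.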



15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# File 'lib/pdftohtml.rb', line 15

# Convert a PDF into per-page HTML by shelling out to the external
# `pdftohtml` program and reading back the files it writes.
#
# @param pdf_file [String, #to_s] path of the PDF to convert
# @return [Hash{Integer => Array<String>}] page number => lines of that
#   page's HTML (trailing newlines removed), inserted in ascending page order
# @raise [Errno::ENOENT] when `pdftohtml` did not produce its
#   `output.html` / `output_ind.html` files (e.g. the tool is missing or the
#   conversion failed)
def self.convert(pdf_file)
  # Loaded here because the file's top-level require list is not visible from
  # this method; `require` is a no-op after the first call.
  require 'tmpdir'

  # The block form creates a unique, private directory and removes it again
  # even when something below raises.
  Dir.mktmpdir('pdftohtml-') do |out_path|
    prefix = File.join(out_path, 'output')

    # Argument-list form: no shell is involved, so quotes, spaces or `;` in
    # pdf_file cannot be interpreted as shell syntax. stdout is discarded
    # (the result is read from the generated files); stderr is left alone.
    system('pdftohtml', '-c', '-i', pdf_file.to_s, prefix, out: File::NULL)

    # Drop the two non-page files. These calls double as the failure check:
    # if the tool produced nothing, File.unlink raises Errno::ENOENT.
    File.unlink "#{prefix}.html"
    File.unlink "#{prefix}_ind.html"

    # Keep only "output-<number>.html"; any other file the tool may have
    # written is ignored instead of crashing the page-number extraction.
    numbered = Dir.entries(out_path).map do |name|
      match = /\Aoutput-(\d+)\.html\z/.match(name)
      [match[1].to_i, File.join(out_path, name)] if match
    end.compact

    # Sort numerically so page 10 comes after page 2, not after page 1.
    numbered.sort_by(&:first).each_with_object({}) do |(number, path), pages|
      pages[number] = File.readlines(path).map(&:chomp)
    end
  end
end