Module: Pdftohtml

Defined in:
lib/pdftohtml.rb,
lib/pdftohtml/version.rb

Overview

PDF To HTML Module

Constant Summary collapse

VERSION =

Version

'1.0.0'

Class Method Summary collapse

Class Method Details

.convert(pdf_file) ⇒ Hash

Convert PDF to HTML: Converts the file pointed to by pdf_file into a hash of HTML pages.

Parameters:

  • pdf_file (String)

    Path to a PDF file

Returns:

  • (Hash)

    A hash mapping each page number (parsed from the generated output-N.html file name) to that page's HTML lines, e.g. { 1 => [‘Line0’, ‘Line1’, …], 2 => [‘Line0’, ‘Line1’, …], … }



15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# File 'lib/pdftohtml.rb', line 15

# Convert PDF to HTML: runs the external pdftohtml tool on pdf_file and
# returns the generated pages as a hash of HTML lines.
#
# The tool is invoked without a shell, so characters such as quotes, `$(...)`
# or backticks in pdf_file are passed through literally instead of being
# interpreted. The scratch directory is removed even when conversion fails.
#
# @param pdf_file [String] Path to a PDF file
# @return [Hash{Integer => Array<String>}] page number (the N parsed from each
#   output-N.html file name) => that page's lines with line endings removed,
#   inserted in ascending page order
# @raise [Errno::ENOENT] if pdftohtml did not write output.html or
#   output_ind.html (e.g. the tool is missing or the conversion failed)
def self.convert(pdf_file)

	# Generate Output Directory (timestamp with its dot replaced by random digits)
	out_path = "/tmp/pdftohtml-#{Time.now.to_f.to_s.gsub '.', rand(10_000_000_000).to_s}"
	FileUtils.rmtree out_path
	# Owner-only permissions: the directory briefly holds the document's contents
	FileUtils.mkdir out_path, mode: 0700

	begin
		# Run pdftohtml; argument-list form bypasses the shell entirely.
		# stdout is discarded, matching the old backtick call that ignored it.
		system 'pdftohtml', '-c', '-i', pdf_file.to_s, "#{out_path}/output", out: File::NULL

		# Remove the top-level and index documents, which are not pages.
		# A missing file here means the conversion produced nothing, so the
		# resulting Errno::ENOENT is allowed to propagate to the caller.
		File.unlink "#{out_path}/output.html"
		File.unlink "#{out_path}/output_ind.html"

		# Acquire page files; any other .html file in the directory is skipped
		numbered = Dir["#{out_path}/*.html"].map do |path|
			found = /\Aoutput-([0-9]+)\.html\z/.match(File.basename(path))
			found && [found[1].to_i, path]
		end.compact

		# Load up Document Pages in numeric order (page 2 before page 10)
		numbered.sort.each_with_object({}) do |(number, path), pages|
			pages[number] = File.readlines(path).map(&:chomp)
		end
	ensure
		# Drop temp files, whether or not the conversion succeeded
		FileUtils.rmtree out_path
	end
end