Module: Pdftohtml

Defined in:
lib/pdftohtml.rb,
lib/pdftohtml/version.rb

Overview

PDF To HTML Module

Constant Summary collapse

VERSION =

Version

'0.2.3'

Class Method Summary collapse

Class Method Details

.convert(pdf_file) ⇒ Object

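Convert a PDF file to HTML by running the external pdftohtml command, returning a Hash that maps each page number to an Array of that page's HTML lines.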



11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# File 'lib/pdftohtml.rb', line 11

# Convert a PDF file to HTML, one entry per page.
#
# Runs the external `pdftohtml -c -i` command into a scratch directory
# under /tmp, reads every `output-<n>.html` file it produced, and removes
# the scratch directory again before returning (also when an error is
# raised part-way through).
#
# @param pdf_file [#to_s] path of the PDF to convert; it is handed to
#   pdftohtml as a single argument and never passes through a shell
# @return [Hash{Integer => Array<String>}] page number => lines of that
#   page's HTML (line endings stripped), inserted in ascending page order
# @raise [Errno::ENOENT] when `output.html` / `output_ind.html` are missing
#   after the run, i.e. pdftohtml could not be started or wrote no output
def self.convert(pdf_file)
	# Scratch directory private to this call. It is created outside the
	# begin/ensure below so a failed mkdir never triggers the cleanup.
	out_path = "/tmp/pdftohtml-#{Time.now.to_f.to_s.gsub '.', (rand * 10000000000).to_i.to_s}"
	FileUtils.mkdir out_path, mode: 0o700

	begin
		# Argument-vector form of system: no shell is involved, so quotes or
		# metacharacters in pdf_file cannot inject commands. stdout is
		# discarded, matching the backtick call this replaces.
		system 'pdftohtml', '-c', '-i', pdf_file.to_s, "#{out_path}/output", out: File::NULL

		# Remove the two non-page files written next to the pages. Their
		# absence means the run failed, which surfaces here as Errno::ENOENT.
		File.unlink "#{out_path}/output.html"
		File.unlink "#{out_path}/output_ind.html"

		# Collect page files, matching on the basename only so the directory
		# path never becomes part of the pattern; unrelated files are skipped.
		pages = Dir["#{out_path}/*.html"].each_with_object({}) do |file, found|
			match = /\Aoutput-([0-9]+)\.html\z/.match(File.basename(file))
			next unless match

			found[match[1].to_i] = File.readlines(file).map(&:chomp)
		end

		# Numeric order, so page 10 follows page 9 rather than page 1.
		pages.sort_by { |number, _lines| number }.to_h
	ensure
		# Drop temp files, whether or not the conversion succeeded
		FileUtils.rmtree out_path
	end
end