Class: PageExtractor

Inherits:
Object
  • Object
show all
Defined in:
lib/pdf_extract.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(page) ⇒ PageExtractor

Returns a new instance of PageExtractor.



9
10
11
12
13
14
15
# File 'lib/pdf_extract.rb', line 9

# Builds an extractor from a single page description.
#
# @param page [Hash] page description; the keys read here are
#   :image_path, :pdf_path, :items and :page
# @return [PageExtractor] a new instance of PageExtractor
def initialize(page)
  @image_path = page[:image_path]
  @pdf_path = page[:pdf_path]
  @items = page[:items]
  # Fall back to page 1 without writing the default into the caller's hash:
  # `page[:page] ||= 1` mutated the argument and raised on a frozen hash.
  @page_num = page[:page] || 1
  # Extraction results, keyed by item name as the extract_* methods fill it in.
  @results = {}
end

Instance Attribute Details

#image_path ⇒ Object

Returns the value of attribute image_path.



8
9
10
# File 'lib/pdf_extract.rb', line 8

# Reader for the image_path attribute.
#
# @return [Object] the current value of @image_path, which the constructor
#   takes from page[:image_path]
def image_path
  @image_path
end

#items ⇒ Object

Returns the value of attribute items.



8
9
10
# File 'lib/pdf_extract.rb', line 8

# Reader for the items attribute.
#
# @return [Object] the current value of @items, which the constructor takes
#   from page[:items]; #process iterates over it with #each
def items
  @items
end

#page ⇒ Object

Returns the value of attribute page.



8
9
10
# File 'lib/pdf_extract.rb', line 8

# Reader for the page attribute.
#
# NOTE(review): the constructor shown in this file stores the page number in
# @page_num and never assigns @page, so this presumably returns nil unless
# @page is set elsewhere — confirm whether @page_num was intended.
#
# @return [Object] the current value of @page
def page
  @page
end

#pdf_path ⇒ Object

Returns the value of attribute pdf_path.



8
9
10
# File 'lib/pdf_extract.rb', line 8

# Reader for the pdf_path attribute.
#
# @return [Object] the current value of @pdf_path, which the constructor
#   takes from page[:pdf_path]; #run_tabula passes it to the tabula command
def pdf_path
  @pdf_path
end

#results ⇒ Object

Returns the value of attribute results.



8
9
10
# File 'lib/pdf_extract.rb', line 8

# Reader for the results attribute.
#
# @return [Object] the current value of @results; the constructor starts it
#   as an empty Hash and the extract_* methods store entries by item name
def results
  @results
end

Instance Method Details

#crop_image(d) ⇒ Object



32
33
34
35
36
37
38
39
40
41
42
# File 'lib/pdf_extract.rb', line 32

# Returns the file name used for the cropped region of an OCR item.
#
# The ImageVoodoo crop itself is disabled (preserved below as a comment), so
# the method only reads the four coordinates and hands back a fixed name.
#
# @param d [#[]] region lookup answering :x1, :x2, :y1 and :y2
# @return [String] always "CR.png"
def crop_image(d)
  cropped_name = "CR.png"
  # Disabled crop step:
  #   ImageVoodoo.with_image(image_path) do |img|
  #     img.with_crop(x1, y1, x2, y2) { |img2| img2.save cropped_name }
  #   end
  # The coordinates are still looked up, in the same order as before, so an
  # argument that cannot be indexed keeps failing here.
  _x1, _x2, _y1, _y2 = %i[x1 x2 y1 y2].map { |corner| d[corner] }
  cropped_name
end

#extract_ocr(item) ⇒ Object



27
28
29
30
# File 'lib/pdf_extract.rb', line 27

# OCRs the region described by an item and records the text under its name.
#
# @param item [#[]] item answering :dimensions and :name
# @return [Object] the value stored in @results for item[:name]
def extract_ocr(item)
  region = item[:dimensions]
  @results.store(item[:name], ocr_text(crop_image(region)))
end

#extract_table(item) ⇒ Object



44
45
46
47
# File 'lib/pdf_extract.rb', line 44

# Runs tabula over the region described by an item and records the parsed
# rows under the item's name.
#
# @param item [#[]] item answering :dimensions and :name
# @return [Object] the value stored in @results for item[:name]
def extract_table(item)
  csv_text = run_tabula(item[:dimensions])
  @results.store(item[:name], lines_to_array(csv_text))
end

#lines_to_array(table) ⇒ Object



55
56
57
58
59
# File 'lib/pdf_extract.rb', line 55

# Splits comma-separated text into one array of cell strings per line.
#
# Cells are separated on every comma; quoted fields that contain commas are
# not treated specially.
#
# @param table [String] comma-separated text, one row per line
# @return [Array<Array<String>>] rows of cells, in input order
def lines_to_array(table)
  # A negative limit keeps trailing empty cells. Plain `split(",")` drops
  # them, which leaves rows with different lengths when the last columns of
  # a row are blank.
  table.each_line.map { |row| row.chomp.split(",", -1) }
end

#ocr_text(image_path, blacklist = '|', language = :eng) ⇒ Object



61
62
63
64
65
66
67
# File 'lib/pdf_extract.rb', line 61

# Runs a Tesseract engine over an image and returns the text it reports.
#
# @param image_path [Object] image handed to Tesseract::Engine#text_for
# @param blacklist [String] value assigned to the engine's blacklist setting
# @param language [Symbol] value assigned to the engine's language setting
# @return [String] the engine's text with surrounding whitespace stripped
def ocr_text(image_path, blacklist = '|', language = :eng)
  engine = Tesseract::Engine.new do |config|
    config.language  = language
    config.blacklist = blacklist
  end
  engine.text_for(image_path).strip
end

#process ⇒ Object



17
18
19
20
21
22
23
24
25
# File 'lib/pdf_extract.rb', line 17

# Walks every item on the page and dispatches it to the extractor that
# matches its :kind. Items with any other kind are skipped.
#
# @return [Object] the collection returned by #items
def process
  items.each do |entry|
    kind = entry[:kind]
    case kind
    when 'ocr'
      extract_ocr(entry)
    when 'table'
      extract_table(entry)
    end
  end
end

#run_tabula(d) ⇒ Object



49
50
51
52
53
# File 'lib/pdf_extract.rb', line 49

def run_tabula(d)
area = [d[:y1],d[:x1],d[:y2],d[:x2]].join(", ")
table = `tabula --area='#{area}' #{pdf_path} --page=#{page_num}`
return table
end