Class: PageExtractor

Inherits:

Object

Object
PageExtractor

show all

Defined in: lib/pdf_extract.rb

Instance Attribute Summary collapse

#image_path ⇒ Object

Returns the value of attribute image_path.
#items ⇒ Object

Returns the value of attribute items.
#page ⇒ Object

Returns the value of attribute page.
#pdf_path ⇒ Object

Returns the value of attribute pdf_path.
#results ⇒ Object

Returns the value of attribute results.

Instance Method Summary collapse

#crop_image(d) ⇒ Object
#extract_ocr(item) ⇒ Object
#extract_table(item) ⇒ Object
#initialize(page) ⇒ PageExtractor constructor

A new instance of PageExtractor.
#lines_to_array(table) ⇒ Object
#ocr_text(image_path, blacklist = '|', language = :eng) ⇒ Object
#process ⇒ Object
#run_tabula(d) ⇒ Object

Constructor Details

#initialize(page) ⇒ `PageExtractor`

Returns a new instance of PageExtractor.

# File 'lib/pdf_extract.rb', line 9

# Builds an extractor from a page description.
#
# Reads :image_path, :pdf_path, :items and :page from +page+ and starts with
# an empty results hash. The page number defaults to 1 when :page is absent
# (or nil/false). The default is NOT written back into +page+, so the
# caller's hash is left untouched and may be frozen.
#
# Note: only @page_num is assigned here; @page is not set by this method.
#
# @param page [Hash] page description indexed by :image_path, :pdf_path,
#   :items and (optionally) :page
# @return [PageExtractor]
def initialize(page)
  @image_path = page[:image_path]
  @pdf_path = page[:pdf_path]
  @items = page[:items]
  # Plain `||` instead of `||=`: fall back to 1 without mutating the argument.
  @page_num = page[:page] || 1
  @results = {}
end

Instance Attribute Details

#image_path ⇒ `Object`

Returns the value of attribute image_path.



8
9
10

# File 'lib/pdf_extract.rb', line 8

# Reader for @image_path, which the constructor takes from page[:image_path].
#
# @return [Object] the current value of @image_path
def image_path
  @image_path
end

#items ⇒ `Object`

Returns the value of attribute items.



8
9
10

# File 'lib/pdf_extract.rb', line 8

# Reader for @items, which the constructor takes from page[:items].
# #process iterates over this collection.
#
# @return [Object] the current value of @items
def items
  @items
end

#page ⇒ `Object`

Returns the value of attribute page.



8
9
10

# File 'lib/pdf_extract.rb', line 8

# Reader for @page.
#
# NOTE(review): the page number read by the constructor is kept in @page_num;
# confirm what, if anything, assigns @page before relying on this value.
#
# @return [Object] the current value of @page
def page
  @page
end

#pdf_path ⇒ `Object`

Returns the value of attribute pdf_path.



8
9
10

# File 'lib/pdf_extract.rb', line 8

# Reader for @pdf_path, which the constructor takes from page[:pdf_path].
# #run_tabula passes this path to the tabula command.
#
# @return [Object] the current value of @pdf_path
def pdf_path
  @pdf_path
end

#results ⇒ `Object`

Returns the value of attribute results.



8
9
10

# File 'lib/pdf_extract.rb', line 8

# Reader for @results, the hash the constructor initializes to {} and that
# #extract_ocr / #extract_table fill in, keyed by each item's :name.
#
# @return [Object] the current value of @results
def results
  @results
end

Instance Method Details

#crop_image(d) ⇒ `Object`

# File 'lib/pdf_extract.rb', line 32

# Placeholder for cropping the page image to the region described by +d+.
#
# The ImageVoodoo crop is currently commented out, so nothing is written by
# this method: the four coordinates are read from +d+ and the fixed output
# file name is returned.
#
# @param d [#[]] region indexed by :x1, :x2, :y1 and :y2
# @return [String] always "CR.png"
def crop_image(d)
  cropped_name = "CR.png"
  # ImageVoodoo.with_image(image_path) do |img|
  x1, x2, y1, y2 = d[:x1], d[:x2], d[:y1], d[:y2]
  #   img.with_crop(x1, y1, x2, y2) { |img2| img2.save cropped_name }
  # end
  cropped_name
end

#extract_ocr(item) ⇒ `Object`

# File 'lib/pdf_extract.rb', line 27

# Crops the region named by item[:dimensions], runs OCR on the cropped image
# and stores the recognized text in @results under item[:name].
#
# @param item [#[]] entry indexed by :dimensions and :name
# @return [Object] the value stored in @results (the result of #ocr_text)
def extract_ocr(item)
  region = item[:dimensions]
  key = item[:name]
  cropped_path = crop_image(region)
  @results[key] = ocr_text(cropped_path)
end

#extract_table(item) ⇒ `Object`

# File 'lib/pdf_extract.rb', line 44

# Runs tabula on the region named by item[:dimensions], converts its output
# into rows with #lines_to_array and stores them in @results under
# item[:name].
#
# @param item [#[]] entry indexed by :dimensions and :name
# @return [Object] the value stored in @results (the result of #lines_to_array)
def extract_table(item)
  raw_output = run_tabula(item[:dimensions])
  key = item[:name]
  @results[key] = lines_to_array(raw_output)
end

#lines_to_array(table) ⇒ `Object`

# File 'lib/pdf_extract.rb', line 55

# Splits comma-separated text into an array of rows, one per input line.
#
# Each line has its trailing line terminator removed and is then split on
# ",". The split uses a negative limit so trailing empty cells are kept;
# without it "a,b," would yield two cells instead of three and rows of the
# same table could end up with different widths.
#
# This is a plain split: quoted fields containing commas are not recognized.
#
# @param table [String] comma-separated text, one row per line
# @return [Array<Array<String>>] the cells of each line; an empty line
#   yields an empty array
def lines_to_array(table)
  table.lines.map { |line| line.chomp.split(',', -1) }
end

#ocr_text(image_path, blacklist = '|', language = :eng) ⇒ `Object`

# File 'lib/pdf_extract.rb', line 61

# Runs Tesseract on an image and returns the recognized text with
# surrounding whitespace stripped.
#
# @param image_path [Object] image handed to Tesseract::Engine#text_for
# @param blacklist [String] value assigned to the engine's blacklist
# @param language [Symbol] value assigned to the engine's language
# @return [String] stripped result of text_for
def ocr_text(image_path, blacklist = '|', language = :eng)
  engine = Tesseract::Engine.new do |config|
    config.language = language
    config.blacklist = blacklist
  end
  engine.text_for(image_path).strip
end

#process ⇒ `Object`

# File 'lib/pdf_extract.rb', line 17

# Walks every entry of #items and dispatches on its :kind:
# 'ocr' entries go to #extract_ocr, 'table' entries to #extract_table.
# Entries with any other kind are skipped.
#
# @return [Object] the receiver of the each call, i.e. #items
def process
  items.each do |item|
    kind = item[:kind]
    if kind == 'ocr'
      extract_ocr(item)
    elsif kind == 'table'
      extract_table(item)
    end
  end
end

#run_tabula(d) ⇒ `Object`

# File 'lib/pdf_extract.rb', line 49

def run_tabula(d)
area = [d[:y1],d[:x1],d[:y2],d[:x2]].join(", ")
table = `tabula --area='#{area}' #{pdf_path} --page=#{page_num}`
return table
end

Class: PageExtractor

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(page) ⇒ PageExtractor

Instance Attribute Details

#image_path ⇒ Object

#items ⇒ Object

#page ⇒ Object

#pdf_path ⇒ Object

#results ⇒ Object

Instance Method Details

#crop_image(d) ⇒ Object

#extract_ocr(item) ⇒ Object

#extract_table(item) ⇒ Object

#lines_to_array(table) ⇒ Object

#ocr_text(image_path, blacklist = '|', language = :eng) ⇒ Object

#process ⇒ Object

#run_tabula(d) ⇒ Object