Class: PageExtractor
- Inherits:
-
Object
- Object
- PageExtractor
- Defined in:
- lib/pdf_extract.rb
Instance Attribute Summary collapse
-
#image_path ⇒ Object
Returns the value of attribute image_path.
-
#items ⇒ Object
Returns the value of attribute items.
-
#page ⇒ Object
Returns the value of attribute page.
-
#pdf_path ⇒ Object
Returns the value of attribute pdf_path.
-
#results ⇒ Object
Returns the value of attribute results.
Instance Method Summary collapse
- #crop_image(d) ⇒ Object
- #extract_ocr(item) ⇒ Object
- #extract_table(item) ⇒ Object
-
#initialize(page) ⇒ PageExtractor
constructor
A new instance of PageExtractor.
- #lines_to_array(table) ⇒ Object
- #ocr_text(image_path, blacklist = '|', language = :eng) ⇒ Object
- #process ⇒ Object
- #run_tabula(d) ⇒ Object
Constructor Details
#initialize(page) ⇒ PageExtractor
Returns a new instance of PageExtractor.
9 10 11 12 13 14 15 |
# File 'lib/pdf_extract.rb', line 9
# Builds an extractor from a single page description.
#
# @param page [Hash] page description; the keys read here are :image_path,
#   :pdf_path, :items and :page (the page number, 1 when absent or nil)
def initialize(page)
  @image_path = page[:image_path]
  @pdf_path = page[:pdf_path]
  @items = page[:items]
  # `||` instead of `||=`: the default must not be written back into the
  # caller's hash.
  @page_num = page[:page] || 1
  # The `page` reader returns @page, so store the page number there as well.
  @page = @page_num
  @results = {}
end
Instance Attribute Details
#image_path ⇒ Object
Returns the value of attribute image_path.
8 9 10 |
# File 'lib/pdf_extract.rb', line 8
# Reader for the @image_path instance variable.
#
# @return [Object] the current value of @image_path
def image_path
  @image_path
end
#items ⇒ Object
Returns the value of attribute items.
8 9 10 |
# File 'lib/pdf_extract.rb', line 8
# Reader for the @items instance variable.
#
# @return [Object] the current value of @items
def items
  @items
end
#page ⇒ Object
Returns the value of attribute page.
8 9 10 |
# File 'lib/pdf_extract.rb', line 8
# Reader for the @page instance variable.
#
# @return [Object] the current value of @page
def page
  @page
end
#pdf_path ⇒ Object
Returns the value of attribute pdf_path.
8 9 10 |
# File 'lib/pdf_extract.rb', line 8
# Reader for the @pdf_path instance variable.
#
# @return [Object] the current value of @pdf_path
def pdf_path
  @pdf_path
end
#results ⇒ Object
Returns the value of attribute results.
8 9 10 |
# File 'lib/pdf_extract.rb', line 8
# Reader for the @results instance variable.
#
# @return [Object] the current value of @results
def results
  @results
end
Instance Method Details
#crop_image(d) ⇒ Object
32 33 34 35 36 37 38 39 40 41 42 |
# File 'lib/pdf_extract.rb', line 32
# Returns the file name the cropped region is meant to be saved under.
#
# NOTE: the ImageVoodoo crop below is commented out, so this method writes no
# image and +d+ is currently not read.
#
# @param d [Hash] crop rectangle (:x1, :y1, :x2, :y2); only the disabled crop uses it
# @return [String] always "CR.png"
def crop_image(d)
  new_image_name = "CR.png"
  # ImageVoodoo.with_image(image_path) do |img|
  #   img.with_crop(d[:x1], d[:y1], d[:x2], d[:y2]) { |img2| img2.save new_image_name }
  # end
  new_image_name
end
#extract_ocr(item) ⇒ Object
27 28 29 30 |
# File 'lib/pdf_extract.rb', line 27
# Runs OCR for one item and stores the text in @results under item[:name].
#
# @param item [Hash] item description; :dimensions and :name are read
# @return [Object] the value stored, i.e. whatever ocr_text returned
def extract_ocr(item)
  cropped_path = crop_image(item[:dimensions])
  @results[item[:name]] = ocr_text(cropped_path)
end
#extract_table(item) ⇒ Object
44 45 46 47 |
# File 'lib/pdf_extract.rb', line 44
# Extracts one table item and stores its rows in @results under item[:name].
#
# @param item [Hash] item description; :dimensions and :name are read
# @return [Object] the value stored, i.e. whatever lines_to_array returned
def extract_table(item)
  raw_output = run_tabula(item[:dimensions])
  rows = lines_to_array(raw_output)
  @results[item[:name]] = rows
end
#lines_to_array(table) ⇒ Object
55 56 57 58 59 |
# File 'lib/pdf_extract.rb', line 55
# Splits comma-separated text into one array of cells per line.
#
# This is a plain split on ",": quoted fields that contain commas are not
# treated specially.
#
# @param table [String] text with one comma-separated row per line
# @return [Array<Array<String>>] the cells of each line, in order
def lines_to_array(table)
  # A limit of -1 keeps trailing empty cells; the default split drops them,
  # which leaves rows with differing column counts.
  table.lines.map { |line| line.chomp.split(',', -1) }
end
#ocr_text(image_path, blacklist = '|', language = :eng) ⇒ Object
61 62 63 64 65 66 67 |
# File 'lib/pdf_extract.rb', line 61
# Runs a Tesseract engine over an image and returns the recognised text with
# leading and trailing whitespace stripped.
#
# @param image_path [Object] image handed to Tesseract::Engine#text_for
# @param blacklist [String] value assigned to the engine's blacklist setting
# @param language [Symbol] value assigned to the engine's language setting
# @return [String] the stripped OCR text
def ocr_text(image_path, blacklist = '|', language = :eng)
  # The block parameter has its own name so it does not shadow the engine local.
  engine = Tesseract::Engine.new do |config|
    config.language = language
    config.blacklist = blacklist
  end
  engine.text_for(image_path).strip
end
#process ⇒ Object
17 18 19 20 21 22 23 24 25 |
# File 'lib/pdf_extract.rb', line 17
# Walks over every item and dispatches on item[:kind]: 'ocr' items go to
# extract_ocr, 'table' items go to extract_table, any other kind is skipped.
#
# @return [Object] whatever items.each returns
def process
  items.each do |entry|
    case entry[:kind]
    when 'ocr'
      extract_ocr(entry)
    when 'table'
      extract_table(entry)
    end
  end
end
#run_tabula(d) ⇒ Object
49 50 51 52 53 |
# File 'lib/pdf_extract.rb', line 49
# Runs the `tabula` command-line tool on one rectangular area of the PDF and
# returns whatever the process writes to standard output.
#
# @param d [Hash] rectangle with the keys :x1, :y1, :x2 and :y2
# @return [String] raw standard output of the tabula process
def run_tabula(d)
  # The area is passed in y1, x1, y2, x2 order.
  area = [d[:y1], d[:x1], d[:y2], d[:x2]].join(', ')
  # Argument-array form: no shell is involved, so a path containing spaces or
  # shell metacharacters reaches tabula as one argument.
  # @page_num is read directly; no `page_num` reader is listed for this class.
  command = ['tabula', "--area=#{area}", pdf_path.to_s, "--page=#{@page_num}"]
  IO.popen(command, &:read)
end