Class: HocrTurtletext::Reader
- Inherits:
-
Object
- Object
- HocrTurtletext::Reader
- Defined in:
- lib/hocr_turtletext/reader.rb
Overview
Methods such as text_in_region, text_position and fuzzed_y are adapted from the original pdf-reader-turtletext implementation at github.com/tardate/pdf-reader-turtletext.
Instance Method Summary collapse
- #bounding_box(&block) ⇒ Object
- #content ⇒ Object
-
#initialize(hocr_path, options = {}) ⇒ Reader
constructor
A new instance of Reader.
- #text_in_region(xmin, xmax, ymin, ymax, inclusive = false) ⇒ Object
- #text_position(text) ⇒ Object
Constructor Details
#initialize(hocr_path, options = {}) ⇒ Reader
Returns a new instance of Reader.
6 7 8 9 |
# File 'lib/hocr_turtletext/reader.rb', line 6
#
# Builds a reader for a single hOCR file. Nothing is read from disk here;
# the path and options are only stored on the instance.
#
# @param hocr_path [String] path of the hOCR file to read later
# @param options [Hash] optional settings, stored as given
# @return [Reader] a new instance of Reader
def initialize(hocr_path, options = {})
  @hocr_path = hocr_path
  @options = options
end
Instance Method Details
#bounding_box(&block) ⇒ Object
55 56 57 |
# File 'lib/hocr_turtletext/reader.rb', line 55
#
# Creates a HocrTurtletext::Textangle bound to this reader, forwarding the
# given block to its constructor.
#
# @yield block handed unchanged to HocrTurtletext::Textangle.new
# @return [HocrTurtletext::Textangle] the newly built textangle
def bounding_box(&block)
  textangle_class = HocrTurtletext::Textangle
  textangle_class.new(self, &block)
end
#content ⇒ Object
11 12 13 14 15 16 17 18 |
# File 'lib/hocr_turtletext/reader.rb', line 11
#
# Reads the hOCR file at @hocr_path, parses it as HTML with Nokogiri and
# runs it through the reader's helper pipeline: word extraction, conversion
# to a position hash, fuzzed_y, and finally concatenation of words in lines.
# The file is read and parsed again on every call.
#
# @return [Object] whatever concat_words_in_lines returns; callers in this
#   class iterate it as (y, row) pairs where each row yields (x, element)
def content
  document = Nokogiri::HTML(File.read(@hocr_path))
  words_with_positions = extract_words_from_html(document)
  position_hash = to_pos_hash(words_with_positions)
  concat_words_in_lines(fuzzed_y(position_hash))
end
#text_in_region(xmin, xmax, ymin, ymax, inclusive = false) ⇒ Object
20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
# File 'lib/hocr_turtletext/reader.rb', line 20
#
# Collects the elements of #content whose coordinates fall inside the given
# rectangle.
#
# @param xmin [Numeric, nil] lower x bound
# @param xmax [Numeric, nil] upper x bound
# @param ymin [Numeric, nil] lower y bound
# @param ymax [Numeric, nil] upper y bound
# @param inclusive [Boolean] when true, elements lying exactly on a bound are
#   included; when false (the default) the bounds are exclusive
# @return [Array<Array>] one array of elements per row that has at least one
#   element inside the region, in the iteration order of #content; an empty
#   array when any bound is nil or false
def text_in_region(xmin, xmax, ymin, ymax, inclusive = false)
  return [] unless xmin && xmax && ymin && ymax

  # Choose the comparison once instead of re-testing `inclusive` per element.
  within =
    if inclusive
      ->(value, low, high) { value >= low && value <= high }
    else
      ->(value, low, high) { value > low && value < high }
    end

  content.each_with_object([]) do |(y, text_row), box|
    next unless within.call(y, ymin, ymax)

    row = text_row.select { |x, _element| within.call(x, xmin, xmax) }
                  .map { |_x, element| element }
    # Rows that contribute nothing are left out of the result entirely.
    box << row unless row.empty?
  end
end
#text_position(text) ⇒ Object
39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 |
# File 'lib/hocr_turtletext/reader.rb', line 39
#
# Locates +text+ within #content and reports its coordinates.
#
# Rows are scanned in the iteration order of #content and the first row
# containing a match wins. A String (or any non-Regexp) is compared for
# equality against each element via +rassoc+. A Regexp is matched with +=~+,
# and when several elements in the winning row match, the x of the last
# matching element is reported.
#
# @param text [String, Regexp] exact text or pattern to look for
# @return [Hash, nil] +{ :x => x, :y => y }+ for the match, or nil when
#   nothing matches
def text_position(text)
  by_pattern = text.is_a?(Regexp)

  content.each do |y, row|
    if by_pattern
      # Later matches overwrite earlier ones, so the last match in the row wins.
      x = row.reduce(nil) { |found, (col, word)| (word =~ text) ? col : found }
      return { :x => x, :y => y } if x
    else
      pair = row.rassoc(text)
      return { :x => pair[0], :y => y } if pair
    end
  end

  nil
end