Class: HocrTurtletext::Reader

Inherits:
Object
  • Object
show all
Defined in:
lib/hocr_turtletext/reader.rb

Overview

The text_in_region, text_position and fuzzed_y methods are adapted from the originals in pdf-reader-turtletext (github.com/tardate/pdf-reader-turtletext).

Instance Method Summary collapse

Constructor Details

#initialize(hocr_path, options = {}) ⇒ Reader

Returns a new instance of Reader.



6
7
8
9
# File 'lib/hocr_turtletext/reader.rb', line 6

# Creates a reader for a single hOCR file. Nothing is read at construction
# time; the path is only stored for later use by #content.
#
# @param hocr_path [String] path of the hOCR file that #content will read
# @param options [Hash] stored as-is in @options; the methods shown here do
#   not consult it
def initialize(hocr_path, options = {})
  @hocr_path, @options = hocr_path, options
end

Instance Method Details

#bounding_box(&block) ⇒ Object



55
56
57
# File 'lib/hocr_turtletext/reader.rb', line 55

# Builds a HocrTurtletext::Textangle for this reader.
#
# The reader itself and the given block are handed straight to the
# Textangle constructor; what the block may contain is defined by Textangle.
#
# @yield forwarded unchanged to HocrTurtletext::Textangle.new
# @return [HocrTurtletext::Textangle] the newly constructed object
def bounding_box(&block)
  textangle_class = HocrTurtletext::Textangle
  textangle_class.new(self, &block)
end

#content ⇒ Object



11
12
13
14
15
16
17
18
# File 'lib/hocr_turtletext/reader.rb', line 11

# Reads the hOCR file at @hocr_path and turns it into the positional text
# structure used by #text_in_region and #text_position.
#
# The file is read and parsed with Nokogiri on every call (no caching).
# The parsed document then goes through extract_words_from_html,
# to_pos_hash, fuzzed_y and concat_words_in_lines, in that order.
#
# @return [Object] the result of concat_words_in_lines; callers in this
#   class iterate it as (y, row) pairs whose rows yield (x, text) pairs
def content
  document = Nokogiri::HTML(File.read(@hocr_path))
  word_positions = to_pos_hash(extract_words_from_html(document))
  concat_words_in_lines(fuzzed_y(word_positions))
end

#text_in_region(xmin, xmax, ymin, ymax, inclusive = false) ⇒ Object



20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# File 'lib/hocr_turtletext/reader.rb', line 20

# Collects the text elements of #content that lie inside a rectangle.
#
# @param xmin [Numeric] lower x bound
# @param xmax [Numeric] upper x bound
# @param ymin [Numeric] lower y bound
# @param ymax [Numeric] upper y bound
# @param inclusive [Boolean] when truthy, positions exactly on a bound count
#   as inside; otherwise they are excluded
# @return [Array<Array>] one array of elements per row that has at least one
#   element inside the rectangle, in #content order; [] when any bound is
#   nil or false
def text_in_region(xmin, xmax, ymin, ymax, inclusive=false)
  return [] unless [xmin, xmax, ymin, ymax].all?

  # One range predicate serves both axes; the flag decides whether edges count.
  inside = if inclusive
             ->(value, low, high) { value >= low && value <= high }
           else
             ->(value, low, high) { value > low && value < high }
           end

  content.each_with_object([]) do |(y, text_row), matches|
    next unless inside.call(y, ymin, ymax)

    hits = text_row.each_with_object([]) do |(x, element), row|
      row << element if inside.call(x, xmin, xmax)
    end
    # Rows with nothing inside the x range are left out entirely.
    matches << hits unless hits.empty?
  end
end

#text_position(text) ⇒ Object



39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# File 'lib/hocr_turtletext/reader.rb', line 39

# Finds the position of a piece of text within #content.
#
# With a Regexp, a row matches when any of its texts matches the pattern,
# and the x reported is that of the LAST matching text in the row. With any
# other value, the row is searched with rassoc, so the text must be equal
# to the argument and the x reported is that of the FIRST equal text.
# In both cases only the first matching row (in #content order) is used.
#
# @param text [Regexp, String] pattern or exact text to look for
# @return [Hash, nil] a hash with :x and :y keys, or nil when nothing matches
def text_position(text)
  by_pattern = text.class <= Regexp

  hits = content.map do |y, row|
    found = if by_pattern
              # Keep overwriting so the rightmost match in the row wins.
              row.reduce(nil) { |last_x, pair| (pair[1] =~ text) ? pair[0] : last_x }
            else
              row.rassoc(text)
            end
    [y, found] if found
  end

  # Flattening lines up both branches as [y, x, ...] for the first hit.
  flat = hits.compact.flatten
  return nil if flat.empty?

  { :x => flat[1], :y => flat[0] }
end