Class: OCRPage
- Inherits:
-
OCRElement
- Object
- HOCRBox
- OCRElement
- OCRPage
- Defined in:
- lib/ocr_page.rb
Instance Attribute Summary collapse
-
#dimensions ⇒ Object
readonly
Returns the value of attribute dimensions.
-
#image ⇒ Object
Returns the value of attribute image.
-
#lines ⇒ Object
readonly
Returns the value of attribute lines.
-
#meta_data ⇒ Object
readonly
Returns the value of attribute meta_data.
-
#page_number ⇒ Object
readonly
Returns the value of attribute page_number.
Attributes inherited from OCRElement
#children, #features, #ocr_class
Attributes inherited from HOCRBox
#bottom, #coordinates, #height, #left, #right, #top, #width
Instance Method Summary collapse
- #each_enclosed_word(ocr_box) ⇒ Object
- #each_line ⇒ Object
- #each_paragraph ⇒ Object
- #each_word ⇒ Object
- #enclosed_words(ocr_box) ⇒ Object
- #extract_bbox_ppageno(ocr_html_text_fragment) ⇒ Object
-
#initialize(file_path, image_path = nil) ⇒ OCRPage
constructor
A new instance of OCRPage.
- #process_hocr_html_file(filename) ⇒ Object
- #to_image_html(options = {}) ⇒ Object
- #to_text ⇒ Object
Methods inherited from OCRElement
create, create_from_html, #css_class_string, #each, extract_children, extract_coordinates, extract_coordinates_from_string, extract_ocr_class, extract_word_children, #features_to_css_class, #mark_in_rspec, #to_html, #to_s
Methods inherited from HOCRBox
#bottom_distance_to, #coordinates_to_s, #enclosed_by?, #encloses?, #left_distance_to, #left_of?, #right_distance_to, #right_of?, #to_css_style, #to_s, #top_distance_to
Constructor Details
#initialize(file_path, image_path = nil) ⇒ OCRPage
Returns a new instance of OCRPage.
13 14 15 16 17 18 19 20 21 22 |
# File 'lib/ocr_page.rb', line 13
# Builds a page from an hOCR HTML file.
#
# Parses the file, locates the `div.ocr_page` element, reads the bounding box
# and page number from that element's `title` attribute, extracts the child
# elements and hands everything to the superclass constructor.
#
# @param file_path [String] path of the hOCR HTML file to read
# @param image_path [Object, nil] stored unchanged in @image (used as the
#   background image URL by #to_image_html)
def initialize(file_path, image_path = nil)
  doc = process_hocr_html_file(file_path)

  # Query the page container once and reuse the node for both the title
  # attribute and the child extraction.
  @page_content = doc.at_css("div.ocr_page")
  coordinates, @page_number = extract_bbox_ppageno(@page_content['title'])

  children = OCRElement.extract_children(@page_content)
  super('ocr_page', children, coordinates)
  @image = image_path
end
Instance Attribute Details
#dimensions ⇒ Object (readonly)
Returns the value of attribute dimensions.
8 9 10 |
# Read-only accessor: returns the @dimensions instance variable unchanged.
# NOTE(review): the constructor shown for this class does not assign
# @dimensions itself; unless the superclass sets it this returns nil — confirm.
# File 'lib/ocr_page.rb', line 8 def dimensions @dimensions end |
#image ⇒ Object
Returns the value of attribute image.
9 10 11 |
# Reader for @image. The constructor stores its `image_path` argument here
# (nil when omitted), and #to_image_html interpolates it into the CSS
# `background-image: url(...)` of the page div.
# File 'lib/ocr_page.rb', line 9 def image @image end |
#lines ⇒ Object (readonly)
Returns the value of attribute lines.
8 9 10 |
# Read-only accessor: returns the @lines instance variable unchanged.
# NOTE(review): nothing in the visible code of this class assigns @lines
# (line iteration goes through #each_line instead); unless the superclass
# sets it this returns nil — confirm.
# File 'lib/ocr_page.rb', line 8 def lines @lines end |
#meta_data ⇒ Object (readonly)
Returns the value of attribute meta_data.
8 9 10 |
# File 'lib/ocr_page.rb', line 8
# Read-only accessor: returns the @meta_data instance variable unchanged.
# NOTE(review): the constructor shown for this class does not assign
# @meta_data itself; unless the superclass sets it this returns nil — confirm.
def meta_data
  @meta_data
end
#page_number ⇒ Object (readonly)
Returns the value of attribute page_number.
8 9 10 |
# Read-only accessor for @page_number. The constructor fills it with the
# second value returned by #extract_bbox_ppageno, i.e. the integer parsed from
# the page element's `title` attribute (0 when no digits are found there).
# File 'lib/ocr_page.rb', line 8 def page_number @page_number end |
Instance Method Details
#each_enclosed_word(ocr_box) ⇒ Object
83 84 85 86 87 88 89 |
# File 'lib/ocr_page.rb', line 83
# Yields every word of the page for which `word.enclosed_by?(ocr_box)` is true,
# in the order produced by #each_word.
#
# @param ocr_box [Object] the box handed to each word's `enclosed_by?`
# @yield [word] each word enclosed by +ocr_box+
# @return [Enumerator] when called without a block
def each_enclosed_word(ocr_box)
  # Without a block there is nothing to yield to; hand back an enumerator so
  # callers can chain map/select/to_a directly.
  return enum_for(:each_enclosed_word, ocr_box) unless block_given?

  each_word do |word|
    yield word if word.enclosed_by?(ocr_box)
  end
end
#each_line ⇒ Object
33 34 35 36 37 38 39 40 41 |
# File 'lib/ocr_page.rb', line 33
# Yields every line of the page by walking blocks -> paragraphs -> lines.
#
# Each level is traversed with `each`, so every element only has to be
# enumerable. `blocks` is not defined in the visible part of this class;
# presumably it is provided elsewhere — confirm.
#
# @yield [line] each line, in document order
# @return [Enumerator] when called without a block
def each_line
  return enum_for(:each_line) unless block_given?

  blocks.each do |block|
    block.each do |paragraph|
      paragraph.each { |line| yield line }
    end
  end
end
#each_paragraph ⇒ Object
25 26 27 28 29 30 31 |
# File 'lib/ocr_page.rb', line 25
# Yields every paragraph of the page by walking blocks -> paragraphs.
#
# `blocks` is not defined in the visible part of this class; presumably it is
# provided elsewhere — confirm.
#
# @yield [paragraph] each paragraph, in document order
# @return [Enumerator] when called without a block
def each_paragraph
  return enum_for(:each_paragraph) unless block_given?

  blocks.each do |block|
    block.each { |paragraph| yield paragraph }
  end
end
#each_word ⇒ Object
43 44 45 46 47 48 49 50 51 52 53 |
# File 'lib/ocr_page.rb', line 43
# Yields every word of the page by walking
# blocks -> paragraphs -> lines -> words.
#
# `blocks` is not defined in the visible part of this class; presumably it is
# provided elsewhere — confirm.
#
# @yield [word] each word, in document order
# @return [Enumerator] when called without a block
def each_word
  return enum_for(:each_word) unless block_given?

  blocks.each do |block|
    block.each do |paragraph|
      paragraph.each do |line|
        line.each { |word| yield word }
      end
    end
  end
end
#enclosed_words(ocr_box) ⇒ Object
78 79 80 81 |
# File 'lib/ocr_page.rb', line 78
# Collects the words yielded by #each_enclosed_word for +ocr_box+ into an Array.
#
# Uses Object#to_enum rather than the block-less
# `Enumerator.new(object, method, *args)` constructor, which modern Ruby no
# longer accepts.
#
# @param ocr_box [Object] the box passed through to #each_enclosed_word
# @return [Array] the enclosed words, in iteration order
def enclosed_words(ocr_box)
  to_enum(:each_enclosed_word, ocr_box).to_a
end
#extract_bbox_ppageno(ocr_html_text_fragment) ⇒ Object
56 57 58 59 60 |
# File 'lib/ocr_page.rb', line 56
# Splits a semicolon-separated `title` string into the page's coordinates and
# page number.
#
# The `bbox` and `ppageno` properties are located by name, so additional
# properties or a different ordering (e.g. `image "p.png"; bbox ...; ppageno 0`)
# do not break parsing. When a property is not found by name the positional
# behaviour is kept: first segment for the box, second for the page number.
#
# @param ocr_html_text_fragment [String] e.g. "bbox 0 0 100 200; ppageno 3"
# @return [Array] two elements: whatever
#   OCRElement.extract_coordinates_from_string returns for the bbox segment,
#   and the page number as an Integer (0 when no digits are present)
def extract_bbox_ppageno(ocr_html_text_fragment)
  properties = ocr_html_text_fragment.split(';').map { |property| property.strip }

  bbox = properties.find { |property| property.start_with?('bbox') } || properties.first
  page_property = properties.find { |property| property.start_with?('ppageno') } || properties[1]

  # nil.to_s and a failed match both end up as 0, mirroring `nil.to_i`.
  page_number = page_property.to_s[/\d+/].to_i

  [OCRElement.extract_coordinates_from_string(bbox), page_number]
end
#process_hocr_html_file(filename) ⇒ Object
62 63 64 65 |
# File 'lib/ocr_page.rb', line 62
# Reads the file at +filename+ and parses it as HTML with Nokogiri.
#
# File.read opens, reads and closes the file in one call, so no file handle
# is left open for the garbage collector to reclaim.
#
# @param filename [String] path of the hOCR HTML file
# @return the result of calling `elements` on the parsed Nokogiri document
def process_hocr_html_file(filename)
  html_string = File.read(filename)
  Nokogiri::HTML(html_string).elements
end
#to_image_html(options = {}) ⇒ Object
71 72 73 74 75 76 |
# File 'lib/ocr_page.rb', line 71
# Renders the page as an absolutely positioned <div> that uses @image as its
# background and contains the image-HTML of all children.
#
# Width and height are multiplied by the zoom factor; left and top are emitted
# unscaled. The same zoom is passed down to every child.
#
# @param options [Hash]
#   :zoom      - scale factor for width/height (default 1)
#   :css_class - class attribute of the div (default: css_class_string)
# @return [String] the HTML fragment
def to_image_html(options = {})
  zoom = options[:zoom] || 1
  display_class = options[:css_class] || css_class_string
  children_html = @children.map { |child| child.to_image_html(:zoom => zoom) }.join("")

  # `position:absolute` (previously misspelled, which browsers discard) is
  # what makes the left/top offsets take effect.
  "<div class='#{display_class}' style='position:absolute; left:#{@left}px; top:#{@top}px; " \
    "background-image: url(#{@image}); width:#{@width * zoom}px; height:#{@height * zoom}px;'>" \
    "#{children_html}</div>"
end
#to_text ⇒ Object
67 68 69 |
# File 'lib/ocr_page.rb', line 67
# Returns the page as plain text: the `to_text` of every line yielded by
# #each_line, joined with newline characters.
#
# Uses Object#to_enum rather than the block-less
# `Enumerator.new(object, method)` constructor, which modern Ruby no longer
# accepts.
#
# @return [String] one text line per OCR line, "" when there are no lines
def to_text
  to_enum(:each_line).map { |line| line.to_text }.join("\n")
end