Class: OCRPage

Inherits:
OCRElement show all
Defined in:
lib/ocr_page.rb

Instance Attribute Summary collapse

Attributes inherited from OCRElement

#children, #features, #ocr_class

Attributes inherited from HOCRBox

#bottom, #coordinates, #height, #left, #right, #top, #width

Instance Method Summary collapse

Methods inherited from OCRElement

create, create_from_html, #css_class_string, #each, extract_children, extract_coordinates, extract_coordinates_from_string, extract_ocr_class, extract_word_children, #features_to_css_class, #mark_in_rspec, #to_html, #to_s

Methods inherited from HOCRBox

#bottom_distance_to, #coordinates_to_s, #enclosed_by?, #encloses?, #left_distance_to, #left_of?, #right_distance_to, #right_of?, #to_css_style, #to_s, #top_distance_to

Constructor Details

#initialize(file_path, image_path = nil) ⇒ OCRPage

Returns a new instance of OCRPage.



13
14
15
16
17
18
19
20
21
22
# File 'lib/ocr_page.rb', line 13

# Builds a page from an hOCR HTML file.
#
# Parses the file, looks up the first "div.ocr_page" element, reads the
# bounding box and page number from its title attribute, extracts the child
# elements and passes everything to the superclass constructor.
#
# @param file_path [String] path of the hOCR HTML file to parse
# @param image_path [String, nil] value stored in @image (returned by #image
#   and used as the background image URL in #to_image_html)
# @raise [ArgumentError] if the document contains no "div.ocr_page" element
def initialize(file_path, image_path = nil)
    doc = process_hocr_html_file(file_path)

    # Query the document once; the same node serves both the title lookup
    # and the child extraction.
    @page_content = doc.at_css("div.ocr_page")
    if @page_content.nil?
        raise ArgumentError, "no div.ocr_page element found in #{file_path}"
    end

    coordinates, @page_number = extract_bbox_ppageno(@page_content['title'])
    children = OCRElement.extract_children(@page_content)
    super('ocr_page', children, coordinates)

    # Assigned after super so the superclass constructor cannot overwrite it.
    @image = image_path
end

Instance Attribute Details

#dimensions ⇒ Object (readonly)

Returns the value of attribute dimensions.



8
9
10
# File 'lib/ocr_page.rb', line 8

# Reader for the @dimensions instance variable.
#
# NOTE(review): @dimensions is not assigned anywhere in the visible source,
# so this returns nil unless it is set elsewhere — confirm.
def dimensions
  @dimensions
end

#image ⇒ Object

Returns the value of attribute image.



9
10
11
# File 'lib/ocr_page.rb', line 9

# Reader for the @image instance variable.
#
# The constructor stores its image_path argument here, and #to_image_html
# interpolates the value into the background-image URL.
def image
  @image
end

#lines ⇒ Object (readonly)

Returns the value of attribute lines.



8
9
10
# File 'lib/ocr_page.rb', line 8

# Reader for the @lines instance variable.
#
# NOTE(review): @lines is not assigned anywhere in the visible source; to
# iterate over the lines of the page see #each_line — confirm where (or
# whether) @lines is populated.
def lines
  @lines
end

#meta_data ⇒ Object (readonly)

Returns the value of attribute meta_data.



8
9
10
# File 'lib/ocr_page.rb', line 8

# Reader for the @meta_data instance variable.
#
# NOTE(review): @meta_data is not assigned anywhere in the visible source,
# so this returns nil unless it is set elsewhere — confirm.
def meta_data
  @meta_data
end

#page_number ⇒ Object (readonly)

Returns the value of attribute page_number.



8
9
10
# File 'lib/ocr_page.rb', line 8

# Reader for the @page_number instance variable.
#
# The constructor assigns it from the second element returned by
# #extract_bbox_ppageno, which is an Integer.
def page_number
  @page_number
end

Instance Method Details

#each_enclosed_word(ocr_box) ⇒ Object



83
84
85
86
87
88
89
# File 'lib/ocr_page.rb', line 83

# Yields every word of the page that is enclosed by the given box.
#
# Walks all words via #each_word and keeps those whose enclosed_by? check
# against ocr_box is truthy.
#
# @param ocr_box [Object] box handed to each word's enclosed_by? method
# @yield [word] each enclosed word
# @return [Enumerator] when called without a block
def each_enclosed_word(ocr_box)
    return enum_for(:each_enclosed_word, ocr_box) unless block_given?

    each_word do |word|
        yield word if word.enclosed_by?(ocr_box)
    end
end

#each_line ⇒ Object



33
34
35
36
37
38
39
40
41
# File 'lib/ocr_page.rb', line 33

# Yields every line of the page.
#
# Iterates blocks, then the paragraphs of each block, then the lines of each
# paragraph, in that nesting order.
#
# @yield [line] each element found three levels below +blocks+
# @return [Enumerator] when called without a block
def each_line
    return enum_for(:each_line) unless block_given?

    blocks.each do |block|
        block.each do |paragraph|
            paragraph.each { |line| yield line }
        end
    end
end

#each_paragraph ⇒ Object



25
26
27
28
29
30
31
# File 'lib/ocr_page.rb', line 25

# Yields every paragraph of the page.
#
# Iterates blocks and then the paragraphs contained in each block.
#
# @yield [paragraph] each element found two levels below +blocks+
# @return [Enumerator] when called without a block
def each_paragraph
    return enum_for(:each_paragraph) unless block_given?

    blocks.each do |block|
        block.each { |paragraph| yield paragraph }
    end
end

#each_word ⇒ Object



43
44
45
46
47
48
49
50
51
52
53
# File 'lib/ocr_page.rb', line 43

# Yields every word of the page.
#
# Iterates blocks, paragraphs, lines and finally the words of each line, in
# that nesting order.
#
# @yield [word] each element found four levels below +blocks+
# @return [Enumerator] when called without a block
def each_word
    return enum_for(:each_word) unless block_given?

    blocks.each do |block|
        block.each do |paragraph|
            paragraph.each do |line|
                line.each { |word| yield word }
            end
        end
    end
end

#enclosed_words(ocr_box) ⇒ Object



78
79
80
81
# File 'lib/ocr_page.rb', line 78

# Collects every word enclosed by the given box into an Array.
#
# Uses enum_for rather than Enumerator.new(obj, method, *args): that
# constructor form was removed in Ruby 3.0 and raises there.
#
# @param ocr_box [Object] box forwarded to #each_enclosed_word
# @return [Array] the words yielded by #each_enclosed_word, in order
def enclosed_words(ocr_box)
    enum_for(:each_enclosed_word, ocr_box).to_a
end

#extract_bbox_ppageno(ocr_html_text_fragment) ⇒ Object



56
57
58
59
60
# File 'lib/ocr_page.rb', line 56

# Splits an hOCR title attribute into bounding-box coordinates and page number.
#
# The title is a ';'-separated property list. Properties are looked up by
# name ("bbox", "ppageno") instead of by position, so a title whose first
# property is something else (e.g. 'image "x.png"; bbox ...; ppageno N') is
# read correctly. If no property starts with "bbox", the first property is
# passed on, as before.
#
# @param ocr_html_text_fragment [String] the title attribute text
# @return [Array] two elements: the result of
#   OCRElement.extract_coordinates_from_string for the bbox property, and
#   the page number as an Integer (0 when no ppageno property is present)
def extract_bbox_ppageno( ocr_html_text_fragment )
    properties = ocr_html_text_fragment.split(';').map(&:strip)

    bbox = properties.find { |prop| prop.start_with?('bbox') } || properties.first
    ppageno = properties.find { |prop| prop.start_with?('ppageno') }

    # nil.to_s is "", ""[/\d+/] is nil and nil.to_i is 0, so a missing
    # ppageno yields 0 without relying on the $1 global.
    page_number = ppageno.to_s[/\d+/].to_i

    [ OCRElement.extract_coordinates_from_string(bbox), page_number ]
end

#process_hocr_html_file(filename) ⇒ Object



62
63
64
65
# File 'lib/ocr_page.rb', line 62

# Reads an hOCR HTML file and parses it with Nokogiri.
#
# @param filename [String] path of the file to read
# @return [Object] the value of #elements on the parsed Nokogiri HTML document
def process_hocr_html_file(filename)
    # File.read opens, reads and closes the file in one call; the previous
    # File.open(...).read never closed the handle.
    html_string = File.read(filename)
    Nokogiri::HTML(html_string).elements
end

#to_image_html(options = {}) ⇒ Object



71
72
73
74
75
76
# File 'lib/ocr_page.rb', line 71

# Renders the page as an absolutely positioned <div> with the page image as
# its background and the rendered children nested inside.
#
# Width and height are multiplied by the zoom factor; left and top are
# emitted unscaled. Each child is rendered with the same zoom.
#
# @param options [Hash] rendering options
# @option options [Numeric] :zoom scale factor for width/height (default 1)
# @option options [String] :css_class value of the class attribute
#   (defaults to css_class_string)
# @return [String] the HTML fragment
def to_image_html(options = {})
    zoom = options[:zoom] || 1
    display_class = options[:css_class] || css_class_string
    children_html = @children.map { |child| child.to_image_html(:zoom => zoom) }.join

    # "absolute" was previously misspelled "absoulte", which browsers drop
    # as an invalid declaration.
    style = "position:absolute; left:#{@left}px; top:#{@top}px; " \
            "background-image: url(#{@image}); " \
            "width:#{@width * zoom}px; height:#{@height * zoom}px;"

    "<div class='#{ display_class }' style='#{style}'>#{children_html}</div>"
end

#to_text ⇒ Object



67
68
69
# File 'lib/ocr_page.rb', line 67

# Returns the text of the page, one line of output per OCR line.
#
# Uses enum_for rather than Enumerator.new(obj, method): that constructor
# form was removed in Ruby 3.0 and raises there.
#
# @return [String] the to_text of every line yielded by #each_line, joined
#   with "\n"
def to_text
    enum_for(:each_line).map { |line| line.to_text }.join("\n")
end