Class: IiifPrint::TextExtraction::HOCRReader::HOCRDocStream

Inherits:
Nokogiri::XML::SAX::Document
  • Object
show all
Defined in:
lib/iiif_print/text_extraction/hocr_reader.rb

Overview

SAX Document Stream class to gather text and word tokens from hOCR

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ HOCRDocStream

Returns a new instance of HOCRDocStream.



18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# File 'lib/iiif_print/text_extraction/hocr_reader.rb', line 18

# Create a document stream with empty parse state.
def initialize
  super()
  # Page dimensions, to be read from the hOCR `div.ocr_page` element:
  @width = nil
  @height = nil
  # Plain-text buffer:
  @text = ''
  # Collected word hashes, each holding a word and its coordinates:
  @words = []
  # State of the word currently being read; shared by #start_element,
  #   #characters and #end_element so a word can be tied to its coordinates.
  @current = nil
  # Class name seen at element start, kept for use by #end_element:
  @element_class_name = nil
end

Instance Attribute Details

#height ⇒ Object

Returns the value of attribute height.



16
17
18
# File 'lib/iiif_print/text_extraction/hocr_reader.rb', line 16

# Page height; nil on a new instance, assigned by #start_page from the
# fourth value of the page bbox.
def height
  @height
end

#text ⇒ Object

Returns the value of attribute text.



16
17
18
# File 'lib/iiif_print/text_extraction/hocr_reader.rb', line 16

# Plain-text buffer; starts empty and is appended to by #characters,
# #end_word and #end_element, then normalized by #end_document.
def text
  @text
end

#width ⇒ Object

Returns the value of attribute width.



16
17
18
# File 'lib/iiif_print/text_extraction/hocr_reader.rb', line 16

# Page width; nil on a new instance, assigned by #start_page from the
# third value of the page bbox.
def width
  @width
end

#words ⇒ Object

Returns the value of attribute words.



16
17
18
# File 'lib/iiif_print/text_extraction/hocr_reader.rb', line 16

# List of word hashes (keys :word and :coordinates) pushed by #end_word
# for each completed word.
def words
  @words
end

Instance Method Details

#characters(value) ⇒ Object



112
113
114
115
116
117
118
# File 'lib/iiif_print/text_extraction/hocr_reader.rb', line 112

# Callback for character data. Data is collected only while a word with
# known coordinates is open; otherwise it is ignored.
#
# @param value [String] chunk of character data
def characters(value)
  return if @current.nil? || @current[:coordinates].nil?
  # a word may arrive in several chunks, so append to what is already there
  @current[:word] = (@current[:word] || '') + value
  @text += value
end

#consider?(name, class_name) ⇒ Boolean

Consider element for processing? Only these elements are considered:

- `div.ocr_page`
- `span.ocr_line`
- `span.ocrx_word`

Parameters:

  • name (String)

    Element name

  • class_name (String)

    HTML class name

Returns:

  • (Boolean)

    true if element should be processed; otherwise false



56
57
58
59
# File 'lib/iiif_print/text_extraction/hocr_reader.rb', line 56

# Decide whether an element is relevant to text extraction. Only
# `div.ocr_page`, `span.ocr_line` and `span.ocrx_word` are.
#
# @param name [String] Element name
# @param class_name [String] HTML class name
# @return [Boolean] true if element should be processed; otherwise false
def consider?(name, class_name)
  %w[div.ocr_page span.ocr_line span.ocrx_word].include?("#{name}.#{class_name}")
end

#end_document ⇒ Object

Callback for completion of parsing hOCR, used to normalize generated text content (strip unneeded whitespace incidental to output).


134
135
136
137
138
139
140
141
142
# File 'lib/iiif_print/text_extraction/hocr_reader.rb', line 134

# Callback for completion of parsing; normalizes the plain-text buffer:
# trims each line, collapses repeated line breaks, removes carriage
# returns and trims the buffer as a whole. The return value is whatever
# String#strip! returns and is not meaningful.
def end_document
  # trim leading/trailing whitespace on every line
  @text = @text.split("\n").map(&:strip).join("\n")
  # collapse runs of line breaks into a single one
  @text.gsub!(/\n+/, "\n")
  # drop carriage returns; String#delete returns a new string instead of
  # modifying the receiver, so the result has to be assigned back
  @text = @text.delete("\r")
  # trim whitespace at either end of the buffer
  @text.strip!
end

#end_element(name) ⇒ Object

Callback for element end; at this time, flush word coordinate state for current word, and append line endings to plain text.

Parameters:

  • name (String)

    element name.



124
125
126
127
128
129
130
# File 'lib/iiif_print/text_extraction/hocr_reader.rb', line 124

# Callback for element end. For a closing `span`, the class name recorded
# by #start_element decides what happens: a word span is flushed through
# #end_word, and a span whose recorded class name is nil appends a line
# break to the text buffer. The recorded class name is always cleared.
#
# NOTE(review): the nil case presumably corresponds to the enclosing line
# span closing after its last word has already cleared the class name;
# confirm against real hOCR input.
#
# @param name [String] element name.
def end_element(name)
  if name == 'span'
    case @element_class_name
    when 'ocrx_word' then end_word
    when nil then @text += "\n"
    end
  end
  @element_class_name = nil
end

#end_line ⇒ Object



90
91
92
93
94
95
# File 'lib/iiif_print/text_extraction/hocr_reader.rb', line 90

# Finish a line in the plain-text buffer: surrounding whitespace is
# stripped from the buffer in place, then a line break is appended.
def end_line
  buffer = @text
  buffer.strip!
  @text = buffer + "\n"
end

#end_word ⇒ Object



83
84
85
86
87
88
# File 'lib/iiif_print/text_extraction/hocr_reader.rb', line 83

# Close the word currently being read: a separating space goes into the
# plain-text buffer, the word is stored when #word_complete? accepts it,
# and the current-word state is reset.
def end_word
  # space between this word and whatever follows it
  @text += ' '
  @words << @current if word_complete?
  # nothing is being read until the next word starts
  @current = nil
end

#s_coords(attrs) ⇒ Array

Return coordinates from `span.ocrx_word` element attribute hash

Parameters:

  • attrs (Hash)

    hash with hOCR `span.ocrx_word` element attributes

Returns:

  • (Array)

    Array of position x, y, width, height in px.



38
39
40
41
42
43
44
45
46
47
# File 'lib/iiif_print/text_extraction/hocr_reader.rb', line 38

# Return coordinates from `span.ocrx_word` element attribute hash.
#
# The `bbox` property is looked up by name among the `;`-separated
# properties of the `title` attribute, so it does not have to come first.
#
# @param attrs [Hash] hash with hOCR `span.ocrx_word` element attributes
# @return [Array, nil] Array of position x, y, width, height in px, or
#   nil when the title is missing or has no bbox with four values.
def s_coords(attrs)
  properties = attrs['title'].to_s.split(';').map(&:strip)
  bbox = properties.find { |property| property.start_with?('bbox ') }
  return nil if bbox.nil?
  # drop the leading "bbox" keyword, keep the numbers
  x1, y1, x2, y2 = bbox.split(' ').drop(1).map(&:to_i)
  return nil if y2.nil?
  [x1, y1, x2 - x1, y2 - y1]
end

#start_element(name, attrs = []) ⇒ Object

Callback for element start, ignores elements except for:

- `div.ocr_page`
- `span.ocr_line`
- `span.ocrx_word`

Parameters:

  • name (String)

    element name.

  • attrs (Array) (defaults to: [])

    Array of key, value pair Arrays.



104
105
106
107
108
109
110
# File 'lib/iiif_print/text_extraction/hocr_reader.rb', line 104

# Callback for element start. The element's class name is always recorded
# for later use by #end_element; elements rejected by #consider? are
# otherwise ignored. Word spans open a new word and page divs supply the
# page dimensions.
#
# @param name [String] element name.
# @param attrs [Array] Array of key, value pair Arrays.
def start_element(name, attrs = [])
  attributes = attrs.to_h
  css_class = @element_class_name = attributes['class']
  return unless consider?(name, css_class)
  start_word(attributes) if css_class == 'ocrx_word'
  start_page(attributes) if css_class == 'ocr_page'
end

#start_page(attrs) ⇒ Object



68
69
70
71
72
73
74
75
# File 'lib/iiif_print/text_extraction/hocr_reader.rb', line 68

# Record page width and height from a `div.ocr_page` element.
#
# The `bbox` property is looked up by name among the `;`-separated
# properties of the `title` attribute, rather than by position. When the
# title is missing or has no bbox, width and height are left untouched.
#
# @param attrs [Hash] hash with hOCR `div.ocr_page` element attributes
def start_page(attrs)
  properties = attrs['title'].to_s.split(';').map(&:strip)
  bbox_property = properties.find { |property| property.start_with?('bbox ') }
  return if bbox_property.nil?
  # drop the leading "bbox" keyword, keep the numbers
  bbox = bbox_property.split(' ').drop(1).map(&:to_i)
  # width and height are the third and fourth bbox values:
  @width = bbox[2]
  @height = bbox[3]
end

#start_word(attrs) ⇒ Object



61
62
63
64
65
66
# File 'lib/iiif_print/text_extraction/hocr_reader.rb', line 61

# Open a new current word for a `span.ocrx_word` element. The word text is
# nil until #characters supplies it; coordinates come from #s_coords.
#
# @param attrs [Hash] element attributes, passed on to #s_coords
def start_word(attrs)
  @current = { word: nil }
  @current[:coordinates] = s_coords(attrs)
end

#word_complete? ⇒ Boolean

Returns:

  • (Boolean)


77
78
79
80
81
# File 'lib/iiif_print/text_extraction/hocr_reader.rb', line 77

# Is the current word ready to be stored? It is when a word is open, has
# non-empty text, and has exactly four coordinate values.
#
# @return [Boolean] always true or false, never nil
def word_complete?
  return false if @current.nil?
  word = @current[:word]
  coords = @current[:coordinates]
  return false if word.nil? || word.empty?
  # coordinates may be absent; treat that as incomplete instead of raising
  !coords.nil? && coords.size == 4
end