Class: IiifPrint::TextExtraction::AltoReader::AltoDocStream

Inherits:
Nokogiri::XML::SAX::Document
  • Object
show all
Defined in:
lib/iiif_print/text_extraction/alto_reader.rb

Overview

SAX Document Stream class to gather text and word tokens from ALTO

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(image_width = nil) ⇒ AltoDocStream

Returns a new instance of AltoDocStream.



17
18
19
20
21
22
23
24
25
26
# File 'lib/iiif_print/text_extraction/alto_reader.rb', line 17

# Set up an empty SAX handler for one ALTO document.
#
# @param image_width [Integer, nil] width of the page image (presumably in
#   pixels); compute_scaling compares it with the ALTO Page WIDTH to derive
#   a scaling factor. When nil, compute_scaling leaves scaling at 1.0.
def initialize(image_width = nil)
  super()
  # scaling matters:
  @image_width = image_width
  @scaling = 1.0 # pt to px, if ALTO using points
  # plain text buffer; unary plus yields an unfrozen String, so the `<<`
  # appends made by the SAX callbacks keep working even when string
  # literals are frozen (e.g. under `# frozen_string_literal: true`):
  @text = +''
  # list of word hash, containing word+coord:
  @words = []
end

Instance Attribute Details

#text ⇒ Object

Returns the value of attribute text.



15
16
17
# File 'lib/iiif_print/text_extraction/alto_reader.rb', line 15

# Reader for the plain-text buffer. Returns the @text String itself (no
# copy); it starts empty in initialize, is appended to by start_element and
# end_element, and is whitespace-normalized by end_document.
def text
  @text
end

#words ⇒ Object

Returns the value of attribute words.



15
16
17
# File 'lib/iiif_print/text_extraction/alto_reader.rb', line 15

# Reader for the collected word list. Returns the @words Array itself (no
# copy); start_element appends one Hash per String element, with :word and
# :coordinates keys.
def words
  @words
end

Instance Method Details

#compute_scaling(attrs) ⇒ Object



40
41
42
43
44
45
46
47
# File 'lib/iiif_print/text_extraction/alto_reader.rb', line 40

# Derive the scaling factor between ALTO page units and the image width
# from the attributes of a Page element.
#
# @scaling is left untouched when no image width was supplied, when the
# element carries no WIDTH attribute, when WIDTH is not a positive number,
# or when the page width already equals the image width.
#
# @param attrs [Array<Array>] attribute [name, value] pairs, as passed to
#   start_element.
# @return [Float, nil] the newly stored scaling factor, or nil when
#   scaling was left unchanged.
def compute_scaling(attrs)
  return if @image_width.nil?
  # find returns nil (not an empty Array) when no attribute matches
  match = attrs.find { |e| e[0].casecmp?('WIDTH') }
  return if match.nil?
  page_width = match[1].to_i
  # a zero width would store a scaling of 0.0, and scale_value would then
  # raise FloatDomainError when converting the division result to Integer
  return unless page_width.positive?
  return if @image_width == page_width
  @scaling = page_width / @image_width.to_f
end

#end_document ⇒ Object

Callback for completion of parsing ALTO, used to normalize generated

text content (strip unneeded whitespace incidental to output).


82
83
84
85
86
87
# File 'lib/iiif_print/text_extraction/alto_reader.rb', line 82

# SAX callback fired once parsing is complete. Tidies the accumulated text:
# every line loses its leading and trailing whitespace, then the buffer as
# a whole is stripped.
def end_document
  lines = @text.split("\n")
  # split hands back fresh strings, so stripping them in place is safe
  lines.each(&:strip!)
  @text = lines.join("\n")
  # drop any blank lines left at the very start of the buffer
  @text.strip!
end

#end_element(name) ⇒ Object

Callback for element end, used here to manage endings of lines and

blocks.

Parameters:

  • name (String)

    element name.



74
75
76
77
78
# File 'lib/iiif_print/text_extraction/alto_reader.rb', line 74

# SAX callback for a closing tag. Appends the separator matching the
# element that just ended: a space after a String, a newline after a
# TextBlock or a TextLine. Any other element leaves the text untouched.
#
# @param name [String] element name.
def end_element(name)
  case name
  when 'String'
    @text << " "
  when 'TextBlock'
    @text << "\n"
  end
  @text << "\n" if name == 'TextLine'
end

#s_coords(attrs) ⇒ Array

Return coordinates from String element attribute hash

Parameters:

  • attrs (Hash)

    hash containing ALTO `String` element attributes.

Returns:

  • (Array)

    Array of position x, y, width, height in px.



32
33
34
35
36
37
38
# File 'lib/iiif_print/text_extraction/alto_reader.rb', line 32

# Build the scaled bounding box of a String element.
#
# Each of HPOS, VPOS, WIDTH and HEIGHT is read from the attribute hash
# (a missing attribute counts as 0), converted with to_i and passed through
# scale_value.
#
# @param attrs [Hash] ALTO String element attributes keyed by name.
# @return [Array<Integer>] [hpos, vpos, width, height] after scaling.
def s_coords(attrs)
  %w[HPOS VPOS WIDTH HEIGHT].map do |key|
    scale_value((attrs[key] || 0).to_i)
  end
end

#scale_value(v) ⇒ Object



49
50
51
# File 'lib/iiif_print/text_extraction/alto_reader.rb', line 49

# Divide a coordinate by the current scaling factor (@scaling) and
# truncate the quotient to an Integer.
#
# @param v [Numeric] coordinate value to convert.
# @return [Integer] the scaled, truncated value.
def scale_value(v)
  scaled = v / @scaling
  scaled.to_i
end

#start_element(name, attrs = []) ⇒ Object

Callback for element start, implementation of which ignores

non-String elements.

Parameters:

  • name (String)

    element name.

  • attrs (Array) (defaults to: [])

    Array of key, value pair Arrays.



58
59
60
61
62
63
64
65
66
67
68
# File 'lib/iiif_print/text_extraction/alto_reader.rb', line 58

# SAX callback for an opening tag.
#
# A Page element triggers compute_scaling. A String element contributes its
# CONTENT to the text buffer and a word entry (token plus scaled
# coordinates) to the word list. All other elements are ignored.
#
# @param name [String] element name.
# @param attrs [Array<Array>] attribute [name, value] pairs.
# @return [Array<Hash>, nil] the word list after appending, or nil when
#   nothing was recorded.
def start_element(name, attrs = [])
  compute_scaling(attrs) if name == 'Page'
  return if name != 'String'
  # only String elements need the attribute lookup table
  values = attrs.to_h
  token = values['CONTENT']
  # a String without CONTENT has no text to record; appending nil to the
  # buffer would raise TypeError
  return if token.nil?
  @text << token
  @words << {
    word: token,
    coordinates: s_coords(values)
  }
end