Class: Govspeak::StructuredHeaderExtractor

Inherits:
Object
  • Object
show all
Defined in:
lib/govspeak/structured_header_extractor.rb

Instance Method Summary collapse

Constructor Details

#initialize(document) ⇒ StructuredHeaderExtractor

Returns a new instance of StructuredHeaderExtractor.



19
20
21
22
23
# File 'lib/govspeak/structured_header_extractor.rb', line 19

# Sets up an extractor for the given document with an empty result list and
# an empty ancestor stack.
#
# @param document [Object] the source document; presumably exposed through a
#   `doc` reader that is not visible in this excerpt — TODO confirm
def initialize(document)
  @structured_headers = []
  @doc = document
  # Kept last so the constructor body still evaluates to reset_stack's result.
  reset_stack
end

Instance Method Details

#add_auto_numbering(structured_headers, levels, prefix: "") ⇒ Object



101
102
103
104
105
106
# File 'lib/govspeak/structured_header_extractor.rb', line 101

# Prefixes header text with a hierarchical number ("1. ", "1.1 ", "1.2 ", ...),
# mutating each header's :text in place and recursing into its :headers.
#
# Every header in a list consumes a position, even when its level is not in
# +levels+, so numbering of nested headers still reflects the parent position.
#
# @param structured_headers [Array] headers responding to [] / []= for
#   :text, :level and :headers
# @param levels [#include?] header levels that should receive a number
# @param prefix [String] number of the enclosing header including its
#   trailing dot; empty for the outermost list
# @return [Array] the +structured_headers+ collection that was passed in
def add_auto_numbering(structured_headers, levels, prefix: "")
  structured_headers.each_with_index do |entry, offset|
    position = offset + 1

    if levels.include?(entry[:level])
      # Only outermost numbers carry a trailing dot ("1." vs "1.1").
      separator = prefix == "" ? "." : ""
      entry[:text] = "#{prefix}#{position}#{separator} #{entry[:text]}"
    end

    add_auto_numbering(entry[:headers], levels, prefix: "#{prefix}#{position}.")
  end
end

#add_child(header) ⇒ Object



65
66
67
# File 'lib/govspeak/structured_header_extractor.rb', line 65

# Appends +header+ to the sub-headers of the header currently on top of the
# stack.
#
# @param header [Object] the header to nest
# @return [Object] the result of appending to the parent's headers collection
def add_child(header)
  parent = stack.last
  parent.headers << header
end

#add_sibling(header) ⇒ Object



60
61
62
63
# File 'lib/govspeak/structured_header_extractor.rb', line 60

# Attaches +header+ next to the previously processed header: the previous
# header is popped off the stack and +header+ is appended to the headers of
# whatever is then on top.
#
# @param header [Object] the header to attach
# @return [Object] the result of appending to the parent's headers collection
def add_sibling(header)
  stack.pop
  parent = stack.last
  parent.headers << header
end

#add_top_level(header) ⇒ Object



55
56
57
58
# File 'lib/govspeak/structured_header_extractor.rb', line 55

# Records +header+ as a new root entry in the result list and clears the
# ancestor stack.
#
# @param header [Object] the header to add at the root of the structure
# @return [Object] whatever reset_stack returns
def add_top_level(header)
  structured_headers << header
  reset_stack
end

#add_uncle_or_aunt(header) ⇒ Object



69
70
71
72
# File 'lib/govspeak/structured_header_extractor.rb', line 69

# Attaches +header+ further up the tree than the previous header: the stack is
# first unwound via pop_stack_to_level, then +header+ is appended to the
# headers of the entry left on top.
#
# @param header [Object] the header to attach
# @return [Object] the result of appending to the ancestor's headers collection
def add_uncle_or_aunt(header)
  pop_stack_to_level(header)
  ancestor = stack.last
  ancestor.headers << header
end

#call ⇒ Object



25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# File 'lib/govspeak/structured_header_extractor.rb', line 25

# Walks headers_list in order and files each header into the nested structure,
# then optionally numbers the result.
#
# Headers above the top level are skipped outright. A header that matches none
# of the placement rules is ignored and is NOT pushed onto the stack, so it
# never becomes the "previous" header for later comparisons.
#
# @return [Object] structured_headers, after any auto-numbering
def call
  headers_list.each do |candidate|
    next if header_higher_than_top_level?(candidate)

    # The rules are order-sensitive: the first matching one wins.
    placed = true
    if candidate.top_level?
      add_top_level(candidate)
    elsif header_at_same_level_as_prev?(candidate)
      add_sibling(candidate)
    elsif header_one_level_lower_than_prev?(candidate)
      add_child(candidate)
    elsif header_at_higher_level_than_prev?(candidate)
      add_uncle_or_aunt(candidate)
    else
      placed = false # semantically invalid header: ignore it
    end

    stack.push(candidate) if placed
  end

  if doc.auto_numbered_headers
    add_auto_numbering(structured_headers, doc.auto_numbered_header_levels)
  end

  structured_headers
end

#header_at_higher_level_than_prev?(header) ⇒ Boolean

Returns:

  • (Boolean)


87
88
89
90
# File 'lib/govspeak/structured_header_extractor.rb', line 87

# Whether +header+ sits higher in the hierarchy than the header on top of the
# stack. "Higher" means a numerically smaller level.
#
# @param header [#level] the header being placed
# @return [Boolean, nil] nil when the stack is empty
def header_at_higher_level_than_prev?(header)
  previous = stack.last
  previous && (previous.level > header.level)
end

#header_at_same_level_as_prev?(header) ⇒ Boolean

Returns:

  • (Boolean)


78
79
80
# File 'lib/govspeak/structured_header_extractor.rb', line 78

# Whether +header+ has the same level as the header on top of the stack.
#
# @param header [#level] the header being placed
# @return [Boolean, nil] nil when the stack is empty
def header_at_same_level_as_prev?(header)
  previous = stack.last
  return previous unless previous

  previous.level == header.level
end

#header_higher_than_top_level?(header) ⇒ Boolean

Returns:

  • (Boolean)


74
75
76
# File 'lib/govspeak/structured_header_extractor.rb', line 74

# Whether +header+'s level is numerically below the top level it reports for
# itself, i.e. it sits above the highest level this extractor structures.
#
# @param header [#level, #top_level] the header being inspected
# @return [Boolean]
def header_higher_than_top_level?(header)
  own_level = header.level
  ceiling = header.top_level
  own_level < ceiling
end

#header_one_level_lower_than_prev?(header) ⇒ Boolean

Returns:

  • (Boolean)


82
83
84
85
# File 'lib/govspeak/structured_header_extractor.rb', line 82

# Whether +header+ is exactly one level below the header on top of the stack.
# "Lower" means a numerically larger level.
#
# @param header [#level] the header being placed
# @return [Boolean, nil] nil when the stack is empty
def header_one_level_lower_than_prev?(header)
  previous = stack.last
  return previous unless previous

  level_difference = previous.level - header.level
  level_difference == -1
end

#headers_list ⇒ Object



49
50
51
52
53
# File 'lib/govspeak/structured_header_extractor.rb', line 49

# Flat, memoized list of StructuredHeader objects built from doc.headers.
# Each one copies the source header's text, level and id and starts with its
# own empty list of sub-headers.
#
# @return [Array<StructuredHeader>]
def headers_list
  return @headers_list if @headers_list

  @headers_list = doc.headers.map do |source|
    StructuredHeader.new(source.text, source.level, source.id, [])
  end
end

#pop_stack_to_level(header) ⇒ Object



92
93
94
95
# File 'lib/govspeak/structured_header_extractor.rb', line 92

# Unwinds the stack so that its top entry becomes the parent for +header+.
#
# One entry is popped for every level between the current top and +header+,
# plus one more for the entry at +header+'s own level.
#
# @param header [#level] the header about to be attached
# @return [Integer] the number of pops requested
def pop_stack_to_level(header)
  levels_above_header = stack.last.level - header.level
  pops_needed = levels_above_header + 1
  pops_needed.times do
    stack.pop
  end
end

#reset_stack ⇒ Object



97
98
99
# File 'lib/govspeak/structured_header_extractor.rb', line 97

# Replaces the ancestor stack with a fresh empty array, discarding whatever
# chain of headers was being tracked.
#
# @return [Array] the new, empty stack
def reset_stack
  @stack = []
end