Class: Govspeak::StructuredHeaderExtractor

Inherits:
Object
  • Object
show all
Defined in:
lib/govspeak/structured_header_extractor.rb

Instance Method Summary

Constructor Details

#initialize(document) ⇒ StructuredHeaderExtractor

Returns a new instance of StructuredHeaderExtractor.



19
20
21
22
23
# File 'lib/govspeak/structured_header_extractor.rb', line 19

# Builds an extractor for +document+, starting with no structured headers
# and an empty header stack.
#
# @param document [#headers] source document; its +headers+ are read later
#   by #headers_list
def initialize(document)
  @structured_headers = []
  @doc = document
  reset_stack
end

Instance Method Details

#add_child(header) ⇒ Object



66
67
68
# File 'lib/govspeak/structured_header_extractor.rb', line 66

# Nests +header+ beneath the header currently on top of the stack.
#
# @param header [Object] header to append to the top-of-stack header's
#   +headers+ collection
# @return [Object] whatever +<<+ on that collection returns
def add_child(header)
  parent = stack.last
  parent.headers << header
end

#add_sibling(header) ⇒ Object



61
62
63
64
# File 'lib/govspeak/structured_header_extractor.rb', line 61

# Attaches +header+ next to the header on top of the stack: that header is
# popped, and +header+ is appended to the +headers+ of the entry revealed
# underneath it.
#
# @param header [Object] header to append
# @return [Object] whatever +<<+ on the parent's +headers+ returns
def add_sibling(header)
  stack.pop
  shared_parent = stack.last
  shared_parent.headers << header
end

#add_top_level(header) ⇒ Object



56
57
58
59
# File 'lib/govspeak/structured_header_extractor.rb', line 56

# Records +header+ as a new root entry in the result list and clears the
# stack, so nesting starts afresh.
#
# @param header [Object] header to append to +structured_headers+
# @return [Object] the result of #reset_stack
def add_top_level(header)
  structured_headers << header
  reset_stack
end

#add_uncle_or_aunt(header) ⇒ Object



70
71
72
73
# File 'lib/govspeak/structured_header_extractor.rb', line 70

# Attaches +header+ further up the tree than the previous header: the stack
# is first unwound via #pop_stack_to_level, then +header+ is appended to the
# +headers+ of whichever entry is left on top.
#
# @param header [Object] header to append
# @return [Object] whatever +<<+ on the new parent's +headers+ returns
def add_uncle_or_aunt(header)
  pop_stack_to_level(header)
  new_parent = stack.last
  new_parent.headers << header
end

#call ⇒ Object



25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# File 'lib/govspeak/structured_header_extractor.rb', line 25

# Walks #headers_list in order and arranges the headers into a tree.
#
# Headers above the top level are skipped. Every other header is placed
# relative to the header on top of the stack, and each placed header is then
# pushed onto the stack so the next one is compared against it.
#
# @return [Object] +structured_headers+, holding the top-level headers
def call
  headers_list.each do |heading|
    next if header_higher_than_top_level?(heading)

    # Branch order matters: the top-level check must win before any
    # comparison against the previous header.
    case
    when heading.top_level?
      add_top_level(heading)
    when header_at_same_level_as_prev?(heading)
      add_sibling(heading)
    when header_one_level_lower_than_prev?(heading)
      add_child(heading)
    when header_at_higher_level_than_prev?(heading)
      add_uncle_or_aunt(heading)
    else
      next # ignore semantically invalid headers
    end

    stack.push(heading)
  end

  structured_headers
end

#header_at_higher_level_than_prev?(header) ⇒ Boolean

Returns:

  • (Boolean)


88
89
90
91
# File 'lib/govspeak/structured_header_extractor.rb', line 88

# Reports whether +header+ sits at a higher level than the header on top of
# the stack.
#
# @param header [#level] header being placed
# @return [Boolean] +false+ when the stack is empty (the original
#   short-circuit returned +nil+ in that case)
def header_at_higher_level_than_prev?(header)
  previous = stack.last
  return false unless previous

  # higher level means level integer is lower
  previous.level > header.level
end

#header_at_same_level_as_prev?(header) ⇒ Boolean

Returns:

  • (Boolean)


79
80
81
# File 'lib/govspeak/structured_header_extractor.rb', line 79

# Reports whether +header+ has the same level as the header on top of the
# stack.
#
# @param header [#level] header being placed
# @return [Boolean] +false+ when the stack is empty (the original
#   short-circuit returned +nil+ in that case)
def header_at_same_level_as_prev?(header)
  previous = stack.last
  return false unless previous

  previous.level == header.level
end

#header_higher_than_top_level?(header) ⇒ Boolean

Returns:

  • (Boolean)


75
76
77
# File 'lib/govspeak/structured_header_extractor.rb', line 75

# Reports whether +header+ ranks above its own top level, i.e. its +level+
# number is smaller than its +top_level+ number.
#
# @param header [#level, #top_level] header being examined
# @return [Boolean]
def header_higher_than_top_level?(header)
  header.top_level > header.level
end

#header_one_level_lower_than_prev?(header) ⇒ Boolean

Returns:

  • (Boolean)


83
84
85
86
# File 'lib/govspeak/structured_header_extractor.rb', line 83

# Reports whether +header+ is exactly one level below the header on top of
# the stack.
#
# @param header [#level] header being placed
# @return [Boolean] +false+ when the stack is empty (the original
#   short-circuit returned +nil+ in that case)
def header_one_level_lower_than_prev?(header)
  previous = stack.last
  return false unless previous

  # lower level means level integer is higher
  header.level - previous.level == 1
end

#headers_list ⇒ Object



50
51
52
53
54
# File 'lib/govspeak/structured_header_extractor.rb', line 50

# Wraps each entry of +doc.headers+ in a StructuredHeader carrying its text,
# level, id and a new empty array, memoising the result in @headers_list.
#
# @return [Object] the memoised result of mapping +doc.headers+
def headers_list
  return @headers_list if @headers_list

  @headers_list = doc.headers.map do |raw|
    StructuredHeader.new(raw.text, raw.level, raw.id, [])
  end
end

#pop_stack_to_level(header) ⇒ Object



93
94
95
96
# File 'lib/govspeak/structured_header_extractor.rb', line 93

# Pops the stack once for every level between the top-of-stack header and
# +header+, inclusive, so the entry left on top (if any) is the one that was
# sitting below those popped entries.
#
# @param header [#level] header whose level decides how far to unwind
# @return [Integer] the number of pops requested
def pop_stack_to_level(header)
  level_gap = stack.last.level - header.level
  (level_gap + 1).times { stack.pop }
end

#reset_stack ⇒ Object



98
99
100
# File 'lib/govspeak/structured_header_extractor.rb', line 98

# Discards every header tracked so far by pointing @stack at a new, empty
# array.
#
# @return [Array] the new empty stack
def reset_stack
  @stack = []
end