Class: Govspeak::StructuredHeaderExtractor

Inherits:
Object
  • Object
show all
Defined in:
lib/govspeak/structured_header_extractor.rb

Instance Method Summary collapse

Constructor Details

#initialize(document) ⇒ StructuredHeaderExtractor

Returns a new instance of StructuredHeaderExtractor.



20
21
22
23
24
# File 'lib/govspeak/structured_header_extractor.rb', line 20

# Prepares an extractor for the given document.
#
# Starts with an empty result list and an empty ancestry stack; nothing is
# read from the document here.
#
# @param document [Object] stored in @doc (presumably the object whose
#   #headers are read by #headers_list -- confirm against the attr_reader)
def initialize(document)
  @structured_headers = [] # finished tree of top-level headers
  @doc = document
  reset_stack # start with no ancestors being tracked
end

Instance Method Details

#add_child(header) ⇒ Object



67
68
69
# File 'lib/govspeak/structured_header_extractor.rb', line 67

# Nests +header+ under the header currently on top of the stack.
#
# The stack itself is left untouched here.
#
# @param header [Object] the header to attach
# @return [Array] the parent's list of child headers, after the append
def add_child(header)
  parent = stack.last
  parent.headers.push(header)
end

#add_sibling(header) ⇒ Object



62
63
64
65
# File 'lib/govspeak/structured_header_extractor.rb', line 62

# Attaches +header+ next to the previously processed header.
#
# The previous header is removed from the stack, which exposes the parent
# the two headers share; +header+ is then appended to that parent.
#
# @param header [Object] the header to attach
# @return [Array] the shared parent's list of child headers
def add_sibling(header)
  stack.pop # drop the previous sibling so the parent is on top
  shared_parent = stack.last
  shared_parent.headers.push(header)
end

#add_top_level(header) ⇒ Object



57
58
59
60
# File 'lib/govspeak/structured_header_extractor.rb', line 57

# Records +header+ as a new root of the result tree.
#
# The ancestry stack is cleared afterwards, so headers seen earlier are no
# longer candidates for parent or sibling.
#
# @param header [Object] the header to add at the top of the tree
# @return [Object] whatever #reset_stack returns
def add_top_level(header)
  structured_headers << header
  reset_stack
end

#add_uncle_or_aunt(header) ⇒ Object



71
72
73
74
# File 'lib/govspeak/structured_header_extractor.rb', line 71

# Attaches +header+ further up the tree than the previous header.
#
# The stack is first unwound via #pop_stack_to_level; +header+ is then
# appended to whichever header ends up on top.
#
# @param header [Object] the header to attach
# @return [Array] the new parent's list of child headers
def add_uncle_or_aunt(header)
  pop_stack_to_level(header)
  new_parent = stack.last
  new_parent.headers.push(header)
end

#call ⇒ Object



26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# File 'lib/govspeak/structured_header_extractor.rb', line 26

# Builds the nested header tree and returns its top-level headers.
#
# Each header from #headers_list is placed relative to the previously
# accepted header (the top of the stack):
# * headers above the top level are skipped;
# * a top-level header starts a new root;
# * a header at the same level becomes a sibling;
# * a header exactly one level lower becomes a child;
# * a header at a higher level is attached further up the tree;
# * anything else (e.g. skipping a level downwards, or a nested header
#   before any top-level one) is ignored.
#
# The work is done only once per instance. #headers_list is memoised and
# its header objects are mutated in place, so running the loop again would
# append every header a second time and duplicate the whole tree.
#
# @return [Array] the top-level headers, each carrying its nested headers
def call
  # Already built: hand back the same result instead of re-appending.
  return structured_headers if @extracted

  headers_list.each do |header|
    next if header_higher_than_top_level?(header)

    if header.top_level?
      add_top_level(header)
    elsif header_at_same_level_as_prev?(header)
      add_sibling(header)
    elsif header_one_level_lower_than_prev?(header)
      add_child(header)
    elsif header_at_higher_level_than_prev?(header)
      add_uncle_or_aunt(header)
    else
      next # ignore semantically invalid headers
    end

    # Only accepted headers become the "previous" header for the next one.
    stack.push(header)
  end

  @extracted = true
  structured_headers
end

#header_at_higher_level_than_prev?(header) ⇒ Boolean

Returns:

  • (Boolean)


89
90
91
92
# File 'lib/govspeak/structured_header_extractor.rb', line 89

# Tells whether +header+ sits higher in the hierarchy than the header on
# top of the stack.
#
# @param header [Object] the header being placed
# @return [Boolean, nil] nil when the stack is empty
def header_at_higher_level_than_prev?(header)
  previous = stack.last
  return unless previous

  # a numerically smaller level is a higher position in the hierarchy
  header.level < previous.level
end

#header_at_same_level_as_prev?(header) ⇒ Boolean

Returns:

  • (Boolean)


80
81
82
# File 'lib/govspeak/structured_header_extractor.rb', line 80

# Tells whether +header+ has the same level as the header on top of the
# stack.
#
# @param header [Object] the header being placed
# @return [Boolean, nil] nil when the stack is empty
def header_at_same_level_as_prev?(header)
  previous = stack.last
  return unless previous

  previous.level == header.level
end

#header_higher_than_top_level?(header) ⇒ Boolean

Returns:

  • (Boolean)


76
77
78
# File 'lib/govspeak/structured_header_extractor.rb', line 76

# Tells whether +header+ ranks above the level the header itself reports
# as its top level, i.e. its level number is smaller than +top_level+.
#
# @param header [Object] the header to check
# @return [Boolean]
def header_higher_than_top_level?(header)
  header.top_level > header.level
end

#header_one_level_lower_than_prev?(header) ⇒ Boolean

Returns:

  • (Boolean)


84
85
86
87
# File 'lib/govspeak/structured_header_extractor.rb', line 84

# Tells whether +header+ is exactly one step below the header on top of
# the stack.
#
# @param header [Object] the header being placed
# @return [Boolean, nil] nil when the stack is empty
def header_one_level_lower_than_prev?(header)
  previous = stack.last
  return unless previous

  # one step lower in the hierarchy means the level number is one greater
  header.level - previous.level == 1
end

#headers_list ⇒ Object



51
52
53
54
55
# File 'lib/govspeak/structured_header_extractor.rb', line 51

# Flat list of the document's headers wrapped as StructuredHeader objects.
#
# Each wrapper copies the source header's text, level and id and starts
# with its own empty list of nested headers. The list is built on first
# use and memoised in @headers_list.
#
# @return [Array<StructuredHeader>]
def headers_list
  @headers_list ||= doc.headers.map do |source|
    StructuredHeader.new(source.text, source.level, source.id, [])
  end
end

#pop_stack_to_level(header) ⇒ Object



94
95
96
97
# File 'lib/govspeak/structured_header_extractor.rb', line 94

# Unwinds the stack so that +header+ can be attached further up the tree.
#
# The number of entries removed is the level difference between the top of
# the stack and +header+, plus one. When each stack entry is one level
# below the entry before it (as #call builds it), this leaves the entry one
# level above +header+ on top.
#
# @param header [Object] the header about to be attached
# @return [Integer] the number of pops performed
def pop_stack_to_level(header)
  level_gap = stack.last.level - header.level
  (level_gap + 1).times { stack.pop }
end

#reset_stack ⇒ Object



99
100
101
# File 'lib/govspeak/structured_header_extractor.rb', line 99

# Forgets all tracked ancestry by pointing @stack at a fresh empty array.
#
# Called from #initialize and again whenever a new top-level header is
# recorded (see #add_top_level).
#
# @return [Array] the new, empty stack
def reset_stack
  @stack = []
end