Module: Slaw::Parse::Blocklists

Includes:
Namespace
Defined in:
lib/slaw/parse/blocklists.rb

Defined Under Namespace

Classes: NumberingFormat

Constant Summary

Constants included from Namespace

Namespace::NS

Class Method Summary collapse

Class Method Details

.fix_intros(doc) ⇒ Object

Change p tags preceding a blocklist into listIntroductions within the blocklist



171
172
173
174
175
176
177
178
179
# File 'lib/slaw/parse/blocklists.rb', line 171

# Turn the +p+ node that sits directly before each blockList into that
# list's +listIntroduction+, moving it inside the list as its first child.
#
# @param doc the document to rewrite in place; queried with +xpath+
def self.fix_intros(doc)
  doc.xpath('//a:blockList', a: NS).each do |blocklist|
    intro = blocklist.previous
    # only a +p+ immediately before the list counts as its introduction
    next unless intro && intro.name == 'p'

    intro.name = 'listIntroduction'
    blocklist.prepend_child(intro)
  end
end

.guess_number_format(item, prev_format = nil) ⇒ Object



120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
# File 'lib/slaw/parse/blocklists.rb', line 120

# Guess the numbering format of a blocklist item from its number.
#
# @param item the blocklist item; must respond to +num+,
#   +previous_element+ and +next_element+
# @param prev_format the format to return unchanged when the number is
#   "(u)", "(v)" or "(x)"
# @return the guessed NumberingFormat, +prev_format+ for the numbers above,
#   or nil when the item has no number
def self.guess_number_format(item, prev_format=nil)
  label = item.num
  return nil unless label

  before = item.previous_element
  after  = item.next_element

  case label
  when "(i)"
    # "(i)" is either the letter between (h) and (j), or the first roman
    # numeral of a nested (i), (ii), ... list:
    #
    #   (h) foo            (h) foo
    #   (i) bar     vs       (i)  bar
    #   (j) baz              (ii) baz
    #
    # It is treated as a letter only when the previous item starts with
    # "(h" and the next item, if there is one, is not "(ii)".
    continues_letters = before && before.num =~ /^\(h/ && !(after && after.num == "(ii)")
    continues_letters ? NumberingFormat.a : NumberingFormat.i
  when "(u)", "(v)", "(x)" then prev_format
  when /^\([ivx]+/         then NumberingFormat.i
  when /^\([IVX]+/         then NumberingFormat.I
  when /^\([a-z]{2}/       then NumberingFormat.aa
  when /^\([A-Z]{2}/       then NumberingFormat.AA
  when /^\([a-z]+/         then NumberingFormat.a
  when /^\([A-Z]+/         then NumberingFormat.A
  when /^\d+(\.\d+)+$/
    # dotted numbers such as 1.2.3: the second argument is the dot count
    NumberingFormat.new(:'i.i', label.count('.'))
  else
    NumberingFormat.unknown
  end
end

.nest_blocklist_items(items, our_number_format, list, prev) ⇒ Object

Nests blocklist items, starting with the first element of `items`. A new nested blocklist is started when the numbering format changes.



47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# File 'lib/slaw/parse/blocklists.rb', line 47

# Walk +items+ (consumed from the front with +shift+) and re-nest them
# according to their numbering format.
#
# The format of each item is guessed with +guess_number_format+ and
# compared with +our_number_format+:
#   - same format: the item is appended to +list+ (when +list+ is given)
#     and its id is rebuilt from the list's id;
#   - a format that compares less than +our_number_format+: the item is
#     pushed back onto +items+ and the loop stops, leaving it for the
#     caller;
#   - any other format: a new +blockList+ is created inside +prev+, the
#     item becomes its first entry, and the method recurses to collect
#     the items that follow.
#
# @param items [Array] the items still to be processed; mutated in place
# @param our_number_format the numbering format of the list being filled
# @param list the blockList that receives same-format items, or nil to
#   leave those items where they are
# @param prev the item handled just before the first of +items+, or nil
# @return [nil]
def self.nest_blocklist_items(items, our_number_format, list, prev)
  return if items.empty?
  item = items.shift

  # counts the sublists created by this call; used to build their ids
  sublist_count = 0
  number_format = our_number_format

  # stop at the end of the queue or at the first node not named 'item'
  while item and item.name == 'item'
    # the previous guess is passed along because "(u)", "(v)" and "(x)"
    # simply inherit it
    number_format = guess_number_format(item, number_format)
    # a nil guess (e.g. the item has no num) ends this list
    break unless number_format

    # (aa) after (z) is same numbering type, pretend we've always
    # been this format
    if item.num == "(aa)" and item.previous_element and item.previous_element.num == "(z)"
      our_number_format = number_format
    end

    if number_format != our_number_format
      # new sublist, or back to the old list?
      if number_format < our_number_format
        # back to the old list: hand the item back so the caller sees it
        items.unshift(item)
        break
      else
        # new sublist.
        #
        # The blockList is inserted as a child of +prev+ (the item handled
        # just before +item+), and the element following +prev+'s +num+
        # becomes the +listIntroduction+ of the new list.
        sublist = item.document.create_element('blockList', id: prev['id'] + ".list#{sublist_count}")
        sublist_count += 1

        # list intro
        num = prev.at_xpath('a:num', a: NS)
        if intro = num.next_element
          intro.name = 'listIntroduction'
          sublist << intro
        end

        # make +item+ the first in this list; its id is the sublist id
        # plus the num with the parentheses stripped
        item['id'] = sublist['id'] + ".#{item.num.gsub(/[()]/, '')}"
        sublist << item

        # insert this list as a child of the previous item
        prev << sublist

        # now keep walking item's (old) siblings
        # and pull in those elements that match our numbering
        # scheme
        nest_blocklist_items(items, number_format, sublist, item)
        # NOTE(review): after the recursion returns, +number_format+ still
        # holds the sublist's format and +prev+ (set below) becomes the
        # sublist's first item rather than its parent — confirm this is
        # intended when "(u)"/"(v)"/"(x)" or another sublist follows.
      end
    else
      # same number format

      # if this num is (i), we're numbering in :i, this isn't the first
      # element in this list, then assume we're following (h) with (i):
      # hand the item back and let the caller deal with it
      if number_format.type == :i && item.num == "(i)" && prev
        items.unshift(item)
        break
      else
        # keep it with this list
        if list
          list << item
          item['id'] = list['id'] + ".#{item.num.gsub(/[()]/, '')}"
        end
      end
    end

    prev = item
    item = items.shift
  end
end

.nest_blocklists(doc) ⇒ Object

Correctly re-nest nested block lists that are tagged with the “renest” attribute.

We do this by identifying the numbering format of each item in the list and comparing it with the surrounding elements. When the numbering format changes, we start a new nested list.

We make sure to handle special cases such as `(i)` coming between `(h)` and `(j)` versus being at the start of a `(i), (ii), (iii)` list.

(a)
(b)
(i)
(ii)
(aa)
(bb)
(c)
(d)

becomes

(a)
(b)
  (i)
  (ii)
    (aa)
    (bb)
(c)
(d)

Parameters:

  • doc (Nokogiri::XML::Document)

    the document



37
38
39
40
41
42
43
# File 'lib/slaw/parse/blocklists.rb', line 37

# Re-nest every blockList carrying a +renest+ attribute.
#
# The attribute is removed from each such list, and the list's +a:item+
# children (if any) are passed to +nest_blocklist_items+, using the guessed
# numbering format of the first item as the starting format.
#
# @param doc [Nokogiri::XML::Document] the document
def self.nest_blocklists(doc)
  doc.xpath('//a:blockList[@renest]', a: NS).each do |blocklist|
    blocklist.remove_attribute('renest')

    children = blocklist.xpath('a:item', a: NS)
    next if children.empty?

    queue = children.to_a
    nest_blocklist_items(queue, guess_number_format(children.first), nil, nil)
  end
end