Class: Factbook::ItemBuilder

Inherits:
Object
  • Object
show all
Includes:
NormalizeHelper, LogUtils::Logging
Defined in:
lib/factbook-readers/builder_item.rb

Overview

rename to ItemReader or ItemParser - why? why not?

Instance Method Summary collapse

Methods included from NormalizeHelper

#normalize_category

Constructor Details

#initialize(html, name) ⇒ ItemBuilder

Returns a new instance of ItemBuilder.



9
10
11
12
# File 'lib/factbook-readers/builder_item.rb', line 9

##
# Sets up a builder for a single field's markup.
#
# @param html [String] html snippet holding the field's data
# @param name [String] category/field name (e.g. Area, Location, etc.)
def initialize(html, name)
  @name = name
  @html = html
end

Instance Method Details

#read ⇒ Object



26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
# File 'lib/factbook-readers/builder_item.rb', line 26

##
# Parses the html snippet passed to the constructor into a hash.
#
# Only the top-level <div> children of the fragment are looked at. Each div
# is dispatched on its class attribute (substring match) and on whether it
# holds a span.subfield-name:
#
#   - class contains 'note'             => data['note'] = { 'text' => ... }
#   - class contains 'historic'         => appended to data['text'] with ' / '
#   - no span.subfield-name             => data['text']  ("implied" text field)
#   - class contains 'grouped_subfield' => all subfield names joined with
#                                          ' / ' as key => { 'text' => ... }
#   - otherwise                         => normalized subfield name as key
#                                          => { 'text' => ... }
#
# If more than one div is marked 'grouped_subfield', those divs are handled
# up front: a div with a span.subfield-group starts a new entry and the
# following divs without one get appended to that entry.
#
# Progress and the resulting hash are printed to stdout. On unexpected
# markup the method prints an "!! ERROR" message and terminates the
# process with exit status 1.
#
# @return [Hash] the collected field data
def read
  doc = Nokogiri::HTML.fragment( @html )

  data = {}

  ## note:
  ##   skip whitespace text nodes (e.g. \n\n etc); just use divs
  doc_children = doc.children.filter('div')

  puts "  parsing >#{@name}< - #{doc_children.size} category_data divs(s):"

  ## handle special case for
  ##  multiple 'grouped_subfield' first
  ##  e.g. used in
  ##   - Drinking water source:
  ##   - Sanitation facility access:
  ##
  ## note: a div without a class attribute returns nil for div['class'];
  ##   use to_s so such divs count as "not grouped" instead of raising
  grouped_children, other_children = doc_children.partition do |div|
    div['class'].to_s.include?( 'grouped_subfield' )
  end

  ## note: only use special rule if more than one div marked grouped_
  if grouped_children.size > 1
    ## continue processing the rest as usual
    doc_children = other_children

    key = nil
    grouped_children.each do |div|
      span_group = div.at( 'span.subfield-group' )
      if span_group
        # start a new group
        key = normalize_category( span_group.text.strip )
        span_group.replace( '' )   ## remove the name; keep the values only

        text = squish( div.text.strip )
        puts "new group - category_data key >#{key}<: >#{text}<"
        data[ key ] = { 'text' => text }
      else
        ## append to (last) group
        text = squish( div.text.strip )
        puts "add group - category_data key >#{key}<: >#{text}<"

        ## no group started yet - nothing to append to
        unless data[ key ]
          puts "!! ERROR - grouped_subfield W/O preceding subfield-group:"
          puts div.to_html
          exit 1
        end

        data[ key ]['text'] += " / #{text}"
      end
    end
  end


  doc_children.each_with_index do |div,i|
    ## nil-safe; a missing class attribute falls through to the name checks
    css_class = div['class'].to_s

    if css_class.include?( 'note' )
      text = squish( div.text.strip )
      puts "category_data: >#{text}<"

      ## note: for now only allow one note per subsection/field data block
      if data['note']
        puts "!! ERROR: note already taken:"
        puts data['note']
        puts  div.to_html
        exit 1
      end

      data['note'] = { 'text' => text }
    elsif css_class.include?( 'historic' )
      ## add all historic together into one for now
      text = squish( div.text.strip )
      puts "category_data: >#{text}<"

      if data['text']
        ## append with / for now
        data['text'] += " / #{text}"
      else
        data['text'] = text
        ## check if history is first node
        if i != 0
          puts "!! ERROR: expected first historic node to be first node but it is #{i+1}:"
          puts div.to_html
          exit 1
        end
      end
    elsif div.css( 'span.subfield-name').empty?
      ## assume "implied text field"
      ## check for index == 1 / child count == 1 - why? why not
      text = squish( div.text.strip )
      puts "category_data: >#{text}<"

      data['text'] = text

      ## must be always first node for now
      if i != 0
        puts "!! ERROR - 'implied' category W/O name NOT first div / node:"
        puts div.to_html
        exit 1
      end
    elsif css_class.include?( 'grouped_subfield' )
      ## only reached for a single grouped_subfield div
      ##  (two or more get handled up front and removed from the list)
      ##
      ## split grouped subfield!!
      ##   <span class="subfield-name">arable land:</span>
      ## <span class="subfield-number">8.6%</span>
      ## <span class="subfield-date">(2011 est.)</span>
      ##  /
      ## <span class="subfield-name">permanent crops:</span>
      ## <span class="subfield-number">0.8%</span>
      ## <span class="subfield-date">(2011 est.)</span>
      ##   /
      ## <span class="subfield-name">permanent pasture:</span>
      ## <span class="subfield-number">23.5%</span>
      ## <span class="subfield-date">(2011 est.)</span>

      ## join names for now - why? why not?
      ##  e.g. becomes:
      ##   arable land / permanent crops / permanent pasture: for key ??
      span_names = div.css( 'span.subfield-name')
      keys = span_names.map do |span|
        name = normalize_category( span.text.strip )
        span.replace( '' )   ## remove the name; keep the values only
        name
      end
      key = keys.join( ' / ')
      text = squish( div.text.strip )
      puts "category_data key >#{key}<: >#{text}<"
      data[ key ] = { 'text' => text }
    else
      ## get subfield name
      span_names = div.css( 'span.subfield-name')
      if span_names.size > 1
        puts "!! ERROR - found more than one subfield-name:"
        puts div.to_html
        exit 1
      end
      key = normalize_category( span_names[0].text.strip )
      span_names[0].replace( '' )   ## remove the name; keep the value only

      text = squish( div.text.strip )
      puts "category_data key >#{key}<: >#{text}<"
      data[ key ] = { 'text' => text }
    end
  end


  pp data
  data
end

#squish(str) ⇒ Object



179
180
181
# File 'lib/factbook-readers/builder_item.rb', line 179

##
# Collapses every run of two or more whitespace characters
# (space, tab, newline, carriage return) into one space.
# A lone whitespace character is kept as is; no stripping is done.
#
# @param str [String] text to clean up
# @return [String] new string with whitespace runs collapsed
def squish( str )
  multi_whitespace = /[ \t\n\r]{2,}/
  str.gsub( multi_whitespace, ' ' )
end