Class: Factbook::ItemBuilder
- Inherits:
-
Object
- Object
- Factbook::ItemBuilder
- Includes:
- NormalizeHelper, LogUtils::Logging
- Defined in:
- lib/factbook-readers/builder_item.rb
Overview
Rename to ItemReader or ItemParser - why? why not?
Instance Method Summary collapse
-
#initialize(html, name) ⇒ ItemBuilder
constructor
A new instance of ItemBuilder.
- #read ⇒ Object
- #squish(str) ⇒ Object
Methods included from NormalizeHelper
Constructor Details
#initialize(html, name) ⇒ ItemBuilder
Returns a new instance of ItemBuilder.
9 10 11 12 |
# File 'lib/factbook-readers/builder_item.rb', line 9

## Stores the inputs for a later call to #read.
##
## @param html [String] html snippet to parse
## @param name [String] category/field name, e.g. Area, Location, etc.
def initialize(html, name)
  @html = html
  @name = name
end
Instance Method Details
#read ⇒ Object
26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 |
# File 'lib/factbook-readers/builder_item.rb', line 26

## Parses the html snippet in @html and returns its content as a hash.
##
## Only the top-level divs of the snippet are looked at; every div is
## classified by its class attribute / its subfield-name spans:
##
##   - class contains 'note'       => data['note'] = { 'text' => ... }
##   - class contains 'historic'   => data['text'] (all joined with " / ")
##   - no span.subfield-name       => data['text'] ("implied" text field)
##   - class contains 'grouped_subfield'
##                                 => data[ names joined with " / " ] = { 'text' => ... }
##   - otherwise (one named field) => data[ name ] = { 'text' => ... }
##
## If more than one div is marked 'grouped_subfield', those divs get handled
## first by a special rule (grouped by their span.subfield-group header).
##
## Side effects: prints progress messages with puts and dumps the result
## with pp. On an unexpected structure an "!! ERROR" message plus the
## offending html gets printed and the process is terminated via exit 1.
##
## @return [Hash] the collected data (see above)
def read
  doc  = Nokogiri::HTML.fragment( @html )
  data = {}

  ## nil-safe class test - a div without any class attribute matches nothing
  ##   (div['class'] is nil then; calling index on it would raise)
  has_class = ->( div, token ) { div['class'].to_s.include?( token ) }

  ## note:
  ##   skip whitespace text nodes (e.g. \n\n etc); just use divs
  doc_children = doc.children.filter( 'div' )

  puts " parsing >#{@name}< - #{doc_children.size} category_data divs(s):"

  ## handle special case for
  ##   multiple 'grouped_subfield' first
  ##   e.g. used in
  ##    - Drinking water source:
  ##    - Sanitation facility access:
  grouped_children, other_children = doc_children.partition do |div|
    has_class.call( div, 'grouped_subfield' )
  end

  ## note: only use special rule if more than one div marked grouped_
  if grouped_children.size > 1
    ## continue processing the rest as usual
    doc_children = other_children

    key = nil
    grouped_children.each do |div|
      span_group = div.at( 'span.subfield-group' )
      if span_group
        # start a new group
        key = normalize_category( span_group.text.strip )
        span_group.replace( '' )   ## drop header; keep only the values in text

        text = squish( div.text.strip )
        puts "new group - category_data key >#{key}<: >#{text}<"
        data[ key ] = { 'text' => text }
      else
        ## append to (last) group
        ##   note: there must be a group started already to append to
        if data[ key ].nil?
          puts "!! ERROR - grouped subfield W/O preceding subfield-group:"
          puts div.to_html
          exit 1
        end

        text = squish( div.text.strip )
        puts "add group - category_data key >#{key}<: >#{text}<"
        data[ key ]['text'] += " / #{text}"
      end
    end
  end

  doc_children.each_with_index do |div,i|
    if has_class.call( div, 'note' )
      text = squish( div.text.strip )
      puts "category_data: >#{text}<"

      ## note: for now only allow one note per subsection/field data block
      if data['note']
        puts "!! ERROR: note already taken:"
        puts data['note']
        puts div.to_html
        exit 1
      end

      data['note'] = { 'text' => text }
    elsif has_class.call( div, 'historic' )
      ## add all historic together into one for now
      text = squish( div.text.strip )
      puts "category_data: >#{text}<"

      if data['text']
        ## append with / for now
        data['text'] += " / #{text}"
      else
        data['text'] = text
        ## check if history is first node
        if i != 0
          puts "!! ERROR: expected first historic node to be first node but it is #{i+1}:"
          puts div.to_html
          exit 1
        end
      end
    elsif div.css( 'span.subfield-name' ).empty?
      ## assume "implied text field"
      ## check for index == 1 / child count == 1 - why? why not
      text = squish( div.text.strip )   ## fix/todo: use strip
      puts "category_data: >#{text}<"

      data['text'] = text

      ## must be always first node for now
      if i != 0
        puts "!! ERROR - 'implied' category W/O name NOT first div / node:"
        puts div.to_html
        exit 1
      end
    elsif has_class.call( div, 'grouped_subfield' )
      ## split grouped subfield!!
      ##  <span class="subfield-name">arable land:</span>
      ##  <span class="subfield-number">8.6%</span>
      ##  <span class="subfield-date">(2011 est.)</span>
      ##  /
      ##  <span class="subfield-name">permanent crops:</span>
      ##  <span class="subfield-number">0.8%</span>
      ##  <span class="subfield-date">(2011 est.)</span>
      ##  /
      ##  <span class="subfield-name">permanent pasture:</span>
      ##  <span class="subfield-number">23.5%</span>
      ##  <span class="subfield-date">(2011 est.)</span>

      ## join names for now - why? why not?
      ##   e.g. becomes:
      ##     arable land / permanent crops / permanent pasture: for key ??
      keys = div.css( 'span.subfield-name' ).map do |span|
        name = normalize_category( span.text.strip )
        span.replace( '' )   ## drop name; keep only the values in text
        name
      end

      key  = keys.join( ' / ' )
      text = squish( div.text.strip )
      puts "category_data key >#{key}<: >#{text}<"
      data[ key ] = { 'text' => text }
    else
      ## get subfield name
      span_names = div.css( 'span.subfield-name' )
      if span_names.size > 1
        puts "!! ERROR - found more than one subfield-name:"
        puts div.to_html
        exit 1
      end

      key = normalize_category( span_names[0].text.strip )
      span_names[0].replace( '' )   ## drop name; keep only the value in text

      text = squish( div.text.strip )
      puts "category_data key >#{key}<: >#{text}<"
      data[ key ] = { 'text' => text }
    end
  end

  pp data
  data
end
#squish(str) ⇒ Object
179 180 181 |
# File 'lib/factbook-readers/builder_item.rb', line 179

## Collapses every run of two or more whitespace characters
## (blank, tab, newline, carriage return) into one single blank.
##   note: a lone whitespace character (e.g. a single newline) is kept as is.
##
## @param str [String] text to clean up
## @return [String] new string with whitespace runs collapsed
def squish( str )
  whitespace_run = /[ \t\n\r]{2,}/
  str.gsub( whitespace_run ) { ' ' }
end