Class: Factbook::Sanitizer
- Inherits:
-
Object
- Object
- Factbook::Sanitizer
- Includes:
- Utils, LogUtils::Logging
- Defined in:
- lib/factbook-readers/sanitizer.rb
Constant Summary collapse
- ARIA_ATTR_RE =
remove aria labels, e.g. the aria-label attribute in:
<span class="subfield-date" aria-label="Date of information: 2018">(2018)</span>
/\s* aria-label=('|").+?\1 ## note: use non-greedy match e.g. .+? /xim
- BR_BR_RE =
find double breaks e.g. <br> <br>
/(<br> \s* <br>) /xim
Constants included from Utils
Utils::COUNTRY_CODE_REGEX, Utils::MONTH_EN_TO_S, Utils::PAGE_INFO_REGEX, Utils::PAGE_LAST_UPDATED_REGEX
Instance Method Summary collapse
- #find_country_profile(html) ⇒ Object
- #sanitize(html) ⇒ Object
- #sanitize_data(el, title:) ⇒ Object
- #squish(str) ⇒ Object
Methods included from Utils
#data_to_csv, #find_country_code, #find_page_info, #find_page_last_updated, #values_to_csv
Instance Method Details
#find_country_profile(html) ⇒ Object
46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 |
# File 'lib/factbook-readers/sanitizer.rb', line 46
#
# Cuts the country profile out of a full page and rebuilds it as a flat
# html string made of <h2> (section), <h3> (field) and data <div> blocks.
#
# Everything outside of <ul class="expandcollapse"> is dropped.
# Prints progress / warnings with puts and calls exit 1 if
# the page structure is not as expected.
#
# @param html [String] the complete page source
# @return [String] the rebuilt (sanitized) profile html
def find_country_profile( html )
  ## patch a known broken html bug first
  ##   in co (Columbia) page (Nov/11 2020):
  ##     <div class="photogallery_captiontext">
  ##       <p>slightly less than twice the size of Texas</p
  ##     </div>
  ##   note: </p => unclosed p!! change to </p>
  ##   note: the regex uses a negative lookahead e.g. (?!pattern)
  patched = html.gsub( %r{</p(?![>])} ) do |match|
    puts "!! WARN: fixing unclosed </p => </p>"
    puts match
    '</p>'
  end

  ## drop the header (everything before) <ul class="expandcollapse">
  list = Nokogiri::HTML( patched ).css( 'ul.expandcollapse' ).first
  puts list.to_html[0..100]

  ###
  ## sanitize - remove link items
  ##   assume two <li>s are a section
  out = +''

  ## keep the direct <li> children only (skip everything else e.g. text nodes)
  ##   alternative (not used): list.css( 'li' )
  items = list.children.select { |node| node.name == 'li' }
  puts " #{items.size} li(s):"

  items.each_slice( 2 ) do |title_item, body_item|
    ## 1st li => holds the section title
    title_div = title_item.at( 'div[sectiontitle]' )
    if title_div.nil?
      puts "!! ERROR: no section title found in div:"
      puts title_item.to_html
      exit 1
    end

    section_title = title_div['sectiontitle'].to_s
    out << "<h2>#{section_title}</h2>\n"

    ## 2nd li => holds the fields; keep the direct <div> children only
    field_divs = body_item.children.select { |node| node.name == 'div' }
    puts " #{field_divs.size} div(s) in >#{section_title}<:"

    ## note: two flavours of field title divs to handle
    ##
    ## special case in world Geographic overview (plain text title):
    #   <div class="category oce_light" style="padding-left:5px;"
    #        id="field-anchor-geography-geographic-overview">
    #     Geographic overview:
    #     <span class="field-listing-link">
    #       <a href="../fields/275.html#XX">
    #         <img alt="Geographic overview field listing"
    #              title="Geographic overview field listing"
    #              src="../images/field_listing.gif" /></a>
    #     </span>
    #   </div>
    #
    ## vs regular (title inside a tooltip link):
    #   <div class="category oce_light" style="padding-left:5px;"
    #        id="field-anchor-geography-area-comparative">
    #     <span class="btn-tooltip definition" role="tooltip" aria-hidden='true'>
    #       <a aria-label="Use this link to access a description of the Area - comparative field"
    #          href="../docs/notesanddefs.html#280">
    #         Area - comparative
    #       </a>:
    #       <span class="tooltip-content">
    #         This entry provides an area comparison based on total area equivalents. ...
    #       </span>
    #     </span>
    #     <span class="field-listing-link">
    #       <a href="../fields/280.html#XX"><img alt="Area - comparative field listing"
    #          title="Area - comparative field listing" src="../images/field_listing.gif" /></a>
    #     </span>
    #   </div>
    field_divs.each_slice( 2 ) do |heading_div, data_div|
      ## clean-up / rm the tooltip text and the field listing link first
      ##   so that only the field title text is left
      ##   todo/check: how to delete/remove a node proper - why? why not!!
      ['span.tooltip-content', 'span.field-listing-link'].each do |selector|
        span = heading_div.at( selector )
        next unless span

        span.inner_html = ''
        span.replace( '' )
      end

      ## todo/check/rename: use field_name or such - why? why not?
      ##   (an older version read the title from the first anchor instead
      ##    and warned with "no anchor found" if missing)
      subsection_title = heading_div.text.strip
      out << "\n<h3>#{subsection_title}</h3>\n"

      data_children = data_div.children.select { |node| node.name == 'div' }
      puts " #{data_children.size} div(s) in field >#{subsection_title}<:"

      ## todo/check: use a more robust version - only get divs with category_data
      ##   e.g. data_div.css( 'div.category_data' )
      ##   and report an error if too many divs found (us labor force has 11 divs)

      data_children.each do |catdiv|
        css_class = catdiv['class']

        if css_class && css_class.index( 'category_data' )
          ## skip attachments e.g. maps, pop pyramids, etc.
          next if css_class.index( 'attachment' )

          out << sanitize_data( catdiv, title: subsection_title )
          out << "\n"
        elsif catdiv.to_html.index( 'country comparison to the world' )
          ## simplify/unlinkify country comparison
          ##   <div>
          ##     <span class='category'>country comparison to the world:</span>
          ##     <span class='category_data'>
          ##       <a href="../fields/335rank.html#AU">97</a>
          ##     </span>
          ##   </div>
          ## e.g. to =>
          ##   <div>
          ##     country comparison to the world: 97
          ##   </div>
          out << "<div>\n #{squish( catdiv.text.strip )}\n</div>"
          out << "\n"
        else
          puts "!! ERROR: div (W/O category_data class) in >#{subsection_title}<:"
          puts catdiv.to_html
          exit 1
        end
      end
    end
  end

  out
end
#sanitize(html) ⇒ Object
8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 |
# File 'lib/factbook-readers/sanitizer.rb', line 8
#
# Main entry point - sanitizes a complete page.
#
# note: returns an array with three items
#   1) html profile without headers, footers, scripts, etc.
#   2) page (meta) info e.g. country_name, country_code, last_updated, etc.
#   3) errors - always an empty array for now
#
# @param html [String] the complete page source
# @return [Array] [html_profile, page_info, errors]
def sanitize( html )
  ## todo: add option for (html source) encoding - why?? why not??
  info = PageInfo.new

  ## todo: make page info optional? why? why not?
  ##   not always available (if page structure changes) - check
  ##   what page info is required??
  meta = find_page_info( html )
  if meta
    ## copy over all page info fields one-by-one
    %i[country_code
       country_name
       country_affiliation
       region_code
       region_name].each do |field|
      info.public_send( "#{field}=", meta[field] )
    end
  else
    ## no page info found - fallback to country code lookup only
    ##   todo: print/warn about missing page info
    info.country_code = find_country_code( html )
  end

  info.last_updated = find_page_last_updated( html )

  ## cut-off headers, footers, scripts, etc.
  profile = find_country_profile( html )

  ## todo/check: remove 3rd item (old errors array) - why? why not?
  [profile, info, []]
end
#sanitize_data(el, title:) ⇒ Object
246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 |
# File 'lib/factbook-readers/sanitizer.rb', line 246
#
# Flattens the content of one data element into simplified html:
#   - a(nchor) links get replaced by their (stripped) text
#   - p(aragraphs) get unwrapped and joined with the ++ marker
#   - non-breaking spaces (html entity and unicode char) become plain spaces
#   - aria-label attributes get removed
#   - <br>s get turned into the inline (plain) text marker ++
#   - smart quotes get replaced by ascii quotes
#
# note: el gets changed in-place (links replaced, inner html rewritten);
#       progress messages are printed with puts
#
# @param el [#css, #children, #inner_html=, #to_html] the data element
#           (presumably a Nokogiri node - only these four methods are used here)
# @param title [String] field title - used in the log messages only
# @return [String] the sanitized html incl. the container element
def sanitize_data( el, title: )
  ## todo/fix/check:
  ##   check if more than one p(aragraph)
  ##   get squeezed together without space in between?

  ## step 0: replace all possible a(nchor) links with just inner text
  el.css( 'a' ).each do |a|
    a.replace( " #{a.text.strip} " )
  end

  inner_html = String.new('')

  ## step 1 - unwrap paragraphs if present
  ##          and convert dom/nokogiri doc/tree to html string
  p_count = 0
  el.children.each do |child|
    if child.name == 'p'
      ## note: unwrap! use inner_html NOT to_html/html
      p_inner_html = child.inner_html.strip
      ## note: skip empty paragraphs for now
      next if p_inner_html.empty?

      inner_html << ' ++ ' if p_count > 0
      inner_html << p_inner_html
      inner_html << " \n\n "
      p_count += 1
    else
      inner_html << child.to_html
    end
  end

  ## note: replace all non-breaking spaces with spaces for now
  ##   see fr (france) in political parties section for example
  ##
  ## fix: the html entity is searched for as the literal text &nbsp;
  ##   (a bare space as search string would leave the entity in place)
  inner_html = inner_html.gsub( '&nbsp;', ' ' )
  ## plus Unicode Character 'NO-BREAK SPACE' (U+00A0)
  inner_html = inner_html.gsub( "\u00A0", ' ' )

  ## note: keep container div!! just replace inner html!!!
  ## note: right strip all trailing spaces/newlines for now
  ##       plus add back a single one for pretty printing
  el.inner_html = inner_html.rstrip + "\n"

  # finally - convert back to html (string)
  html = el.to_html

  ## do not report / keep silent for now
  html = html.gsub( ARIA_ATTR_RE, '' )

  html = html.gsub( BR_BR_RE ) do |m|
    puts "in >#{title}< squish two <br>s into one:"
    puts "#{m}"
    '<br>'
  end

  html = html.gsub( /<br>/i ) do |m|
    puts "in >#{title}< replace <br> with inline (plain) text ++:"
    puts "#{m}"
    ' ++ '
  end

  ## cleanup/remove ++ before subfield e.g.
  ##   of: ++ => of: or such
  ##
  ## todo/fix: add negative lookahead e.g. not another + to be more specific!!
  html = html.gsub( %r{ (?<=([a-z]:)|(:</span>))   # note: use zero-length positive lookbehind
                        \s+
                        \+{2}
                      }xim ) do |m|
    puts "in >#{title} remove ++ before <field>: marker:"
    puts "#{m}"
    ' '
  end

  #####
  # "unfancy" smart quotes to ascii - why? why not?
  # e.g.
  #   Following Britain’s victory => Following Britain's victory
  html = html.tr( "’", "'" )
  #   “full floor” House vote => "full floor" House vote
  html = html.tr( "“”", '""' )

  html
end
#squish(str) ⇒ Object
344 345 346 |
# File 'lib/factbook-readers/sanitizer.rb', line 344
#
# Collapses every run of two or more whitespace characters
# (space, tab, newline, carriage return) into one space.
#
# note: a single whitespace character (e.g. a lone newline) is kept as-is
#
# @param str [String] the text to squish
# @return [String] a new string; str itself is not changed
def squish( str )
  multi_space = /[ \t\n\r]{2,}/
  str.gsub( multi_space ) { ' ' }
end