Class: Factbook::Sanitizer

Inherits:
Object
  • Object
show all
Includes:
Utils, LogUtils::Logging
Defined in:
lib/factbook-readers/sanitizer.rb

Constant Summary collapse

ARIA_ATTR_RE =

<span class=“subfield-date” aria-label=“Date of information: 2018”>(2018)</span>

remove aria labels
/\s*
   aria-label=('|").+?\1     ## note: use non-greedy match e.g. .+?
/xim
BR_BR_RE =

find double breaks e.g. `<br><br>` (optionally separated by whitespace)

/(<br> \s* <br>)
/xim

Constants included from Utils

Utils::COUNTRY_CODE_REGEX, Utils::MONTH_EN_TO_S, Utils::PAGE_INFO_REGEX, Utils::PAGE_LAST_UPDATED_REGEX

Instance Method Summary collapse

Methods included from Utils

#data_to_csv, #find_country_code, #find_page_info, #find_page_last_updated, #values_to_csv

Instance Method Details

#find_country_profile(html) ⇒ Object



46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
# File 'lib/factbook-readers/sanitizer.rb', line 46

##
## Cuts the country profile out of a complete factbook page and rebuilds it
## as a flat html fragment:
##   - one <h2> per section (taken from the sectiontitle attribute)
##   - one <h3> per field (taken from the field's first anchor text)
##   - followed by the sanitized category_data divs of the field
##
## note: prints progress / warnings to stdout and calls exit 1
##       if a section title or a category_data class is missing.
## note: expects a <ul class="expandcollapse"> in the page -
##       if there is none the lookup returns nil and the method
##       fails with a NoMethodError.
##
## @param html [String] complete html source of a country page
## @return [String] rebuilt html fragment
def find_country_profile( html )
  ##
  ## fix known broken html bugs
  ##  in co (Colombia) page (Nov/11 2020):
  ## <div class="photogallery_captiontext">
  ##   <p>slightly less than twice the size of Texas</p
  ## </div>
  ##    note: </p    => unclosed p!! change to </p>
  ##
  ## note: the negative lookahead (?!pattern) skips
  ##   - properly closed tags e.g. </p> or </p >
  ##   - other tags starting with p e.g. </pre>, </param>, </picture>
  ##     (without the \w check </pre> would get changed to </p>re>)
  html = html.gsub( %r{</p(?!\s*>|\w)} ) do |broken|
    puts "!! WARN: fixing unclosed </p => </p>"
    puts broken
    '</p>'
  end

  doc  = Nokogiri::HTML( html )
  list = doc.css( 'ul.expandcollapse' )[0]

  puts list.to_html[0..100]

  ###
  ## sanitize
  ##   assume two <li>s make up a section:
  ##     1st <li> holds the section title, 2nd <li> holds the fields
  out = String.new('')

  ## only keep <li> children (drops text nodes and such)
  items = list.children.select { |node| node.name == 'li' }
  puts "  #{items.size} li(s):"

  items.each_slice(2) do |title_li, body_li|
    title_div = title_li.at( 'div[sectiontitle]' )
    if title_div.nil?
      puts "!! ERROR: no section title found in div:"
      puts title_li.to_html
      exit 1
    end

    section_title = title_div['sectiontitle'].to_s
    out << "<h2>#{section_title}</h2>\n"

    ## only keep <div> children; again assume two <div>s make up a field:
    ##   1st <div> holds the field title (anchor), 2nd <div> holds the data
    field_divs = body_li.children.select { |node| node.name == 'div' }
    puts " #{field_divs.size} div(s) in >#{section_title}<:"

    field_divs.each_slice(2) do |head_div, data_div|
      anchor = head_div.css( 'a' )[0]

      if anchor
        subsection_title = anchor.text   ## todo/check/rename: use field_name or such - why? why not?
        out << "\n<h3>#{subsection_title}:</h3>\n"
      else
        ## note: no <h3> gets added for fields without an anchor
        subsection_title = '???'
        puts "!! WARN: no anchor found:"
        puts head_div.to_html
      end

      data_children = data_div.children.select { |node| node.name == 'div' }
      puts "   #{data_children.size} div(s) in field >#{subsection_title}<:"

      data_children.each do |catdiv|
        css_class = catdiv['class']

        if css_class && css_class.index( 'category_data' )
          ## skip attachments e.g. maps, pop pyramids, etc.
          next if css_class.index( 'attachment' )

          out << sanitize_data( catdiv, title: subsection_title )
          out << "\n"
        elsif catdiv.to_html.index( 'country comparison to the world' )
          ## silently skip for now country comparison
        else
          puts "!! ERROR: div (W/O category_data class) in >#{subsection_title}<:"
          puts catdiv.to_html
          exit 1
        end
      end
    end
  end

  out
end

#sanitize(html) ⇒ Object



8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# File 'lib/factbook-readers/sanitizer.rb', line 8

##
## Splits a complete factbook page into the bare country profile
## plus the page (meta) info.
##
## @param html [String] complete html source of a country page
## @return [Array] three items:
##   1) html profile without headers, footers, scripts, etc.
##   2) page (meta) info e.g. country_code, country_name, last_updated, etc.
##   3) errors - always an empty array for now
def sanitize( html )
  ## todo: add option for (html source) encoding - why?? why not??

  info = PageInfo.new

  ## todo:
  ##   make page info optional? why? why not?
  ##   not always available (if page structure changes) - check
  ##   what page info is required??
  meta = find_page_info( html )
  if meta
    ## copy over all known page info fields one-by-one
    %i[country_code
       country_name
       country_affiliation
       region_code
       region_name].each do |field|
      info.public_send( "#{field}=", meta[field] )
    end
  else
    ## fallback: no page info found - try to get at least the country code
    info.country_code = find_country_code( html )
  end

  info.last_updated = find_page_last_updated( html )

  ## cut-off headers, footers, scripts, etc.
  profile = find_country_profile( html )

  ## todo/check: remove 3rd args old errors array - why? why not?
  [profile, info, []]
end

#sanitize_data(el, title:) ⇒ Object



188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
# File 'lib/factbook-readers/sanitizer.rb', line 188

##
## Cleans up one category_data element and returns it as an html string.
##
## note: changes the passed-in element in-place
##       (anchors get replaced by text, inner html gets rewritten).
## note: reports <br> replacements and ++ clean-ups on stdout.
##
## @param el    the element to clean up (the Nokogiri div handed over
##              by find_country_profile)
## @param title [String] field title - only used in the log messages
## @return [String] sanitized html of the element (incl. the container tag)
def sanitize_data( el, title: )
  ## todo/fix/check:
  ##  check if more than one p(aragraph)
  ##    get squeezed together without space inbetween?

  ## step 0: replace all possible a(nchor) links with just inner text
  ##         (padded with a space on both sides)
  el.css( 'a' ).each do |a|
    a.replace( " #{a.text.strip} " )
  end

  inner_html = String.new('')

  ## step 1 - unwrap paragraphs if present
  ##          and convert dom/nokogiri doc/tree to html string
  ##   note: 2nd, 3rd, etc. paragraphs get separated by the ++ marker
  p_count = 0
  el.children.each do |child|
    if child.name == 'p'
      p_inner_html = child.inner_html.strip  ## note: unwrap! use inner_html NOT to_html/html
      next if p_inner_html.empty?            ## note: skip empty paragraphs for now

      inner_html << ' ++ '    if p_count > 0
      inner_html << p_inner_html
      inner_html << " \n\n "

      p_count += 1
    else
      inner_html << child.to_html
    end
  end

  ## note: replace all non-breaking spaces with spaces for now
  ##  see fr (france) in political parties section for example
  ##  todo/check/fix:  check if we need to use unicode char!! and NOT html entity
  inner_html = inner_html.gsub( "&nbsp;", ' ' )

  ## note: keep container div!! just replace inner html!!!
  ##  note: right strip all trailing spaces/newlines for now
  ##        plus add back a single one for pretty printing
  el.inner_html = inner_html.rstrip + "\n"

  # finally - convert back to html (string)
  html = el.to_html

  ## remove aria-label attributes
  ##   do not report / keep silent for now
  html = html.gsub( ARIA_ATTR_RE, '' )

  html = html.gsub( BR_BR_RE ) do |m|
    puts "in >#{title}< squish two <br>s into one:"
    puts m
    '<br>'
  end

  html = html.gsub( /<br>/i ) do |m|
    puts "in >#{title}< replace <br> with inline (plain) text ++:"
    puts m
    ' ++ '
  end

  ## cleanup/remove ++   before subfield e.g.
  ##  of: ++   => of:    or such
  ##
  ##  todo/fix: add negative lookahead e.g. not another + to be more specific!!
  html = html.gsub( %r{
                       (?<=([a-z]:)|(:</span>))  # note: use zero-length positive lookbehind
                          \s+
                          \+{2}}xim ) do |m|
    puts "in >#{title}< remove ++ before <field>: marker:"
    puts m
    ' '
  end

  #####
  # "unfancy" smart quotes to ascii - why? why not?
  # e.g.
  # Following Britain’s victory => Following Britain's victory
  #
  # note: use the \u2019 escape (right single quotation mark) -
  #       an empty from-set would turn tr into a no-op
  html = html.tr( "\u2019", "'" )

  html
end