Module: Factbook::Utils

Included in:
Sanitizer
Defined in:
lib/factbook-readers/utils.rb,
lib/factbook-readers/utils_info.rb

Constant Summary collapse

MONTH_EN_TO_S =

e.g. Page last updated on September 16, 2015

{
  'January'   => '1',
  'February'  => '2',
  'March'     => '3',
  'April'     => '4',
  'May'       => '5',
  'June'      => '6',
  'July'      => '7',
  'August'    => '8',
  'September' => '9',
  'October'   => '10',
  'November'  => '11',
  'December'  => '12'
}
PAGE_LAST_UPDATED_REGEX =

examples (to match):

Page last updated on November 03, 2016
Page last updated on September 24, 2015
/
 Page \s last \s updated \s on \s
  (?<month_en>[a-z]+) \s
  (?<day>\d{1,2}), \s
  (?<year>\d{4})
/imx
COUNTRY_CODE_REGEX =

fallback: find “standalone” country code, e.g.

ccode='au'
/ccode='(?<cc>[a-z]+)'/
PAGE_INFO_REGEX =
/
  regioncode=(?<q1>"|')(?<region_code>.+?)\k<q1>
    \s+
  countrycode=(?<q2>"|')(?<country_code>.+?)\k<q2>       ## is k<3> backref
    \s+
   countryname=(?<q3>"|')(?<country>.+?)\k<q3>
    \s+
     [^>]+?  ## allow any attribs (note: non-greedy)
   countryaffiliation=(?<q4>"|')(?<affiliation>.*?)\k<q4>     ## note: might be empty
    \s+
     [^>]+?  ## allow any attribs (note: non-greedy)
   region=(?<q5>"|')(?<region>.+?)\k<q5>    ## check world - might be empty ?? or for ocean ??
/imx

Instance Method Summary collapse

Instance Method Details

#data_to_csv(recs, headers) ⇒ Object



31
32
33
34
35
36
37
38
39
40
41
42
43
# File 'lib/factbook-readers/utils.rb', line 31

# Renders a header row followed by one row per record as CSV text.
#
# Every row is formatted by values_to_csv and terminated with "\n"
# (including the last one).
#
# @param recs    [#each] records; each record is passed to values_to_csv
# @param headers [Object] header row; passed to values_to_csv
# @return [String] the header line plus one line per record
def data_to_csv( recs, headers )
  rows = [headers]
  recs.each { |rec| rows << rec }

  ## header row first, then the records in their original order
  rows.reduce( "" ) { |csv, row| csv << values_to_csv( row ) << "\n" }
end

#find_country_code(html) ⇒ Object



67
68
69
70
71
72
73
74
75
76
77
# File 'lib/factbook-readers/utils_info.rb', line 67

# Looks for a "standalone" country code (matched by COUNTRY_CODE_REGEX)
# in the page markup.
#
# Prints the match data and a "** bingo" line to stdout when found.
#
# @param html [String] page source to scan
# @return [String, nil] the captured +cc+ group, or nil when nothing matches
def find_country_code( html )
  match = COUNTRY_CODE_REGEX.match( html )
  return nil unless match

  pp match
  code = match[:cc]
  puts "** bingo - country code #{code}"
  code
end

#find_page_info(html) ⇒ Object



109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# File 'lib/factbook-readers/utils_info.rb', line 109

# Extracts the country/region attributes matched by PAGE_INFO_REGEX
# from the page markup.
#
# Prints the match data and a "** bingo" line to stdout when found.
#
# @param html [String] page source to scan
# @return [Hash{Symbol=>String}, nil] hash with the keys :country_code,
#   :country_name, :country_affiliation, :region_code and :region_name,
#   or nil when the pattern does not match
def find_page_info( html )
  match = PAGE_INFO_REGEX.match( html )
  return nil unless match

  pp match

  ## result key  =>  named capture group in PAGE_INFO_REGEX
  captures = { country_code:        :country_code,
               country_name:        :country,
               country_affiliation: :affiliation,
               region_code:         :region_code,
               region_name:         :region }

  info = captures.each_with_object( {} ) do |(key, group), acc|
    acc[key] = match[group]
  end

  puts "** bingo - #{info.inspect}"
  info
end

#find_page_last_updated(html) ⇒ Object



41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# File 'lib/factbook-readers/utils_info.rb', line 41

# Extracts the "Page last updated on <Month> <day>, <year>" date from the
# page markup.
#
# PAGE_LAST_UPDATED_REGEX matches case-insensitively, while the keys of
# MONTH_EN_TO_S are capitalized ("January", ...). The captured month name
# is therefore capitalized before the lookup, so that e.g. "SEPTEMBER" or
# "september" resolves to the same month number as "September".
#
# Prints the match data, a "** bingo" line and the intermediate date
# string to stdout when the pattern matches.
#
# @param html [String] page source to scan
# @return [Date, nil] the parsed date, or nil when the pattern does not match
# @raise [ArgumentError] from Date.strptime when the captured values do not
#   form a parseable date (e.g. the captured word is not a month name)
def find_page_last_updated( html )
  m = PAGE_LAST_UPDATED_REGEX.match( html )
  return nil unless m

  pp m
  month_en = m[:month_en]
  day      = m[:day]
  year     = m[:year]
  puts "** bingo - month #{month_en}, day #{day}, year #{year}"

  ## normalize case - the regex is case-insensitive, the lookup table is not
  month = MONTH_EN_TO_S[ month_en.capitalize ]
  date_str = "#{year}-#{month}-#{day}"
  pp date_str
  Date.strptime( date_str, '%Y-%m-%d' )
end

#values_to_csv(values) ⇒ Object



6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# File 'lib/factbook-readers/utils.rb', line 6

# Formats one row of values as a single comma-separated line
# (without a trailing newline).
#
# Per value:
# - a number with thousands separators, optionally prefixed by a minus
#   and/or a dollar sign (e.g. 17,098,242 or $17,098,242 or -44,000),
#   has its commas removed (17098242, $17098242, -44000)
# - any other value containing a comma is wrapped in double quotes,
#   with embedded double quotes doubled
#   (e.g. Guam, The becomes "Guam, The")
# - everything else is added as is
#
# Values are converted with to_s first, so nil becomes an empty field.
#
# @param values [Enumerable] the values of one row
# @return [String] the CSV line
def values_to_csv( values )
  fields = values.map do |value|
    text = value.to_s

    ## note: \A and \z anchor the whole value; ^ and $ would match any
    ##   single line of a multi-line value and strip all of its commas
    if text =~ /\A-?\$?[1-9][,0-9]+[0-9]\z/
      text.delete( ',' )
    elsif text.include?( ',' )
      ## escape comma with double quotes; double any embedded quote
      ##   so the field stays parseable
      %("#{text.gsub( '"', '""' )}")
    else
      ## add as is 1:1 (no comma)
      text
    end
  end

  fields.join( ',' )
end