Module: Factbook::Utils
- Included in:
- Sanitizer
- Defined in:
- lib/factbook-readers/utils.rb,
lib/factbook-readers/utils_info.rb
Constant Summary collapse
- MONTH_EN_TO_S =
e.g. Page last updated on September 16, 2015
{ 'January' => '1', 'February' => '2', 'March' => '3', 'April' => '4', 'May' => '5', 'June' => '6', 'July' => '7', 'August' => '8', 'September' => '9', 'October' => '10', 'November' => '11', 'December' => '12' }
- PAGE_LAST_UPDATED_REGEX =
/ Page \s last \s updated \s on \s (?<month_en>[a-z]+) \s (?<day>\d{1,2}), \s (?<year>\d{4}) /imx
- COUNTRY_CODE_REGEX =
fallback: find “standalone” country code, e.g.
ccode='au'
/ccode='(?<cc>[a-z]+)'/
- PAGE_INFO_REGEX =
/ regioncode=(?<q1>"|')(?<region_code>.+?)\k<q1> \s+ countrycode=(?<q2>"|')(?<country_code>.+?)\k<q2> ## is k<3> backref \s+ countryname=(?<q3>"|')(?<country>.+?)\k<q3> \s+ [^>]+? ## allow any attribs (note: non-greedy) countryaffiliation=(?<q4>"|')(?<affiliation>.*?)\k<q4> ## note: might be empty \s+ [^>]+? ## allow any attribs (note: non-greedy) region=(?<q5>"|')(?<region>.+?)\k<q5> ## check world - might be empty ?? or for ocean ?? /imx
Instance Method Summary collapse
- #data_to_csv(recs, headers) ⇒ Object
- #find_country_code(html) ⇒ Object
- #find_page_info(html) ⇒ Object
- #find_page_last_updated(html) ⇒ Object
- #values_to_csv(values) ⇒ Object
Instance Method Details
#data_to_csv(recs, headers) ⇒ Object
31 32 33 34 35 36 37 38 39 40 41 42 43 |
# File 'lib/factbook-readers/utils.rb', line 31

##
# Renders a header row followed by all data rows as CSV text.
#
# Every row (the header included) is converted with values_to_csv and
# terminated by a newline, so the result always ends in "\n".
#
# @param recs    [#each] the data rows; each row is passed to values_to_csv
# @param headers [Object] the header row, passed to values_to_csv
# @return [String] the header line plus one line per record
def data_to_csv( recs, headers )
  csv = "#{values_to_csv( headers )}\n"
  recs.each { |row| csv << "#{values_to_csv( row )}\n" }
  csv
end
#find_country_code(html) ⇒ Object
67 68 69 70 71 72 73 74 75 76 77 |
# File 'lib/factbook-readers/utils_info.rb', line 67

##
# Searches the page markup for a country code using COUNTRY_CODE_REGEX
# and returns its +cc+ capture.
#
# On a hit the MatchData and a "** bingo" line are printed to stdout
# before the code is returned.
#
# @param html [String] the page markup to scan
# @return [String, nil] the captured country code, or nil when nothing matches
def find_country_code( html )
  match = COUNTRY_CODE_REGEX.match( html )
  return nil if match.nil?

  pp match
  code = match[:cc]
  puts "** bingo - country code #{code}"
  code
end
#find_page_info(html) ⇒ Object
109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 |
# File 'lib/factbook-readers/utils_info.rb', line 109

##
# Pulls the country / region attributes out of the page markup using
# PAGE_INFO_REGEX.
#
# On a hit the MatchData and the resulting hash are printed to stdout
# before the hash is returned.
#
# @param html [String] the page markup to scan
# @return [Hash, nil] a hash with the keys :country_code, :country_name,
#   :country_affiliation, :region_code and :region_name (taken from the
#   regex's named captures), or nil when the pattern does not match
def find_page_info( html )
  match = PAGE_INFO_REGEX.match( html )
  return nil unless match

  pp match

  ## note: key order is kept stable - it shows up in the inspect output below
  info = {
    country_code:        match[:country_code],
    country_name:        match[:country],
    country_affiliation: match[:affiliation],
    region_code:         match[:region_code],
    region_name:         match[:region]
  }

  puts "** bingo - #{info.inspect}"
  info
end
#find_page_last_updated(html) ⇒ Object
41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 |
# File 'lib/factbook-readers/utils_info.rb', line 41

##
# Extracts the "Page last updated on <Month> <day>, <year>" stamp from the
# page markup (via PAGE_LAST_UPDATED_REGEX) and converts it to a Date.
#
# On a hit the MatchData, a "** bingo" line and the intermediate date
# string are printed to stdout.
#
# @param html [String] the page markup to scan
# @return [Date, nil] the parsed date; nil when the stamp is not present or
#   the matched month word is not an English month name
# @raise [ArgumentError] from Date.strptime when the day/month/year
#   combination is not a valid calendar date
def find_page_last_updated( html )
  m = PAGE_LAST_UPDATED_REGEX.match( html )
  return nil unless m

  pp m
  month_en = m[:month_en]
  day      = m[:day]
  year     = m[:year]
  puts "** bingo - month #{month_en}, day #{day}, year #{year}"

  ## the regex matches case-insensitively, but the lookup table is keyed by
  ##   capitalized month names - normalize so "september" / "SEPTEMBER"
  ##   resolve too (instead of yielding nil and a broken date string)
  month = MONTH_EN_TO_S[ month_en.capitalize ]
  return nil if month.nil?    ## matched word is not a month name

  date_str = "#{year}-#{month}-#{day}"
  pp date_str

  Date.strptime( date_str, '%Y-%m-%d' )
end
#values_to_csv(values) ⇒ Object
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 |
# File 'lib/factbook-readers/utils.rb', line 6

##
# Serializes one row of values into a single CSV line (no trailing newline).
#
# Per value:
# - number-like text with thousands separators, optionally prefixed by a
#   minus and/or a dollar sign (e.g. 17,098,242 or $17,098,242 or -44,000),
#   gets its commas removed (17098242 / $17098242 / -44000)
# - text containing a comma, a double quote or a line break is wrapped in
#   double quotes, with embedded double quotes doubled
#   (e.g. Guam, The becomes "Guam, The")
# - anything else is added as is 1:1
#
# nil and non-string values are converted with to_s first, so nil yields
# an empty cell.
#
# @param values [Array] the cell values of one row
# @return [String] the comma-separated line
def values_to_csv( values )
  cells = values.map do |value|
    text = value.to_s

    ## note: \A and \z anchor the WHOLE value; with ^ and $ a single
    ##   number-like line inside a multi-line value would match and strip
    ##   the commas from all of the text
    ## note: the digit/comma run is still matched loosely (no strict
    ##   groups-of-three check)
    if text =~ /\A-?\$?[1-9][,0-9]+[0-9]\z/
      text.delete( ',' )
    elsif text =~ /[",\r\n]/
      ## quote the cell; double quotes inside get escaped by doubling
      '"' + text.gsub( '"', '""' ) + '"'
    else
      text
    end
  end

  cells.join( ',' )
end