Class: CountriesOfTheWorld::Scraper

Inherits:
Object
  • Object
show all
Defined in:
lib/countries_of_the_world/scraper.rb

Overview

The Scraper class scrapes the pages for the information needed to build country info. Because of the amount of data per country, only the name and URL of every country are built initially; the details of each country are retrieved when the user asks for that country.

Instance Method Summary collapse

Instance Method Details

#country_page(url) ⇒ Object



18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# File 'lib/countries_of_the_world/scraper.rb', line 18

# Builds the detail hash for one country from its country page.
#
# The url is classified by substring, checked in this order:
# * "israel"  - the link is not pointed to the right page, so only income
#   level and region (looked up through more_info_page) are returned,
#   together with a placeholder description.
# * "gcc"     - the page contains no financial info; only a placeholder
#   description is returned.
# * "country" - regular page; every field is scraped.
#
# @param url [String] address of the country page
# @return [Hash, nil] the detail hash, or nil when url matches none of the
#   three cases above
def country_page(url)
  if url.include?("israel") # israel's link is not pointed to the right page
    income_level, region = more_info_page(url)
    {
      :income_level => income_level,
      :region => region,
      :description => "BROKEN LINK, NO_INFO"
    }
  elsif url.include?("gcc") # gcc's link doesn't contain financial info
    { :description => "NO_INFO" }
  elsif url.include?("country") # regular link
    # URI#open (open-uri) instead of Kernel#open: Kernel#open no longer accepts
    # URLs on Ruby 3.0+ and would run a shell command for a string starting
    # with "|".
    doc = Nokogiri::HTML(URI.parse(url).open)

    # The amounts come back as one string such as
    #   "32.53\tmillion $19.33\tbillion 0.8% -1.5% "
    # where a tab joins a number to its unit and spaces separate the values.
    # Tabs are shielded as "_" so split(" ") only cuts between values, then
    # turned into a plain space.
    amounts = doc.css("td.c01v1-country-amounts").text
                 .tr("\t", "_").split(" ").map { |amount| amount.tr("_", " ") }
    population, gdp, gdp_growth, inflation = amounts

    # The banner text carries a lot of misc text after the description; the
    # description ends at the first " \n". When that marker is absent the
    # whole banner text is kept.
    raw_text = doc.css("div.c01v1-country-banner-text").text.strip
    cutoff = raw_text.index(" \n")
    description = cutoff ? raw_text[0...cutoff].strip : raw_text

    # Region and income level live on an additional page. Its link is not
    # consistent between countries: some are absolute, some are relative.
    # They are kept apart from the amounts so that a missing or extra amount
    # cannot shift them into the wrong key.
    income_level = nil
    region = nil
    link = doc.at_css("span.c01v1-country-chart-text a")
    info_url = link && link["href"]
    if info_url
      info_url = "http://www.worldbank.org#{info_url}" if info_url.start_with?("/")
      income_level, region = more_info_page(info_url)
    end

    {
      :population => population,
      :gdp => gdp,
      :gdp_growth => gdp_growth,
      :inflation => inflation,
      :income_level => income_level,
      :region => region,
      :description => description
    }
  end
end

#more_info_page(url) ⇒ Object



63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# File 'lib/countries_of_the_world/scraper.rb', line 63

# Helper that looks up the income level and region on a "more info" page.
#
# The link is probed first so that broken links yield [nil, nil] instead of
# being parsed.
#
# @param url [String] absolute address of the additional-info page
# @return [Array] [income_level, region]; both nil when the link is known to
#   be broken or the probe answers with a status outside 200...400
def more_info_page(url)
  # cape verde's page will return 302, but the link is actually broken, so it
  # is rejected before any request is made.
  return [nil, nil] if url.include?("capeverde")

  uri = URI.parse(url)
  # request_uri keeps the query string and is "/" for a bare host, where
  # uri.path would be "" (which Net::HTTP rejects). use_ssl lets https links
  # be probed as well.
  result = Net::HTTP.start(uri.host, uri.port, :use_ssl => uri.scheme == "https") do |http|
    http.get(uri.request_uri)
  end

  status = result.code.to_i
  return [nil, nil] unless status >= 200 && status < 400

  # The page is fetched again through open-uri so redirects are followed.
  # URI#open is used because Kernel#open no longer accepts URLs on Ruby 3.0+.
  doc = Nokogiri::HTML(uri.open)
  [
    # NOTE(review): "li.come-level" may be a typo for "li.income-level" -
    # left unchanged, confirm against the page markup.
    doc.css("li.come-level a.toggle strong").text,
    doc.css("li.region a.toggle strong").text
  ]
end

#scrape_page ⇒ Object



8
9
10
11
12
13
14
15
16
# File 'lib/countries_of_the_world/scraper.rb', line 8

# Scrapes the main country listing and creates a CountriesOfTheWorld::Country
# with name and url for every "li.name-country" entry found on it.
#
# The created objects are not collected here.
# NOTE(review): presumably Country keeps track of its own instances - confirm
# against the Country class.
#
# @return [Object] the result of iterating the matched list items; the
#   visible code gives callers no reason to rely on it
def scrape_page
  # URI#open (open-uri) instead of Kernel#open, which no longer accepts URLs
  # on Ruby 3.0+.
  doc = Nokogiri::HTML(URI.parse("http://www.worldbank.org/en/country").open)
  doc.css("li.name-country").each do |country|
    link = country.css("a")
    CountriesOfTheWorld::Country.new(link.text, link.attr('href').value)
  end
end