Class: RubyGems::WebCrawler

Inherits:
  Object
Defined in:
lib/rubygems-crawler/web_crawler.rb

Constant Summary

BASE_URL = 'http://rubygems.org'
REQUEST_HEADERS = {'User-Agent' => 'rubygems-crawler'}
TIMEOUT = 30
GRACE_PERIOD = 1  # Sleep for a while - be gentle

Instance Method Summary

  #crawl(letter = 'A') ⇒ Object
  #download_page(url) ⇒ Object
  #initialize(mongo) ⇒ WebCrawler constructor
  #network_call(url, request_headers = {}, timeout = nil) ⇒ Object
  #parse_content(response) ⇒ Object
  #save_gems(gems) ⇒ Object

Constructor Details

#initialize(mongo) ⇒ WebCrawler

Returns a new instance of WebCrawler.



# File 'lib/rubygems-crawler/web_crawler.rb', line 14

def initialize(mongo)
  @mongo = mongo
end
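
A minimal usage sketch, assuming the legacy mongo Ruby driver (1.x), whose collections respond to insert as #save_gems expects; the host, port and database name below are illustrative:

require 'mongo'
require 'rubygems-crawler/web_crawler'

# Illustrative connection; any handle whose [:gems] collection supports insert will do.
mongo = Mongo::MongoClient.new('localhost', 27017).db('rubygems_crawler')
crawler = RubyGems::WebCrawler.new(mongo)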

Instance Method Details

#crawl(letter = 'A') ⇒ Object

Crawl all the pages of RubyGems for a given initial letter and save the gem data into MongoDB



# File 'lib/rubygems-crawler/web_crawler.rb', line 19

def crawl(letter='A')
  url = "#{BASE_URL}/gems?letter=#{letter}"
  while url && gems = download_page(url)
    save_gems(gems[:gems])
    STDOUT.puts "[RubyGems Web Crawler] [#{url}] - Acquired #{gems[:gems].count} gems"
    
    url = (gems[:next_path]) ? "#{BASE_URL}#{gems[:next_path]}" : nil
    sleep GRACE_PERIOD
  end
end
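
A hedged sketch of driving the crawl across the whole alphabet; only the default letter 'A' is documented above, so the letter range is an assumption:

# Crawl every letter, reusing the crawler built in the constructor example.
('A'..'Z').each { |letter| crawler.crawl(letter) }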

#download_page(url) ⇒ Object

Download an HTML page given a URL, parse the HTML and convert the result into a Hash



# File 'lib/rubygems-crawler/web_crawler.rb', line 31

def download_page(url)
  STDOUT.puts "Acquiring #{url}"
  
  network_res = network_call(url, REQUEST_HEADERS, TIMEOUT)
  return parse_content(network_res[:response]) if network_res && network_res[:response]
end
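
As a sketch, a successful call returns the hash built by #parse_content below, while a failed request returns nil; the gem names and paths shown are placeholders:

crawler.download_page("#{RubyGems::WebCrawler::BASE_URL}/gems?letter=A")
# => {:gems => ["abstract", "actionmailer", ...], :next_path => "/gems?letter=A&page=2"}
# => nil when the request fails or the status is not 200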

#network_call(url, request_headers = {}, timeout = nil) ⇒ Object

Execute an HTTP GET request to the given URL with the specified headers



# File 'lib/rubygems-crawler/web_crawler.rb', line 39

def network_call(url, request_headers={}, timeout = nil)

  retries = 0
  begin
    uri = URI.parse(url.ascii_only? ? url : URI.escape(url))
    http = Net::HTTP.new(uri.host, uri.port)

    unless timeout.nil?
      http.open_timeout = timeout
      http.read_timeout = timeout
    end

    request = Net::HTTP::Get.new(uri.request_uri, request_headers)
    response = http.request(request)

  rescue Timeout::Error, Net::HTTPBadResponse, EOFError => e
    retries += 1
    retry unless retries > 3
    return {error: e, code: 0}
  end

  result = {:code=>response.code.to_i}    
  result[:response] = response.body if response.code.to_s == '200'
  result
end
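
A sketch of the three result shapes produced by the code above; the URL and values are illustrative:

crawler.network_call("#{RubyGems::WebCrawler::BASE_URL}/gems?letter=A", RubyGems::WebCrawler::REQUEST_HEADERS, RubyGems::WebCrawler::TIMEOUT)
# => {:code => 200, :response => "<!DOCTYPE html>..."}   # success: the body is included
# => {:code => 404}                                      # non-200: only the status code
# => {:error => #<Timeout::Error>, :code => 0}           # after the retries are exhausted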

#parse_content(response) ⇒ Object

Parse the HTML of the page, extracting the gem names and the path of the next page



# File 'lib/rubygems-crawler/web_crawler.rb', line 66

def parse_content(response)
  gem_res = {:gems => [], :next_path => nil}
  
  html_doc = Nokogiri::HTML(response)
  
  html_doc.css('.gems li a>strong').each do |node|
    node.content =~ /(.+)\s\(.+\)/
    gem_res[:gems] << $1
  end      
  
  next_page = html_doc.css('.next_page').first
  if next_page
    gem_res[:next_path] = next_page.attr('href')
  end
  
  gem_res
end
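
A minimal sketch of the markup this method expects; the HTML below is fabricated to match the selectors used above, not copied from rubygems.org:

html = <<-HTML
  <div class="gems">
    <ol>
      <li><a href="/gems/rails"><strong>rails (4.0.0)</strong></a></li>
      <li><a href="/gems/rack"><strong>rack (1.5.2)</strong></a></li>
    </ol>
  </div>
  <a class="next_page" href="/gems?letter=r&page=2">Next</a>
HTML

crawler.parse_content(html)
# => {:gems => ["rails", "rack"], :next_path => "/gems?letter=r&page=2"}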

#save_gems(gems) ⇒ Object

Save all the gem names into MongoDB



# File 'lib/rubygems-crawler/web_crawler.rb', line 85

def save_gems(gems)
  gems.each {|gem_name| @mongo[:gems].insert({name: gem_name}) }
end
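
A sketch, assuming @mongo[:gems] is a collection that responds to insert (as in the legacy mongo driver); each name is stored as its own document:

crawler.save_gems(["rails", "rack"])
# inserts {:name => "rails"} and {:name => "rack"} into the gems collection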