Class: RubyGems::WebCrawler

Inherits:
  Object
Defined in:
lib/rubygems-crawler/web_crawler.rb

Constant Summary

BASE_URL = 'http://rubygems.org'
REQUEST_HEADERS = {'User-Agent' => 'rubygems-crawler'}
TIMEOUT = 30
GRACE_PERIOD = 1  # Sleep for a while - be gentle

Instance Method Summary

  #crawl(letter = 'A') ⇒ Object
  #download_page(url) ⇒ Object
  #initialize(mongo) ⇒ WebCrawler constructor
  #network_call(url, request_headers = {}, timeout = nil) ⇒ Object
  #parse_content(response) ⇒ Object
  #save_gems(gems) ⇒ Object

Constructor Details

#initialize(mongo) ⇒ WebCrawler

Returns a new instance of WebCrawler.



# File 'lib/rubygems-crawler/web_crawler.rb', line 14

def initialize(mongo)
  @mongo = mongo
end
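
A minimal usage sketch, assuming the legacy mongo Ruby driver (1.x), whose collections respond to insert as #save_gems expects; the host, port and database name below are illustrative:

require 'mongo'
require 'rubygems-crawler/web_crawler'

# Illustrative connection; any handle whose [:gems] collection supports insert will do.
mongo = Mongo::MongoClient.new('localhost', 27017).db('rubygems_crawler')
crawler = RubyGems::WebCrawler.new(mongo)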

Instance Method Details

#crawl(letter = 'A') ⇒ Object

Crawl all the pages of RubyGems for a given initial letter and save the gem data into MongoDB



# File 'lib/rubygems-crawler/web_crawler.rb', line 19

def crawl(letter='A')
  url = "#{BASE_URL}/gems?letter=#{letter}"
  while url && gems = download_page(url)
    save_gems(gems[:gems])
    STDOUT.puts "[RubyGems Web Crawler] [#{url}] - Acquired #{gems[:gems].count} gems"
    
    url = (gems[:next_path]) ? "#{BASE_URL}#{gems[:next_path]}" : nil
    sleep GRACE_PERIOD
  end
end
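
A hedged sketch of driving the crawl across the whole alphabet; only the default letter 'A' is documented above, so the letter range is an assumption:

# Crawl every letter, reusing the crawler built in the constructor example.
('A'..'Z').each { |letter| crawler.crawl(letter) }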

#download_page(url) ⇒ Object

Download an HTML page given a URL, parse the HTML and convert the result into a Hash



# File 'lib/rubygems-crawler/web_crawler.rb', line 31

def download_page(url)
  STDOUT.puts "Acquiring #{url}"
  
  network_res = network_call(url, REQUEST_HEADERS, TIMEOUT)
  return parse_content(network_res[:response]) if network_res && network_res[:response]
end
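
As a sketch, a successful call returns the hash built by #parse_content below, while a failed request returns nil; the gem names and paths shown are placeholders:

crawler.download_page("#{RubyGems::WebCrawler::BASE_URL}/gems?letter=A")
# => {:gems => ["abstract", "actionmailer", ...], :next_path => "/gems?letter=A&page=2"}
# => nil when the request fails or the status is not 200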

#network_call(url, request_headers = {}, timeout = nil) ⇒ Object

Execute an HTTP GET request to the given URL with the specified headers



# File 'lib/rubygems-crawler/web_crawler.rb', line 39

def network_call(url, request_headers={}, timeout = nil)

  retries = 0
  begin
    uri = URI.parse(url.ascii_only? ? url : URI.escape(url))
    http = Net::HTTP.new(uri.host, uri.port)

    unless timeout.nil?
      http.open_timeout = timeout
      http.read_timeout = timeout
    end

    request = Net::HTTP::Get.new(uri.request_uri, request_headers)
    response = http.request(request)

  rescue Timeout::Error, Net::HTTPBadResponse, EOFError => e
    retries += 1
    retry unless retries > 3
    return {error: e, code: 0}
  end

  result = {:code=>response.code.to_i}    
  result[:response] = response.body if response.code.to_s == '200'
  result
end
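
A sketch of the three result shapes produced by the code above; the URL and values are illustrative:

crawler.network_call("#{RubyGems::WebCrawler::BASE_URL}/gems?letter=A", RubyGems::WebCrawler::REQUEST_HEADERS, RubyGems::WebCrawler::TIMEOUT)
# => {:code => 200, :response => "<!DOCTYPE html>..."}   # success: the body is included
# => {:code => 404}                                      # non-200: only the status code
# => {:error => #<Timeout::Error>, :code => 0}           # after the retries are exhausted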

#parse_content(response) ⇒ Object

Parse the HTML of the page, extracting the gem names and the path of the next page



# File 'lib/rubygems-crawler/web_crawler.rb', line 66

def parse_content(response)
  gem_res = {:gems => [], :next_path => nil}
  
  html_doc = Nokogiri::HTML(response)
  
  html_doc.css('.gems li a>strong').each do |node|
    node.content =~ /(.+)\s\(.+\)/
    gem_res[:gems] << $1
  end      
  
  next_page = html_doc.css('.next_page').first
  if next_page
    gem_res[:next_path] = next_page.attr('href')
  end
  
  gem_res
end
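
A minimal sketch of the markup this method expects; the HTML below is fabricated to match the selectors used above, not copied from rubygems.org:

html = <<-HTML
  <div class="gems">
    <ol>
      <li><a href="/gems/rails"><strong>rails (4.0.0)</strong></a></li>
      <li><a href="/gems/rack"><strong>rack (1.5.2)</strong></a></li>
    </ol>
  </div>
  <a class="next_page" href="/gems?letter=r&page=2">Next</a>
HTML

crawler.parse_content(html)
# => {:gems => ["rails", "rack"], :next_path => "/gems?letter=r&page=2"}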

#save_gems(gems) ⇒ Object

Save all the gem names into MongoDB



# File 'lib/rubygems-crawler/web_crawler.rb', line 85

def save_gems(gems)
  gems.each {|gem_name| @mongo[:gems].insert({name: gem_name}) }
end
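
A sketch, assuming @mongo[:gems] is a collection that responds to insert (as in the legacy mongo driver); each name is stored as its own document:

crawler.save_gems(["rails", "rack"])
# inserts {:name => "rails"} and {:name => "rack"} into the gems collection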