Class: RubyGems::WebCrawler
- Inherits: Object
- Defined in: lib/rubygems-crawler/web_crawler.rb
Constant Summary
- BASE_URL = 'http://rubygems.org'
- REQUEST_HEADERS = {'User-Agent' => 'rubygems-crawler'}
- TIMEOUT = 30
- GRACE_PERIOD = 1
  Sleep for a while between requests - be gentle.
Instance Method Summary
- #crawl(letter = 'A') ⇒ Object
  Crawl all the pages of RubyGems for a given initial letter and save the data into MongoDB.
- #download_page(url) ⇒ Object
  Download an HTML page given a URL, parse the HTML and convert the result into a Hash.
- #initialize(mongo) ⇒ WebCrawler (constructor)
  A new instance of WebCrawler.
- #network_call(url, request_headers = {}, timeout = nil) ⇒ Object
  Execute a GET HTTP call to url with the specified headers.
- #parse_content(response) ⇒ Object
  Parse the HTML of the page, extracting gem names and the path of the next page.
- #save_gems(gems) ⇒ Object
  Save all the gem names into Mongo.
Constructor Details
#initialize(mongo) ⇒ WebCrawler
Returns a new instance of WebCrawler.
# File 'lib/rubygems-crawler/web_crawler.rb', line 14

def initialize(mongo)
  @mongo = mongo
end
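Example (a minimal usage sketch; it assumes the classic mongo 1.x Ruby driver, a MongoDB instance on localhost, and the require path implied by the file location; the database name rubygems_crawler is illustrative):

require 'mongo'
require 'rubygems-crawler/web_crawler'

# Assumption: mongo 1.x driver API; the database name is illustrative.
mongo = Mongo::Connection.new('localhost', 27017).db('rubygems_crawler')

crawler = RubyGems::WebCrawler.new(mongo)
crawler.crawl('A')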
Instance Method Details
#crawl(letter = 'A') ⇒ Object
Crawl all the pages of RubyGems for a given initial letter and save the data into MongoDB
# File 'lib/rubygems-crawler/web_crawler.rb', line 19

def crawl(letter='A')
  url = "#{BASE_URL}/gems?letter=#{letter}"

  while url && gems = download_page(url)
    save_gems(gems[:gems])
    STDOUT.puts "[RubyGems Web Crawler] [#{url}] - Acquired #{gems[:gems].count} gems"

    url = (gems[:next_path]) ? "#{BASE_URL}#{gems[:next_path]}" : nil
    sleep GRACE_PERIOD
  end
end
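For example, crawl('R') starts from http://rubygems.org/gems?letter=R and keeps following the :next_path returned by #download_page, sleeping GRACE_PERIOD seconds between pages, until no next page is left. A hypothetical driver loop (not part of this class) covering the whole alphabet could look like:

# Hypothetical driver loop: reuse one WebCrawler instance for every starting letter.
('A'..'Z').each { |letter| crawler.crawl(letter) }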
#download_page(url) ⇒ Object
Download an HTML page given a URL, parse the HTML and convert the result into a Hash
# File 'lib/rubygems-crawler/web_crawler.rb', line 31

def download_page(url)
  STDOUT.puts "Acquiring #{url}"

  network_res = network_call(url, REQUEST_HEADERS, TIMEOUT)
  return parse_content(network_res[:response]) if network_res && network_res[:response]
end
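The return value is the Hash built by #parse_content, or nil when the network call fails or the response status is not 200. An illustrative call (the gem names and next path shown are made up):

crawler.download_page("#{RubyGems::WebCrawler::BASE_URL}/gems?letter=A")
# => {:gems => ["abstract", "actionmailer", ...], :next_path => "/gems?letter=A&page=2"}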
#network_call(url, request_headers = {}, timeout = nil) ⇒ Object
Execute a GET HTTP call to url given the specified headers
# File 'lib/rubygems-crawler/web_crawler.rb', line 39

def network_call(url, request_headers={}, timeout = nil)
  retries = 0

  begin
    uri = URI.parse(url.ascii_only? ? url : URI.escape(url))
    http = Net::HTTP.new(uri.host, uri.port)

    unless timeout.nil?
      http.open_timeout = timeout
      http.read_timeout = timeout
    end

    request = Net::HTTP::Get.new(uri.request_uri, request_headers)
    response = http.request(request)
  rescue Timeout::Error, Net::HTTPBadResponse, EOFError => e
    retries += 1
    retry unless retries > 3
    return {error: e, code: 0}
  end

  result = {:code => response.code.to_i}
  result[:response] = response.body if response.code.to_s == '200'
  result
end
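The result Hash always carries the HTTP status under :code; :response is set only for a 200 reply, and when the initial attempt plus three retries all fail with a timeout or bad response an error Hash is returned instead. A sketch using the class constants (and the crawler instance from the constructor example above):

res = crawler.network_call("#{RubyGems::WebCrawler::BASE_URL}/gems?letter=A",
                           RubyGems::WebCrawler::REQUEST_HEADERS,
                           RubyGems::WebCrawler::TIMEOUT)
# Success:            {:code => 200, :response => "<!DOCTYPE html>..."}
# Non-200 status:     {:code => 404}
# Repeated failures:  {:error => #<Timeout::Error ...>, :code => 0}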
#parse_content(response) ⇒ Object
Parse the HTML of the page, extracting gem names and the path of the next page
# File 'lib/rubygems-crawler/web_crawler.rb', line 66

def parse_content(response)
  gem_res = {:gems => [], :next_path => nil}

  html_doc = Nokogiri::HTML(response)

  html_doc.css('.gems li a>strong').each do |node|
    node.content =~ /(.+)\s\(.+\)/
    gem_res[:gems] << $1
  end

  next_page = html_doc.css('.next_page').first
  if next_page
    gem_res[:next_path] = next_page.attr('href')
  end

  gem_res
end
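A sketch of the markup shape the CSS selectors and the name/version regexp expect (the sample HTML is simplified and illustrative, not copied from rubygems.org):

sample = <<-HTML
  <div class="gems">
    <ol>
      <li><a href="/gems/rake"><strong>rake (10.1.0)</strong></a></li>
      <li><a href="/gems/rack"><strong>rack (1.5.2)</strong></a></li>
    </ol>
    <a class="next_page" href="/gems?letter=R&amp;page=2">Next</a>
  </div>
HTML

crawler.parse_content(sample)
# => {:gems => ["rake", "rack"], :next_path => "/gems?letter=R&page=2"}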
#save_gems(gems) ⇒ Object
Save all the gem names into Mongo
# File 'lib/rubygems-crawler/web_crawler.rb', line 85

def save_gems(gems)
  gems.each {|gem_name| @mongo[:gems].insert({name: gem_name}) }
end
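Each name ends up as one small document in the gems collection; a sketch of the outcome (document shape assumes the mongo 1.x driver, and the _id field is added by MongoDB):

crawler.save_gems(['rake', 'rack'])
# @mongo[:gems] now contains documents such as:
#   {"_id" => BSON::ObjectId('...'), "name" => "rake"}
#   {"_id" => BSON::ObjectId('...'), "name" => "rack"}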