Class: Crawlette::Crawler

Inherits:
Object
  • Object
show all
Defined in:
lib/crawlette/crawler.rb

Constant Summary collapse

MAX_THREADS =
8
BadUrlError =
Class.new(ArgumentError)

Instance Method Summary collapse

Constructor Details

#initialize(url, sitemap = {}) ⇒ Crawler

Returns a new instance of Crawler.



10
11
12
13
14
15
16
17
# File 'lib/crawlette/crawler.rb', line 10

# Builds a crawler rooted at +url+.
#
# The URL is parsed with URI.parse, the parsed URI becomes the first (and
# only) entry of the pending list, and +sitemap+ is stored as given.
#
# @param url [String] absolute URL to start from; it must yield both a
#   scheme and a host once parsed (e.g. "http://example.com")
# @param sitemap [Hash] object kept as the crawler's sitemap; a new empty
#   hash is used when omitted
# @raise [BadUrlError] when the parsed URL has no scheme or no host
# @raise [URI::InvalidURIError] propagated from URI.parse for strings it
#   cannot parse at all
def initialize(url, sitemap = {})
  parsed = URI.parse(url)
  @uri = parsed
  @pending_uris = [parsed]
  @sitemap = sitemap

  # A bare "example.com" parses as a path only, leaving scheme and host nil.
  return if parsed.scheme && parsed.host

  raise BadUrlError, "Invalid url: You must provide a full qualified url"
end

Instance Method Details

#crawl ⇒ Object

Crawl a web page and generate a sitemap that must also contain:

  • Links between pages.

  • The static assets each page depends on.

Example:

Crawlette::Crawler.new('http://example.com').crawl # => {

'http://example.com/' => {
  'assets' => ['http://example.com/image1.png', 'http://example.com/script1.js', 'http://example.com/stylesheet1.css'],
  'links' => ['http://example.com/watch-a-demo', 'http://example.com/features'],
},
'http://example.com/watch-a-demo' => {
  'assets' => ['http://example.com/image2.png', 'http://example.com/script2.js', 'http://example.com/stylesheet2.css'],
  'links' => ['http://example.com/whatever1', 'http://example.com/whatever2'],
},
'http://example.com/features' => {
  'assets' => ['http://example.com/image3.png', 'http://example.com/script3.js', 'http://example.com/stylesheet3.css'],
  'links' => ['http://example.com/features/api', 'http://example.com/features/pricing'],
},
'http://example.com/features/api' => {
  ...
},
'http://example.com/features/pricing' => {
  ...
},

}



49
50
51
52
53
54
55
56
57
58
59
60
# File 'lib/crawlette/crawler.rb', line 49

# Drains the pending URI list in batches and returns the sitemap.
#
# Each round removes up to MAX_THREADS URIs from the end of @pending_uris,
# runs process_uri on each of them in its own thread, and waits for the
# whole batch to finish before starting the next round. The loop ends once
# @pending_uris is empty.
#
# NOTE(review): process_uri is not visible here; presumably it records the
# page in @sitemap and appends newly discovered URIs to @pending_uris,
# which is what keeps the loop going beyond the first batch. Confirm.
#
# @return [Object] the sitemap object held in @sitemap
def crawl
  until @pending_uris.empty?
    batch = @pending_uris.pop(MAX_THREADS)
    workers = batch.map do |uri|
      Thread.new { process_uri(uri) }
    end
    # Block until every worker in this batch is done before taking more.
    workers.each(&:join)
  end

  @sitemap
end