Class: Crawler

Inherits:
Object
  • Object
show all
Defined in:
lib/crawler.rb

Defined Under Namespace

Classes: Node

Constant Summary collapse

# NOTE(review): none of these constants is referenced by the methods shown on
# this page; their purpose is only suggested by their names — confirm against
# lib/crawler.rb before relying on it.
CONCURRENCY =
5
HTTP_OK =
200
MAX_REDIRECTS =
3
MAX_RETRIES =
3
# Frozen so the shared list cannot be mutated in place at runtime.
VALID_SCHEMES =
%w[http https].freeze

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(root_url) ⇒ Crawler

Returns a new instance of Crawler.



19
20
21
22
23
24
# File 'lib/crawler.rb', line 19

# Sets up a crawler whose work queue initially holds only +root_url+.
#
# @param root_url [String] starting URL; it is parsed with URI.parse, so a
#   value URI.parse rejects raises from here
def initialize(root_url)
  @map = {}
  @urls_to_crawl = [root_url]

  # Remember the host of the starting URL.
  root_uri = URI.parse(root_url)
  @root_hostname = root_uri.hostname

  # Counter table: reading an unseen key stores 0 under it and returns 0.
  @retries = Hash.new { |counts, key| counts[key] = 0 }
end

Instance Attribute Details

#map ⇒ Object (readonly)

Returns the value of attribute map.



17
18
19
# File 'lib/crawler.rb', line 17

# Read-only accessor for the +@map+ instance variable.
#
# +@map+ starts out as an empty Hash in #initialize; nothing in the code
# shown on this page adds to it, so its eventual contents are defined
# elsewhere.
#
# @return [Hash] the current value of +@map+ (a Hash as initialised)
def map
  @map
end

Instance Method Details

#crawl ⇒ Object



26
27
28
29
30
31
32
33
34
35
36
37
38
# File 'lib/crawler.rb', line 26

# Processes the URLs queued in +@urls_to_crawl+, replaces the queue with
# whatever +crawl_urls+ returns, and calls itself again until the queue is
# empty.
#
# Once the queue is empty the EventMachine reactor is stopped, but only if
# one is actually running. Calling +crawl+ on an already drained crawler
# outside a reactor is therefore a no-op rather than an attempt to stop a
# reactor that was never started.
#
# @return [nil] when the queue was already empty; otherwise whatever
#   EventMachine.synchrony returns
def crawl
  if @urls_to_crawl.empty?
    EventMachine.stop if EventMachine.reactor_running?
    return
  end

  EventMachine.synchrony do
    # Hand crawl_urls a snapshot; @urls_to_crawl itself is reassigned below.
    pending = @urls_to_crawl.dup
    @urls_to_crawl = crawl_urls(pending)
    crawl
  end
end


#print ⇒ Object



40
41
42
# File 'lib/crawler.rb', line 40

# Dumps +@map+ through +ap+ with the +index+ option turned off.
#
# Defining +print+ here shadows Kernel#print for instances of this class.
# NOTE(review): +ap+ is presumably awesome_print's helper — confirm.
#
# @return [Object] whatever +ap+ returns
def print
  display_options = { index: false }
  ap(@map, display_options)
end