Class: Spider

Inherits:
Object
  • Object
show all
Includes:
UrlUtils
Defined in:
lib/spider.rb

Instance Method Summary collapse

Methods included from UrlUtils

#get_domain, #make_absolute, #relative?, #urls_on_same_domain?

Constructor Details

#initialize ⇒ Spider

Returns a new instance of Spider.



11
12
13
# File 'lib/spider.rb', line 11

# Builds a spider with an empty registry of visited pages.
#
# @already_visited is a Hash used as a set: the crawl methods store each
# crawled URL as a key with the value +true+.
def initialize
	@already_visited = Hash.new
end

Instance Method Details

#crawl_domain(url, page_limit = 100) ⇒ Object



36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# File 'lib/spider.rb', line 36

# Recursively crawls +url+ and every not-yet-visited link found on it that
# urls_on_same_domain? accepts, recording each successfully parsed page in
# @already_visited.
#
# Pages that cannot be opened or parsed (open_url / parse_url returning nil)
# are skipped and are not recorded as visited.
#
# @param url [String] page to crawl
# @param page_limit [Integer] stop crawling once @already_visited holds at
#   least this many pages
# @return [Object, nil] nil when the crawl stops early; otherwise the list of
#   links found on +url+
def crawl_domain(url, page_limit = 100)
	# >= rather than == so the crawl also stops when the registry was already
	# past the limit before this call (e.g. after an earlier crawl).
	return if @already_visited.size >= page_limit

	url_object = open_url(url)
	return if url_object.nil?

	parsed_doc = parse_url(url_object)
	return if parsed_doc.nil?

	# Record the page before following its links so cycles cannot recurse forever.
	@already_visited[url] = true
	find_urls_on_page(parsed_doc, url).each do |page_url|
		if urls_on_same_domain?(url, page_url) && !@already_visited.key?(page_url)
			# Pass the caller's limit on; without it every nested call would
			# silently fall back to the default of 100.
			crawl_domain(page_url, page_limit)
		end
	end
end

#crawl_web(urls, depth = 2, page_limit = 100) ⇒ Object



15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/spider.rb', line 15

# Crawls level by level: every URL in +urls+ is fetched, the links found on
# those pages form the next level, and so on for +depth+ levels. Each
# successfully parsed page is recorded in @already_visited.
#
# Pages that cannot be opened or parsed (open_url / parse_url returning nil)
# are skipped and are not recorded as visited.
#
# @param urls [Array<String>] URLs that make up the first level
# @param depth [Integer] number of levels to process
# @param page_limit [Integer] stop crawling once @already_visited holds at
#   least this many pages
# @return [Integer, nil] nil when the page limit ends the crawl; otherwise
#   +depth+ (the result of Integer#times)
def crawl_web(urls, depth = 2, page_limit = 100)
	# Guard up front so a zero limit, or a registry already filled by an
	# earlier crawl, does not trigger any fetch at all.
	return if @already_visited.size >= page_limit

	depth.times do
		next_urls = []
		urls.each do |url|
			url_object = open_url(url)
			next if url_object.nil?

			# Presumably the final URL after any redirect -- confirm against
			# update_url_if_redirected. Kept in its own local so the block
			# parameter is not reassigned.
			current_url = update_url_if_redirected(url_object)
			parsed_doc = parse_url(url_object)
			next if parsed_doc.nil?

			@already_visited[current_url] = true
			# >= rather than == so the limit still triggers if it was overshot.
			return if @already_visited.size >= page_limit

			unseen_links = find_urls_on_page(parsed_doc, current_url).reject do |link|
				@already_visited.key?(link)
			end
			# Set-union keeps the queue duplicate-free in one step.
			next_urls |= unseen_links
		end
		# A link may have been queued before it was itself visited later in this
		# same level; drop those so the next level does not fetch them again.
		urls = next_urls.reject { |link| @already_visited.key?(link) }
	end
end