Module: CrawlerHelper

Included in:
Crawler
Defined in:
lib/crawler_lib.rb

Instance Method Summary collapse

Instance Method Details

#do_not_ignore?(each_link, scraped) ⇒ Boolean

end

Returns:

  • (Boolean)


52
53
54
55
56
57
58
59
60
61
62
63
64
65
# File 'lib/crawler_lib.rb', line 52

# Decides whether a link is still worth scraping.
#
# A link is rejected when it is already present in +scraped+ or when
# +bad_link?+ (defined elsewhere) reports it as bad; otherwise it is kept.
#
# @param each_link the candidate link to check
# @param scraped [#include?] collection of links that were already scraped
# @return [Boolean] true if the link should be scraped, false otherwise
def do_not_ignore?(each_link, scraped)
	# `||` short-circuits, so bad_link? is only consulted for links
	# that have not been scraped yet.
	!(scraped.include?(each_link) || bad_link?(each_link))
end

#fix_scheme(url) ⇒ Object



10
11
12
13
14
15
16
17
18
19
# File 'lib/crawler_lib.rb', line 10

# Prepends a scheme to a scheme-less URL by loading it in a browser.
#
# Opens "http://<url>" in a Firefox WebDriver session, reads the URL the
# browser ended up on, and reuses that URL's scheme for the result.
#
# @param url [String] a URL without a scheme prefix
# @return [String] +url+ prefixed with the detected scheme and "://"
def fix_scheme(url)
	puts "- No scheme provided for #{url}, trying to fix it."
	driver = Selenium::WebDriver.for :firefox
	begin
		# Assumes a redirect to https is set up if the site supports it.
		driver.get("http://#{url}")
		scheme = URI.parse(driver.current_url).scheme
	ensure
		# Always shut the browser down, even when navigation or URL
		# parsing raises; otherwise the Firefox session is leaked.
		driver.quit
	end
	puts "scheme is: #{scheme}"
	# Plain concatenation on purpose: a nil scheme raises here instead of
	# silently producing a "://..." string.
	scheme + "://" + url
end

#sanitize(link) ⇒ Object



22
23
24
25
26
27
28
29
30
31
# File 'lib/crawler_lib.rb', line 22

# Strips URL punctuation from a link.
#
# Removes every ':', '/', '%', '\' and '.' character and returns the
# remaining characters as a new string.
#
# @param link [String] the link to clean up
# @return [String] +link+ without the characters listed above
def sanitize(link)
	# One pass with a character class; inside [...] the dot is literal
	# and "\\" matches a single backslash.
	link.gsub(%r{[:/%\\.]}, "")
end

#test ⇒ Object



6
7
8
# File 'lib/crawler_lib.rb', line 6

# Prints a fixed marker line to standard output.
#
# @return [nil]
def test
	$stdout.puts("Test call")
end