Module: CrawlerHelper

Included in:
Crawler
Defined in:
lib/crawler_lib.rb

Instance Method Summary

Instance Method Details

#do_not_ignore?(each_link, scraped) ⇒ Boolean

end

Returns:

  • (Boolean)


52
53
54
55
56
57
58
59
60
61
62
63
64
65
# File 'lib/crawler_lib.rb', line 52

# Decides whether a link should still be scraped.
#
# A link is rejected when it is already present in +scraped+, or when
# +bad_link?+ reports it as bad. The +scraped+ lookup runs first, so
# +bad_link?+ is not consulted for links that were already scraped.
#
# @param each_link [Object] the candidate link
# @param scraped [#include?] collection of links that were already scraped
# @return [Boolean] true when the link should be scraped, false otherwise
def do_not_ignore?(each_link, scraped)
  return false if scraped.include?(each_link)
  return false if bad_link?(each_link)

  true
end

#fix_scheme(url) ⇒ Object



10
11
12
13
14
15
16
17
18
19
# File 'lib/crawler_lib.rb', line 10

# Prepends a scheme to a scheme-less URL by asking a real browser.
#
# Starts a Firefox WebDriver session, loads "http://" + url, and takes the
# scheme of the URL the browser ends up on. This assumes the site redirects
# to https when https is available.
#
# The driver is always quit, even when loading the page or parsing the
# resulting URL raises, so a failed lookup does not leave a browser behind.
#
# @param url [String] URL without a scheme, e.g. "example.com/path"
# @return [String] the detected scheme, "://", and the original +url+
# @raise [StandardError] whatever the driver or URI.parse raises; it is
#   propagated to the caller after the driver has been quit
def fix_scheme(url)
  puts "- No scheme provided for #{url}, trying to fix it."
  driver = Selenium::WebDriver.for :firefox
  begin
    driver.get("http://" + url) # assumes redirect to https is set up if it exists
    scheme = URI.parse(driver.current_url).scheme
  ensure
    # Release the browser session whether or not the lookup succeeded.
    driver.quit
  end
  puts "scheme is: #{scheme}"
  scheme + "://" + url
end

#sanitize(link) ⇒ Object



22
23
24
25
26
27
28
29
30
31
# File 'lib/crawler_lib.rb', line 22

# Strips every ":", "/", "%", "\" and "." character from a link.
#
# The characters are removed one after another, each pass producing a new
# string; the +link+ argument itself is not modified.
#
# @param link [String] the link to clean up
# @return [String] a copy of +link+ without the characters listed above
def sanitize(link)
  [":", "/", "%", '\\', '.'].reduce(link) do |name, unwanted|
    name.gsub(unwanted, "")
  end
end

#test ⇒ Object



6
7
8
# File 'lib/crawler_lib.rb', line 6

# Prints the fixed line "Test call" to standard output.
#
# @return [nil] the return value of +puts+
def test
  puts("Test call")
end