Module: CrawlerHelper
- Included in:
- Crawler
- Defined in:
- lib/crawler_lib.rb
Instance Method Summary
-
#do_not_ignore?(each_link, scraped) ⇒ Boolean
Returns true when the link has not already been scraped and is not a bad link.
- #fix_scheme(url) ⇒ Object
- #sanitize(link) ⇒ Object
- #test ⇒ Object
Instance Method Details
#do_not_ignore?(each_link, scraped) ⇒ Boolean
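Returns true when the link has not already been scraped and is not a bad link; otherwise returns false.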
52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
# File 'lib/crawler_lib.rb', line 52
# Decides whether a link should be scraped.
#
# A link is rejected when it is already present in +scraped+, or when
# +bad_link?+ (defined outside this snippet) flags it.
#
# @param each_link [Object] the candidate link, looked up in +scraped+
# @param scraped [#include?] collection of links that were already scraped
# @return [Boolean] true if the link should be scraped, false otherwise
def do_not_ignore?(each_link, scraped)
  # Membership is checked first so bad_link? only runs for unseen links.
  return false if scraped.include?(each_link)

  !bad_link?(each_link)
end
#fix_scheme(url) ⇒ Object
10 11 12 13 14 15 16 17 18 19 |
# File 'lib/crawler_lib.rb', line 10
# Determines the scheme for a scheme-less URL by loading it in a browser.
#
# Opens "http://<url>" in a Firefox WebDriver session, reads the scheme of
# the URL the browser ends up on, and prefixes the original +url+ with it.
# Assumes a redirect to https is set up if the site supports it.
#
# @param url [String] a URL without a scheme, e.g. "example.com"
# @return [String] +url+ prefixed with the detected scheme and "://"
def fix_scheme(url)
  puts "- No scheme provided for #{url}, trying to fix it."
  driver = Selenium::WebDriver.for :firefox
  begin
    driver.get("http://#{url}")
    scheme = URI.parse(driver.current_url).scheme
  ensure
    # Always close the browser, even when navigation or URL parsing raises,
    # so a failed lookup does not leave a Firefox process behind.
    driver.quit
  end
  puts "scheme is: #{scheme}"
  # Concatenation (not interpolation) is kept so a nil scheme still raises
  # instead of silently producing "://<url>".
  scheme + "://" + url
end
#sanitize(link) ⇒ Object
22 23 24 25 26 27 28 29 30 31 |
# File 'lib/crawler_lib.rb', line 22
# Strips the characters ":", "/", "%", "\" and "." from a link.
#
# @param link [String] the link to clean up
# @return [String] a new string with those characters removed
def sanitize(link)
  # One pass with a character class replaces five separate gsub scans;
  # inside [...] the dot is literal and "\\" matches a single backslash.
  link.gsub(%r{[:/%\\.]}, '')
end
#test ⇒ Object
6 7 8 |
# File 'lib/crawler_lib.rb', line 6
# Writes the fixed line "Test call" to standard output.
#
# @return [nil] the result of the output call
def test
  $stdout.puts('Test call')
end