Class: EmailCrawler::Scraper

Inherits:
Object
  • Object
show all
Includes:
MechanizeHelper, URLHelper
Defined in:
lib/email_crawler/scraper.rb

Constant Summary collapse

MAX_RESULTS =
100

Constants included from URLHelper

URLHelper::DOMAIN_REGEXP, URLHelper::WWW_REGEXP

Constants included from MechanizeHelper

MechanizeHelper::READ_TIMEOUT

Instance Method Summary collapse

Methods included from URLHelper

#extract_domain_from

Methods included from MechanizeHelper

#get, #new_agent

Constructor Details

#initialize(google_website, max_results: MAX_RESULTS, blacklisted_domains: []) ⇒ Scraper

Returns a new instance of Scraper.



12
13
14
15
16
# File 'lib/email_crawler/scraper.rb', line 12

# Returns a new instance of Scraper.
#
# @param google_website [String] Google host to query, e.g. "google.com"
# @param max_results [Integer] maximum number of result URLs to collect
# @param blacklisted_domains [Array<String>] domain suffixes whose results
#   should be skipped
def initialize(google_website, max_results: MAX_RESULTS, blacklisted_domains: [])
  @search_url = "https://www.#{google_website}/search?q="
  @max_results = max_results
  # Regexp.escape prevents regex metacharacters in a domain (e.g. the ".")
  # from matching unintended hosts — without it "a.com" would also match "aXcom".
  @blacklisted_domains = blacklisted_domains.map { |domain| /#{Regexp.escape(domain)}\z/ }
end

Instance Method Details

#search_result_urls_for(q) ⇒ Object



18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# File 'lib/email_crawler/scraper.rb', line 18

# Collects up to @max_results unique search-result URLs for query +q+.
#
# Fetches the first Google results page, then follows the "start=N"
# pagination links until enough URLs are gathered or no further page
# link exists.
#
# @param q [String] raw search query (URL-escaped before the request)
# @return [Array<String>] at most @max_results unique result URLs
def search_result_urls_for(q)
  current_page = agent.get(@search_url + CGI.escape(q))
  urls = Set.new(search_results_on(current_page))

  page = 1
  while urls.size < @max_results
    # Look up the pagination link on the page we are currently on.
    # The original looked only at the first page, whose pagination lists
    # just the first ~10 offsets — capping results at roughly 100
    # regardless of @max_results.
    next_page_link = current_page.link_with(href: /start=#{page * 10}/)
    break unless next_page_link

    current_page = next_page_link.click
    search_results_on(current_page).each do |url|
      urls << url
    end

    page += 1
  end

  urls.to_a.first(@max_results)
end