# File 'lib/email_crawler.rb', line 24
def run(q, max_links = PageLinks::MAX_LINKS)
  # Grab the top ten search-result URLs for the query and log each one.
  urls = Scraper.new(@google_website).top_ten_urls_for(q)
  urls.each { |url| @logger.info url }

  # First pass: one thread per result URL, each collecting up to max_links
  # page links. The block parameters deliberately shadow the outer variables
  # so every thread works on its own copies; results ride on thread-locals.
  threads = urls.map.with_index(1) do |url, i|
    Thread.new(i, url) do |i, url|
      @logger.info "[Thread ##{i}] grabbing page links for '#{url}'.."
      Thread.current[:url] = url
      Thread.current[:links] = PageLinks.for(url, max_links)
    end
  end
  threads.each(&:join)
  threads.each { |thread| @logger.info "#{thread[:url]} (#{thread[:links].length} links)" }

  # Second pass: scan every collected link for email addresses, again with
  # one thread per result URL.
  links_by_url = threads.map { |thread| [thread[:url], thread[:links]] }.to_h
  threads = links_by_url.map.with_index(1) do |(url, links), i|
    Thread.new(i, url, links) do |i, url, links|
      @logger.info "[Thread ##{i}] scanning for emails on page '#{url}' (#{links.length} links)"
      Thread.current[:url] = url
      Thread.current[:emails] = EmailScanner.new(url).scan(links)
    end
  end
  threads.each(&:join)

  # Emit the CSV, de-duplicating emails across all pages via a Set.
  # Columns: the email itself, the search-result URL it came from, and the
  # exact page (link) on which it was found.
  read_emails = Set.new
  CSV.generate do |csv|
    csv << %w(Email Domain URL)
    csv << [] # blank spacer row after the header
    threads.each do |thread|
      email_count = thread[:emails].inject(0) { |sum, (_, emails)| sum + emails.length }
      @logger.info "#{thread[:url]} (#{email_count} emails)"
      url = thread[:url]
      thread[:emails].each do |link, emails|
        emails.each do |email|
          csv << [email, url, link] if read_emails.add?(email)
        end
      end
    end
  end
end
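For orientation, a minimal usage sketch. Only #run's signature and its return value (the CSV string produced by CSV.generate, the method's last expression) are confirmed by the listing above; the class name EmailCrawler and its constructor argument (the Google host stored in @google_website, plus whatever sets up @logger) are assumptions inferred from the file name and the instance variables.

require "email_crawler"

# Hypothetical construction -- the gem's real initializer may differ.
crawler = EmailCrawler.new("google.com")

# Query plus an optional cap on links collected per result page.
csv = crawler.run("ruby conferences", 50)

# #run returns the generated CSV as a string, ready to persist.
File.write("emails.csv", csv)

Because each result page is fetched and scanned on its own thread, the wall-clock cost of a run is bounded by the slowest of the ten pages rather than by their sum.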