Class: EmailCrawler::Runner
- Inherits: Object
- Class hierarchy: Object → EmailCrawler::Runner
- Defined in:
- lib/email_crawler.rb
Constant Summary collapse
- MAX_CONCURRENCY =
50
Instance Attribute Summary collapse
-
#blacklisted_domains ⇒ Object
writeonly
Sets the attribute blacklisted_domains.
-
#logger ⇒ Object
writeonly
Sets the attribute logger.
-
#max_concurrency ⇒ Object
writeonly
Sets the attribute max_concurrency.
-
#max_links ⇒ Object
writeonly
Sets the attribute max_links.
-
#max_results ⇒ Object
writeonly
Sets the attribute max_results.
Instance Method Summary collapse
-
#initialize(google_website) {|_self| ... } ⇒ Runner
constructor
A new instance of Runner.
- #run(q) ⇒ Object
Constructor Details
#initialize(google_website) {|_self| ... } ⇒ Runner
Returns a new instance of Runner.
19 20 21 22 |
# File 'lib/email_crawler.rb', line 19

# Builds a new Runner for the given Google host and yields the fresh
# instance so the caller can assign the write-only configuration
# attributes (max_results, max_links, max_concurrency, logger, ...).
#
# @param google_website [String] the Google host to search against
# @yield [_self] the newly created Runner, for configuration
def initialize(google_website)
  @google_website = google_website
  yield(self)
end
Instance Attribute Details
#blacklisted_domains=(value) ⇒ Object (writeonly)
Sets the attribute blacklisted_domains
16 17 18 |
# File 'lib/email_crawler.rb', line 16

# Write-only attribute: domains to be excluded from the search results.
#
# @param value [Array<String>] domains to blacklist
def blacklisted_domains=(value)
  @blacklisted_domains = value
end
#logger=(value) ⇒ Object (writeonly)
Sets the attribute logger
16 17 18 |
# File 'lib/email_crawler.rb', line 16

# Write-only attribute: the logger used throughout the crawl (#run).
#
# @param value [Logger] the logger instance
def logger=(value)
  @logger = value
end
#max_concurrency=(value) ⇒ Object (writeonly)
Sets the attribute max_concurrency
16 17 18 |
# File 'lib/email_crawler.rb', line 16

# Write-only attribute: upper bound on worker threads spawned by #run
# (see also MAX_CONCURRENCY).
#
# @param value [Integer] maximum number of concurrent threads
def max_concurrency=(value)
  @max_concurrency = value
end
#max_links=(value) ⇒ Object (writeonly)
Sets the attribute max_links
16 17 18 |
# File 'lib/email_crawler.rb', line 16

# Write-only attribute: maximum number of page links collected per URL
# (forwarded to PageLinks in #run).
#
# @param value [Integer] maximum links per page
def max_links=(value)
  @max_links = value
end
#max_results=(value) ⇒ Object (writeonly)
Sets the attribute max_results
16 17 18 |
# File 'lib/email_crawler.rb', line 16

# Write-only attribute: maximum number of search results to fetch
# (forwarded to Scraper in #run).
#
# @param value [Integer] maximum search results
def max_results=(value)
  @max_results = value
end
Instance Method Details
#run(q) ⇒ Object
24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 |
# File 'lib/email_crawler.rb', line 24

# Runs the full crawl pipeline for the search query +q+:
#   1. collects search-result URLs via Scraper,
#   2. concurrently gathers per-page links for each URL (PageLinks),
#   3. concurrently scans those links for email addresses (EmailScanner),
#   4. renders the de-duplicated addresses as a CSV string.
#
# @param q [String] the search query
# @return [String] CSV data with an Email/Domain/URL header row
#
# NOTE(review): this relies on a +logger+ reader that is not visible in
# this chunk (only the writer is documented) — presumably defined
# elsewhere in the class; confirm against lib/email_crawler.rb.
def run(q)
  urls = Scraper.new(@google_website,
                     max_results: @max_results,
                     blacklisted_domains: @blacklisted_domains).
           search_result_urls_for(q)
  urls.each { |url| logger.info(url.to_s) }

  queue = Thread::Queue.new
  urls.each { |url| queue.push(url) }

  links_by_url = ThreadSafe::Array.new
  # Spawn at most @max_concurrency workers, never more than one per URL.
  # The thread id is passed as a Thread.new argument (not captured from
  # the enclosing loop) so each worker sees its own stable value.
  threads = (1..[urls.length, @max_concurrency].min).map do |worker_id|
    Thread.new(worker_id) do |id|
      # Drain the queue non-blockingly; ThreadError signals "queue empty".
      loop do
        url = begin
          queue.pop(true)
        rescue ThreadError
          break
        end
        logger.info "[Thread ##{id}] grabbing page links for '#{url}'.."
        links = PageLinks.for(url, max_links: @max_links, logger: logger)
        links_by_url << [url, links]
      end
    end
  end
  threads.each(&:join)
  logger.debug "links_by_url: #{links_by_url.inspect}"

  # Reuse the (now empty) queue for the second fan-out stage.
  links_by_url.each { |pair| queue.push(pair) }
  emails_by_url = ThreadSafe::Hash.new
  threads = (1..[links_by_url.length, @max_concurrency].min).map do |worker_id|
    Thread.new(worker_id) do |id|
      loop do
        pair = begin
          queue.pop(true)
        rescue ThreadError
          break
        end
        url, links = pair
        logger.info "[Thread ##{id}] scanning for emails on page '#{url}' (#{links.length} links)"
        emails_by_url[url] = EmailScanner.new(logger).scan(links)
      end
    end
  end
  threads.each(&:join)
  logger.debug "emails_by_url: #{emails_by_url.inspect}"

  read_emails = Set.new
  CSV.generate do |csv|
    csv << %w(Email Domain URL)
    csv << [] # blank spacer row after the header (kept from original output)
    emails_by_url.each do |url, emails_by_link|
      email_count = emails_by_link.sum { |arr| arr.last.length }
      logger.info "#{url} (#{email_count} emails)"
      emails_by_link.each do |link, emails|
        emails.each do |email|
          # Emit each address only once, on the first page it was seen.
          csv << [email, url, link] if read_emails.add?(email)
        end
      end
    end
  end
end