Class: EmailCrawler::EmailScanner
- Inherits:
-
Object
- Object
- EmailCrawler::EmailScanner
- Defined in:
- lib/email_crawler/email_scanner.rb
Constant Summary collapse
- EMAIL_REGEXP =
/\b[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,4}\b/i
- SLEEP_TIME =
0.5
Instance Method Summary collapse
-
#initialize(url) ⇒ EmailScanner
constructor
A new instance of EmailScanner.
- #scan(links) ⇒ Object
Constructor Details
#initialize(url) ⇒ EmailScanner
8 9 10 11 12 13 |
# File 'lib/email_crawler/email_scanner.rb', line 8
#
# Builds a scanner for the given URL and sets up a logger on STDOUT.
#
# The logger reports at INFO level when the DEBUG environment variable is
# set, and at ERROR level otherwise.
#
# @param url [Object] stored as-is in @url
def initialize(url)
  @url = url

  @logger = ::Logger.new(STDOUT)
  @logger.level = ENV["DEBUG"] ? Logger::INFO : Logger::ERROR
end
Instance Method Details
#scan(links) ⇒ Object
15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 |
# File 'lib/email_crawler/email_scanner.rb', line 15
#
# Fetches every link and collects the e-mail addresses found in its body.
#
# Links that cannot be fetched are logged at WARN level and skipped. When
# open-uri refuses a redirect ("redirection forbidden"), the last
# whitespace-separated token of the error message is taken as the new link
# and the fetch is retried, at most +max_redirects+ times per link. Results
# are keyed by the link that was finally fetched.
#
# NOTE(review): links are passed to URI.open unvalidated; for strings that do
# not look like URLs open-uri falls back to Kernel#open, so callers should
# only pass trusted or pre-validated links.
#
# @param links [Enumerable<String>] links to fetch
# @param max_redirects [Integer] upper bound on redirect retries per link
# @return [Hash{String => Set<String>}] addresses found, keyed by link; links
#   without any address are omitted
def scan(links, max_redirects: 5)
  emails_by_link = {}

  links.each do |link|
    @logger.info "searching for emails on '#{link}'.."
    redirects = 0

    html = begin
      # URI.open is used explicitly because bare Kernel#open no longer
      # handles URLs on Ruby 3.0+; the block form closes the handle after
      # reading.
      URI.open(link, &:read)
    rescue OpenURI::HTTPError => err
      @logger.warn(err)
      nil
    rescue => err
      if err.message.match?(/redirection forbidden/) && redirects < max_redirects
        redirects += 1
        link = err.message.split(" ").last
        retry
      end

      # Either not a redirect problem or the redirect budget is exhausted.
      @logger.warn(err)
      nil
    end
    next unless html

    # scrub replaces invalid byte sequences so a badly encoded page cannot
    # make the regexp scan raise and abort the whole crawl.
    emails = html.scrub.scan(EMAIL_REGEXP)
    emails_by_link[link] = Set.new(emails) unless emails.empty?
    sleep(SLEEP_TIME)
  end

  emails_by_link
end