Class: EmailCrawler::EmailScanner

Inherits:
Object
Defined in:
lib/email_crawler/email_scanner.rb

Constant Summary

# Case-insensitive pattern used to pull email addresses out of page text.
# The top-level domain part accepts two or more letters so that addresses
# under longer TLDs (e.g. ".online", ".museum") are not silently dropped.
EMAIL_REGEXP =
/\b[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}\b/i
# Pause, in seconds, passed to `sleep` after a page has been fetched and scanned.
SLEEP_TIME =
0.5

Instance Method Summary

Constructor Details

#initialize(url) ⇒ EmailScanner



8
9
10
11
12
13
# File 'lib/email_crawler/email_scanner.rb', line 8

# Creates a scanner and its logger.
#
# The logger writes to STDOUT. Its level is INFO when the DEBUG environment
# variable is set, and ERROR otherwise.
#
# @param url [Object] stored unchanged in @url
def initialize(url)
  @url = url

  log = ::Logger.new(STDOUT)
  log.level = if ENV["DEBUG"]
                Logger::INFO
              else
                Logger::ERROR
              end
  @logger = log
end

Instance Method Details

#scan(links) ⇒ Object



15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# File 'lib/email_crawler/email_scanner.rb', line 15

# Fetches each link and collects the email addresses found in its body.
#
# Only links whose URI object can be opened by open-uri (http/https/ftp)
# are fetched. Anything else (local paths, "|command" strings, malformed
# URLs) is logged and skipped rather than handed to Kernel#open, because
# the links may come from crawled, untrusted pages.
#
# When open-uri refuses a redirect ("redirection forbidden"), the target
# URL is taken from the error message and fetched instead, up to a fixed
# number of hops. The result is then keyed by the redirected URL.
#
# After every page that was fetched successfully the method sleeps for
# SLEEP_TIME seconds. Failed fetches are skipped without sleeping.
#
# @param links [#each] URLs (strings or URI objects) to scan
# @return [Hash] maps each link with at least one match of EMAIL_REGEXP
#   to a Set of the matched addresses; links without matches are omitted
def scan(links)
  emails_by_link = {}

  links.each do |link|
    @logger.info "searching for emails on '#{link}'.."

    # Bound the redirect handling so a redirect loop cannot retry forever.
    redirects_left = 5

    html = begin
      uri = URI(link)
      if uri.respond_to?(:open)
        # Block form closes the underlying IO/Tempfile once it has been read.
        uri.open(&:read)
      else
        @logger.warn("skipping unsupported link '#{link}'")
        nil
      end
    rescue OpenURI::HTTPError => err
      @logger.warn(err)
      nil
    rescue => err
      if redirects_left > 0 && err.message.include?("redirection forbidden")
        redirects_left -= 1
        # open-uri reports "redirection forbidden: <from> -> <to>".
        link = err.message.split(" ").last
        retry
      end
      # Log other failures (and exhausted redirects) instead of dropping them silently.
      @logger.warn(err)
      nil
    end
    next unless html

    emails = html.scan(EMAIL_REGEXP)
    emails_by_link[link] = Set.new(emails) unless emails.empty?
    sleep(SLEEP_TIME)
  end

  emails_by_link
end