Class: EmailCrawler::EmailScanner

Inherits:
Object
  • Object
show all
Defined in:
lib/email_crawler/email_scanner.rb

Constant Summary collapse

# Case-insensitive pattern for e-mail addresses embedded in free text.
# The top-level domain only has a lower bound of two letters: capping it at
# four would silently drop addresses under longer TLDs such as ".museum" or
# ".technology".
EMAIL_REGEXP =
/\b[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}\b/i
# Encoding name used when re-encoding fetched page bodies.
UTF_8 =
"UTF-8".freeze

Instance Method Summary collapse

Constructor Details

#initialize(logger = Logger.new("/dev/null")) ⇒ EmailScanner

Returns a new instance of EmailScanner.



8
9
10
# File 'lib/email_crawler/email_scanner.rb', line 8

# Stores the logger this scanner writes its messages to.
#
# @param logger [#info, #warn] logger object; when omitted, a Logger writing
#   to "/dev/null" is created, so messages are discarded
def initialize(logger = Logger.new("/dev/null"))
  @logger = logger
end

Instance Method Details

#scan(links) ⇒ Object



12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# File 'lib/email_crawler/email_scanner.rb', line 12

# Fetches every link and collects the e-mail addresses found in its body.
#
# A link whose fetch fails, or whose body contains no address, is left out of
# the result. When open-uri refuses a redirect ("redirection forbidden"), the
# redirect target is fetched instead and becomes the key for that entry.
#
# @param links [Enumerable<String>] locations to fetch
# @return [Hash{String => Set<String>}] fetched link => distinct addresses
def scan(links)
  links.each_with_object({}) do |link, h|
    @logger.info "searching for emails on '#{link}'.."
    # Upper bound on refused redirects followed per link, so that a redirect
    # loop cannot keep this method retrying forever.
    redirects_left = 5

    html =
      begin
        # URI.open rather than Kernel#open: since Ruby 3.0 open-uri no longer
        # patches Kernel#open, so a bare `open(url)` looks for a local file.
        # The block form closes the underlying handle once the body is read.
        # NOTE(review): URI.open still falls through to Kernel#open for
        # strings that are not URLs, so links must come from a trusted or
        # validated source.
        URI.open(link, &:read)
      rescue OpenURI::HTTPError => err
        @logger.warn(err)
        nil
      rescue => err
        if redirects_left > 0 && err.message.include?("redirection forbidden")
          redirects_left -= 1
          # The message ends with the redirect target:
          # "redirection forbidden: <from> -> <to>"
          link = err.message.split(" ").last
          retry
        end
        # Any other failure (or an exhausted redirect budget) is reported
        # instead of being dropped silently.
        @logger.warn(err)
        nil
      end
    next unless html

    emails =
      begin
        html.scan(EMAIL_REGEXP)
      rescue ArgumentError => err
        # Raised for bodies with invalid byte sequences: strip the offending
        # bytes and try once more; give up on this page if that fails too.
        @logger.warn err.inspect
        begin
          html.encode(UTF_8, UTF_8, invalid: :replace, undef: :replace, replace: "")
              .scan(EMAIL_REGEXP)
        rescue ArgumentError
          []
        end
      end

    h[link] = Set.new(emails) unless emails.empty?
  end
end