Class: EmailCrawler::EmailScanner

Inherits:
Object
  • Object
show all
Includes:
MechanizeHelper
Defined in:
lib/email_crawler/email_scanner.rb

Constant Summary collapse

EMAIL_REGEXP =
/\b[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,4}\b/i
UTF_8 =
"UTF-8".freeze

Constants included from MechanizeHelper

MechanizeHelper::READ_TIMEOUT

Instance Method Summary collapse

Methods included from MechanizeHelper

#get, #new_agent

Constructor Details

#initialize(logger = Logger.new("/dev/null")) ⇒ EmailScanner

Returns a new instance of EmailScanner.



8
9
10
# File 'lib/email_crawler/email_scanner.rb', line 8

# Builds a scanner that writes its log output to +logger+.
#
# @param logger [Logger] receiver of the scanner's log messages; when omitted,
#   a Logger opened on "/dev/null" is used (output is discarded on Unix-like
#   systems)
def initialize(logger = Logger.new("/dev/null"))
  @logger = logger
end

Instance Method Details

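#scan(links) ⇒ Hash

Fetches each link and returns a Hash that maps the link to the Set of email addresses found in its body. Links that fail to load, or whose body contains no address, are omitted from the result.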



12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# File 'lib/email_crawler/email_scanner.rb', line 12

# Fetches every link and collects the email addresses found in its body.
#
# A link that cannot be fetched is logged at warn level and skipped; a link
# whose body yields no match for EMAIL_REGEXP is left out of the result.
#
# @param links [Enumerable] links to visit; each one is passed to #get and,
#   when addresses are found, used as the key in the result
# @return [Hash{Object => Set<String>}] the addresses found, keyed by link
def scan(links)
  links.each_with_object({}) do |link, found|
    @logger.info "searching for emails on '#{link}'.."

    html = fetch_body(link)
    next unless html

    emails = extract_emails(html)
    found[link] = Set.new(emails) unless emails.empty?
  end
end

# Returns the body of the page at +link+, or nil (after logging the error at
# warn level) when fetching it raises a StandardError.
private def fetch_body(link)
  get(link).body
rescue => err
  @logger.warn err.inspect
  nil
end

# Returns every EMAIL_REGEXP match in +html+.
#
# Matching a regexp against a string containing invalid byte sequences raises
# ArgumentError. In that case the error is logged and the match is repeated
# on a scrubbed *copy*, so the fetched body itself is never modified (it may
# be frozen or still referenced by the page object). If the second attempt
# also raises ArgumentError, no addresses are reported.
private def extract_emails(html)
  html.scan(EMAIL_REGEXP)
rescue ArgumentError => err
  @logger.warn err.inspect
  begin
    # scrub("") drops the invalid bytes and returns a new string.
    html.scrub("").scan(EMAIL_REGEXP)
  rescue ArgumentError
    []
  end
end