Module: EmailCrawler::MechanizeHelper

Included in:
PageLinks, Scraper
Defined in:
lib/email_crawler/mechanize_helper.rb

Constant Summary collapse

READ_TIMEOUT =
15

Instance Method Summary collapse

Instance Method Details

#get(url) ⇒ Object



17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# File 'lib/email_crawler/mechanize_helper.rb', line 17

# Fetches +url+ through +agent+, giving each attempt at most READ_TIMEOUT
# seconds.
#
# A Timeout::Error or a SocketError triggers one retry. Both failure kinds
# share the same flag, so the request is attempted at most twice overall.
# A Mechanize::Error is swallowed. Any other exception propagates.
#
# NOTE(review): +agent+ is not defined in this module; presumably the
# including class supplies it - confirm against PageLinks / Scraper.
#
# @param url the address handed to +agent.get+
# @return [Mechanize::Page, nil] the fetched page, or nil when the response
#   is not a Mechanize::Page or the request failed as described above
def get(url)
  already_retried = false

  begin
    fetched = begin
      Timeout.timeout(READ_TIMEOUT) { agent.get(url) }
    rescue Timeout::Error
      if already_retried
        nil
      else
        already_retried = true
        retry
      end
    end

    # Only real pages are handed back; any other response object becomes nil.
    fetched.is_a?(Mechanize::Page) ? fetched : nil
  rescue Mechanize::Error
    nil
  rescue SocketError
    return nil if already_retried

    already_retried = true
    retry
  end
end

#new_agent ⇒ Object



7
8
9
10
11
12
13
14
15
# File 'lib/email_crawler/mechanize_helper.rb', line 7

# Returns the Mechanize agent cached in +Thread.current[:agent]+, building
# and caching one first when the slot is empty.
#
# A freshly built agent gets the "Windows Mozilla" user-agent alias, open
# and read timeouts of READ_TIMEOUT, a history capped at one entry, and TLS
# certificate verification switched off (OpenSSL::SSL::VERIFY_NONE).
#
# @yield [agent] the new agent, after the defaults are applied; not called
#   when a cached agent is reused
# @return the cached or newly created agent
def new_agent
  cached = Thread.current[:agent]
  return cached if cached

  Thread.current[:agent] = Mechanize.new do |mech|
    mech.user_agent_alias = "Windows Mozilla"
    mech.read_timeout = READ_TIMEOUT
    mech.open_timeout = READ_TIMEOUT
    mech.verify_mode = OpenSSL::SSL::VERIFY_NONE
    mech.history.max_size = 1
    yield(mech) if block_given?
  end
end