Class: Mechanize::HTTP::Agent

Inherits: Object
Defined in:
lib/mechanize/http/agent.rb

Overview

An HTTP (and local disk access) user agent

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Agent

Returns a new instance of Agent.



82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# File 'lib/mechanize/http/agent.rb', line 82

# Builds an agent with its default configuration: no credentials, proxy or
# SSL material; conditional requests and gzip enabled; all redirects followed
# (at most 20); meta refresh and robots.txt handling switched off.
def initialize
  # Collaborating objects
  @cookie_jar  = Mechanize::CookieJar.new
  @digest_auth = Net::HTTP::DigestAuth.new
  @history     = Mechanize::History.new
  @context     = nil
  @webrobots   = nil

  # HTTP authentication
  @auth_hash = {}  # Keep track of urls for sending auth
  @digest    = nil # DigestAuth Digest
  @user      = nil # HTTP auth user
  @password  = nil # HTTP auth password

  # Connection settings
  @keep_alive_time = 300
  @open_timeout    = nil
  @read_timeout    = nil
  @proxy_uri       = nil

  # Request and response behaviour
  @conditional_requests = true
  @follow_meta_refresh  = false
  @gzip_enabled         = true
  @redirect_ok          = true
  @redirection_limit    = 20
  @request_headers      = {}
  @robots               = false
  @user_agent           = nil

  # Hooks run around every request
  @pre_connect_hooks  = []
  @post_connect_hooks = []

  # OpenSSL: server certificate file, client certificate, private key,
  # key password and verification callback
  @ca_file = @cert = @key = @pass = @verify_callback = nil

  # Any scheme without an explicit handler gets (and caches) one that raises.
  @scheme_handlers = Hash.new { |handlers, scheme|
    handlers[scheme] = lambda { |link, page|
      raise Mechanize::UnsupportedSchemeError, scheme
    }
  }

  # Supported schemes share a single handler that returns the link untouched.
  pass_through = lambda { |link, page| link }
  %w[http https relative file].each do |scheme|
    @scheme_handlers[scheme] = pass_through
  end
end

Instance Attribute Details

#ca_fileObject

Path to an OpenSSL server certificate file



61
62
63
# File 'lib/mechanize/http/agent.rb', line 61

# Returns the configured path to the OpenSSL server certificate file
# (nil by default); set_http hands it to the connection.
def ca_file
  @ca_file
end

#certObject

An OpenSSL client certificate or the path to a certificate file.



67
68
69
# File 'lib/mechanize/http/agent.rb', line 67

# Returns the OpenSSL client certificate, or the path to a certificate file.
# set_http reads the file when the value is not an
# OpenSSL::X509::Certificate.
def cert
  @cert
end

#conditional_requestsObject

Controls If-Modified-Since conditional requests; set to false to disable them (enabled by default)



9
10
11
# File 'lib/mechanize/http/agent.rb', line 9

# Returns whether fetch adds an If-Modified-Since header for pages that are
# already in the history (true by default).
def conditional_requests
  @conditional_requests
end

#contextObject

Returns the value of attribute context.



10
11
12
# File 'lib/mechanize/http/agent.rb', line 10

# Returns the object whose #parse turns a response into a page
# (see response_parse); nil until one is assigned.
def context
  @context
end

#cookie_jar ⇒ Object

Returns the value of attribute cookie_jar.



6
7
8
# File 'lib/mechanize/http/agent.rb', line 6

# Returns the cookie jar that supplies Cookie request headers and stores
# cookies found in responses.
def cookie_jar
  @cookie_jar
end

#follow_meta_refreshObject

Follow HTML meta refresh. If set to :anywhere meta refresh tags outside of the head element will be followed.



14
15
16
# File 'lib/mechanize/http/agent.rb', line 14

# Returns the meta refresh setting (false by default); any truthy value makes
# response_follow_meta_refresh follow refreshes.
def follow_meta_refresh
  @follow_meta_refresh
end

#gzip_enabledObject

Returns the value of attribute gzip_enabled.



15
16
17
# File 'lib/mechanize/http/agent.rb', line 15

# Returns whether requests advertise gzip and deflate in Accept-Encoding
# (true by default; see enable_gzip).
def gzip_enabled
  @gzip_enabled
end

#historyObject

Returns the value of attribute history.



16
17
18
# File 'lib/mechanize/http/agent.rb', line 16

# Returns the history object holding the pages this agent has visited.
def history
  @history
end

#httpObject (readonly)

:nodoc:



80
81
82
# File 'lib/mechanize/http/agent.rb', line 80

# Returns the connection object created by set_http. The constructor does not
# assign it, so it is unset until set_http has run.
def http
  @http
end

#keyObject

An OpenSSL private key or the path to a private key



64
65
66
# File 'lib/mechanize/http/agent.rb', line 64

# Returns the OpenSSL private key, or the path to a key file that set_http
# loads as an RSA key.
def key
  @key
end

#open_timeoutObject

Length of time to wait until a connection is opened in seconds



19
20
21
# File 'lib/mechanize/http/agent.rb', line 19

# Returns the connection-open timeout in seconds. When nil (the default),
# fetch leaves the connection's own timeout untouched.
def open_timeout
  @open_timeout
end

#passObject

OpenSSL key password



70
71
72
# File 'lib/mechanize/http/agent.rb', line 70

# Returns the password set_http uses when loading the private key file.
def pass
  @pass
end

#passwordObject

Returns the value of attribute password.



21
22
23
# File 'lib/mechanize/http/agent.rb', line 21

# Returns the password used for HTTP authentication (nil by default).
def password
  @password
end

#post_connect_hooksObject (readonly)

A list of hooks to call after retrieving a response. Hooks are called with the agent and the response returned.



27
28
29
# File 'lib/mechanize/http/agent.rb', line 27

# Returns the list of hooks run by post_connect. Each hook is called with the
# agent, the request URI, the response and the response body.
def post_connect_hooks
  @post_connect_hooks
end

#pre_connect_hooksObject (readonly)

A list of hooks to call before making a request. Hooks are called with the agent and the request to be performed.



32
33
34
# File 'lib/mechanize/http/agent.rb', line 32

# Returns the list of hooks run by pre_connect. Each hook is called with the
# agent and the request about to be sent.
def pre_connect_hooks
  @pre_connect_hooks
end

#proxy_uriObject (readonly)

Returns the value of attribute proxy_uri.



22
23
24
# File 'lib/mechanize/http/agent.rb', line 22

# Returns the proxy URI built by set_proxy, or nil when no proxy is set.
def proxy_uri
  @proxy_uri
end

#read_timeoutObject

Length of time to attempt to read data from the server



35
36
37
# File 'lib/mechanize/http/agent.rb', line 35

# Returns the read timeout. When nil (the default), fetch leaves the
# connection's own timeout untouched.
def read_timeout
  @read_timeout
end

#redirect_okObject

Controls how this agent deals with redirects. The following values are allowed:

:all, true

All 3xx redirects are followed (default)

:permanent

Only 301 Moved Permanently redirects are followed

false

No redirects are followed



44
45
46
# File 'lib/mechanize/http/agent.rb', line 44

# Returns the redirect policy consulted by response_redirect: true or :all,
# :permanent, or false/nil (true by default).
def redirect_ok
  @redirect_ok
end

#redirection_limitObject

Returns the value of attribute redirection_limit.



45
46
47
# File 'lib/mechanize/http/agent.rb', line 45

# Returns the maximum number of redirects followed before
# Mechanize::RedirectLimitReachedError is raised (20 by default).
def redirection_limit
  @redirection_limit
end

#request_headersObject

A hash of request headers to be used



49
50
51
# File 'lib/mechanize/http/agent.rb', line 49

# Returns the hash of extra headers that request_add_headers applies to
# every request (empty by default).
def request_headers
  @request_headers
end

#robotsObject

When true, this agent will consult the site’s robots.txt for each access.



53
54
55
# File 'lib/mechanize/http/agent.rb', line 53

# Returns whether fetch consults robots.txt before HTTP requests
# (false by default).
def robots
  @robots
end

#scheme_handlersObject

Returns the value of attribute scheme_handlers.



55
56
57
# File 'lib/mechanize/http/agent.rb', line 55

# Returns the hash mapping a URI scheme name to a callable taking
# (link, page). Looking up an unregistered scheme yields a handler that
# raises Mechanize::UnsupportedSchemeError.
def scheme_handlers
  @scheme_handlers
end

#userObject

Returns the value of attribute user.



57
58
59
# File 'lib/mechanize/http/agent.rb', line 57

# Returns the user name used for HTTP authentication (nil by default).
def user
  @user
end

#user_agentObject

Returns the value of attribute user_agent.



58
59
60
# File 'lib/mechanize/http/agent.rb', line 58

# Returns the User-Agent string sent with requests; when nil (the default)
# request_user_agent does not set the header.
def user_agent
  @user_agent
end

#verify_callbackObject

A callback for additional certificate verification. See OpenSSL::SSL::SSLContext#verify_callback

The callback can be used for debugging or to ignore errors by always returning true. Specifying nil uses the default method that was valid when the SSLContext was created



78
79
80
# File 'lib/mechanize/http/agent.rb', line 78

# Returns the certificate verification callback that set_http passes to the
# connection (nil by default).
def verify_callback
  @verify_callback
end

Instance Method Details

#backObject

Equivalent to the browser back button. Returns the most recent page visited.



127
128
129
# File 'lib/mechanize/http/agent.rb', line 127

# Pops the most recent entry off the history and returns it, like a
# browser's back button.
def back
  @history.pop
end

#certificateObject



131
132
133
# File 'lib/mechanize/http/agent.rb', line 131

# Returns the certificate held by the connection object; requires set_http
# to have been called so that @http exists.
def certificate
  @http.certificate
end

#connection_for(uri) ⇒ Object



135
136
137
138
139
140
141
142
# File 'lib/mechanize/http/agent.rb', line 135

# Picks the connection used to perform a request for +uri+.
#
# http and https URIs share the connection built by set_http; file URIs get
# a fresh Mechanize::FileConnection. Any other scheme yields nil.
def connection_for uri
  scheme = uri.scheme.downcase

  return @http if scheme == 'http' || scheme == 'https'

  Mechanize::FileConnection.new if scheme == 'file'
end

#current_pageObject

Returns the latest page loaded by the agent



147
148
149
# File 'lib/mechanize/http/agent.rb', line 147

# Returns the last entry in the history, i.e. the page loaded most recently.
def current_page
  @history.last
end

#enable_gzip(request) ⇒ Object



151
152
153
154
155
156
157
# File 'lib/mechanize/http/agent.rb', line 151

# Sets the accept-encoding header on +request+: compressed encodings are
# advertised only while gzip support is enabled. Returns the header value.
def enable_gzip request
  encodings = @gzip_enabled ? 'gzip,deflate,identity' : 'identity'

  request['accept-encoding'] = encodings
end

#fetch(uri, method = :get, headers = {}, params = [], referer = current_page, redirects = 0) ⇒ Object

uri is an absolute URI



160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
# File 'lib/mechanize/http/agent.rb', line 160

# Performs a request for +uri+ and returns the resulting page.
#
# uri::       target; resolved against +referer+ by #resolve before use
# method::    HTTP method symbol (default :get)
# headers::   extra per-request headers, applied by #request_add_headers
# params::    request parameters; #resolve_parameters moves them into the
#             query string for :head, :get, :delete and :trace
# referer::   page the request originates from (default: #current_page)
# redirects:: number of redirects already followed in this request chain
#
# Raises Mechanize::RobotsDisallowedError when robots handling is enabled and
# either robots.txt or the page's noindex marker forbids access, and
# Mechanize::ResponseCodeError for a response class not handled below.
def fetch uri, method = :get, headers = {}, params = [],
          referer = current_page, redirects = 0
  referer_uri = referer ? referer.uri : nil

  uri = resolve uri, referer

  uri, params = resolve_parameters uri, method, params

  request = http_request uri, method, params

  connection = connection_for uri

  request_auth request, uri

  enable_gzip request

  # Per-request headers are added last so they override the defaults above.
  request_language_charset request
  request_cookies request, uri
  request_host request, uri
  request_referer request, uri, referer_uri
  request_user_agent request
  request_add_headers request, headers

  pre_connect request

  # Consult robots.txt
  if robots && uri.is_a?(URI::HTTP)
    robots_allowed?(uri) or raise Mechanize::RobotsDisallowedError.new(uri)
  end

  # Add If-Modified-Since if page is in history
  if (page = visited_page(uri)) and page.response['Last-Modified']
    request['If-Modified-Since'] = page.response['Last-Modified']
  end if(@conditional_requests)

  # Specify timeouts if given
  connection.open_timeout = @open_timeout if @open_timeout
  connection.read_timeout = @read_timeout if @read_timeout

  request_log request

  response_body_io = nil

  # Send the request
  # The body is read inside the block and handed out through
  # response_body_io; the block's value is the response itself.
  response = connection.request(uri, request) { |res|
    response_log res

    response_body_io = response_read res, request

    res
  }

  response_body = response_content_encoding response, response_body_io

  post_connect uri, response, response_body

  page = response_parse response, response_body, uri

  response_cookies response, uri, page

  # A followed refresh takes precedence over the response-class handling.
  meta = response_follow_meta_refresh response, uri, page, redirects
  return meta if meta

  case response
  when Net::HTTPSuccess
    if robots && page.is_a?(Mechanize::Page)
      page.parser.noindex? and raise Mechanize::RobotsDisallowedError.new(uri)
    end

    page
  when Mechanize::FileResponse
    page
  when Net::HTTPNotModified
    # Prefer the copy already in the history over the freshly parsed page.
    log.debug("Got cached page") if log
    visited_page(uri) || page
  when Net::HTTPRedirection
    response_redirect response, method, page, redirects
  when Net::HTTPUnauthorized
    response_authenticate(response, page, uri, request, headers, params,
                          referer)
  else
    raise Mechanize::ResponseCodeError.new(page), "Unhandled response"
  end
end

#get_robots(uri) ⇒ Object

:nodoc:



726
727
728
729
730
731
# File 'lib/mechanize/http/agent.rb', line 726

# Fetches +uri+ and returns the body of the result. A 404 response code is
# treated as an empty robots.txt; every other Mechanize::ResponseCodeError is
# re-raised.
def get_robots(uri) # :nodoc:
  begin
    fetch(uri).body
  rescue Mechanize::ResponseCodeError => error
    raise error unless error.response_code == '404'

    ''
  end
end

#http_request(uri, method, params = nil) ⇒ Object



253
254
255
256
257
258
259
260
261
262
263
264
265
# File 'lib/mechanize/http/agent.rb', line 253

# Builds the request object for +uri+.
#
# For http and https the Net::HTTP request class is looked up from +method+
# (:get becomes Net::HTTP::Get) and, when +params+ is given, its first
# element becomes the request body. file URIs get a Mechanize::FileRequest.
# Any other scheme yields nil.
def http_request uri, method, params = nil
  scheme = uri.scheme.downcase

  if scheme == 'file'
    Mechanize::FileRequest.new uri
  elsif scheme == 'http' || scheme == 'https'
    request_class = Net::HTTP.const_get(method.to_s.capitalize)

    request = request_class.new(uri.request_uri)
    request.body = params.first if params

    request
  end
end

#logObject



267
268
269
# File 'lib/mechanize/http/agent.rb', line 267

# Returns whatever Mechanize.log returns. Callers in this class treat a
# falsy result as "logging disabled".
def log
  Mechanize.log
end

#max_historyObject



245
246
247
# File 'lib/mechanize/http/agent.rb', line 245

# Returns the history's max_size.
def max_history
  @history.max_size
end

#max_history=(length) ⇒ Object



249
250
251
# File 'lib/mechanize/http/agent.rb', line 249

# Sets the history's max_size to +length+.
def max_history=(length)
  @history.max_size = length
end

#post_connect(uri, response, body) ⇒ Object

Invokes hooks added to post_connect_hooks after a response is returned and the response body is handled.

Yields the context, the uri for the request, the response and the response body.



278
279
280
281
282
# File 'lib/mechanize/http/agent.rb', line 278

# Runs every post-connect hook, in registration order, with this agent, the
# request +uri+, the +response+ and the decoded response +body+.
def post_connect uri, response, body # :yields: agent, uri, response, body
  @post_connect_hooks.each { |hook| hook.call(self, uri, response, body) }
end

#pre_connect(request) ⇒ Object

Invokes hooks added to pre_connect_hooks before a request is made. Yields the agent and the request that will be performed to each hook.



288
289
290
291
292
# File 'lib/mechanize/http/agent.rb', line 288

# Runs every pre-connect hook, in registration order, with this agent and
# the +request+ that is about to be sent.
def pre_connect request # :yields: agent, request
  @pre_connect_hooks.each { |hook| hook.call(self, request) }
end

#request_add_headers(request, headers = {}) ⇒ Object



347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
# File 'lib/mechanize/http/agent.rb', line 347

# Copies headers onto +request+: first the agent-wide request headers, then
# the per-call +headers+, which therefore win on conflict.
#
# In +headers+ the symbols :etag and :if_modified_since are shorthands for
# the "ETag" and "If-Modified-Since" fields. Any other symbol raises
# ArgumentError; non-symbol keys are used as field names unchanged.
def request_add_headers request, headers = {}
  @request_headers.each { |name, value| request[name] = value }

  shorthand = { :etag => "ETag", :if_modified_since => "If-Modified-Since" }

  headers.each do |field, value|
    name = field
    if Symbol === field
      name = shorthand.fetch(field) {
        raise ArgumentError, "unknown header symbol #{field}"
      }
    end

    request[name] = value
  end
end

#request_auth(request, uri) ⇒ Object



294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
# File 'lib/mechanize/http/agent.rb', line 294

# Adds credentials to +request+ when an authentication type has been
# recorded for the host of +uri+; does nothing otherwise.
#
# For digest authentication the user and password are written into +uri+
# itself before the Authorization header is computed.
def request_auth request, uri
  scheme = @auth_hash[uri.host]

  if scheme == :basic
    request.basic_auth @user, @password
  elsif scheme == :digest || scheme == :iis_digest
    uri.user     = @user
    uri.password = @password

    request['Authorization'] =
      @digest_auth.auth_header(uri, @digest, request.method,
                               scheme == :iis_digest)
  end
end

#request_cookies(request, uri) ⇒ Object



314
315
316
317
318
319
320
321
322
# File 'lib/mechanize/http/agent.rb', line 314

# Adds a Cookie header to +request+ built from the jar's cookies for +uri+,
# joined with "; ". Nothing is added when the jar has no cookies for +uri+.
def request_cookies request, uri
  unless @cookie_jar.empty? uri
    jar_cookies = @cookie_jar.cookies uri

    request.add_field('Cookie', jar_cookies.join('; ')) unless jar_cookies.empty?
  end
end

#request_host(request, uri) ⇒ Object



324
325
326
327
328
329
# File 'lib/mechanize/http/agent.rb', line 324

# Sets the Host header of +request+ from +uri+.
#
# The port is appended unless it is the default port of the URI's own
# scheme. Comparing against the scheme default (rather than a fixed list of
# 80 and 443) keeps the port for URIs such as http://host:443/ or
# https://host:80/, where omitting it would name a different authority.
# URIs without a port (e.g. file URIs) produce a Host without a port.
def request_host request, uri
  port = uri.port == uri.default_port ? nil : uri.port
  host = uri.host

  request['Host'] = [host, port].compact.join ':'
end

#request_language_charset(request) ⇒ Object



331
332
333
334
# File 'lib/mechanize/http/agent.rb', line 331

# Sets the fixed accept-charset and accept-language headers on +request+.
def request_language_charset request
  charsets  = 'ISO-8859-1,utf-8;q=0.7,*;q=0.7'
  languages = 'en-us,en;q=0.5'

  request['accept-charset']  = charsets
  request['accept-language'] = languages
end

#request_log(request) ⇒ Object

Log specified headers for the request



337
338
339
340
341
342
343
344
345
# File 'lib/mechanize/http/agent.rb', line 337

# Logs +request+ when a logger is available: its class and path at info
# level, then one debug line per header.
def request_log request
  if log
    log.info("#{request.class}: #{request.path}")

    request.each_header { |name, value|
      log.debug("request-header: #{name} => #{value}")
    }
  end
end

#request_referer(request, uri, referer) ⇒ Object



364
365
366
367
368
369
370
# File 'lib/mechanize/http/agent.rb', line 364

# Sets the Referer header of +request+ to +referer+.
#
# Nothing is set when there is no referer, or when the referer is an https
# URI and the target +uri+ is not (so secure URLs are not leaked to
# non-secure requests).
def request_referer request, uri, referer
  return unless referer

  downgrade = 'https' == referer.scheme.downcase &&
              'https' != uri.scheme.downcase

  request['Referer'] = referer unless downgrade
end

#request_user_agent(request) ⇒ Object



372
373
374
# File 'lib/mechanize/http/agent.rb', line 372

# Sets the User-Agent header of +request+ when a user agent string is
# configured; leaves the request alone otherwise.
def request_user_agent request
  return unless @user_agent

  request['User-Agent'] = @user_agent
end

#resolve(uri, referer = current_page) ⇒ Object



376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
# File 'lib/mechanize/http/agent.rb', line 376

# Turns +uri+ into an absolute http, https or file URI and returns it.
#
# uri::     a URI (duplicated, never modified) or any object whose to_s is a
#           URL string; strings are escaped before parsing
# referer:: page used as the base for relative URIs (default: #current_page)
#
# Raises ArgumentError when a relative URI has no referer to resolve against
# or when the final scheme is not http, https or file. The scheme handler
# looked up in @scheme_handlers may raise as well.
def resolve(uri, referer = current_page)
  uri = uri.dup if uri.is_a?(URI)

  unless uri.is_a?(URI)
    # Percent-escape every character outside the range 0.chr..126.chr.
    uri = uri.to_s.strip.gsub(/[^#{0.chr}-#{126.chr}]/o) { |match|
      if RUBY_VERSION >= "1.9.0"
        Mechanize::Util.uri_escape(match)
      else
        sprintf('%%%X', match.unpack($KCODE == 'UTF8' ? 'U' : 'C')[0])
      end
    }

    # Split the string around existing %XX runs and '#' so that only the
    # pieces in between are escaped; the separators are re-inserted as is.
    unescaped = uri.split(/(?:%[0-9A-Fa-f]{2})+|#/)
    escaped   = uri.scan(/(?:%[0-9A-Fa-f]{2})+|#/)

    escaped_uri = Mechanize::Util.html_unescape(
      unescaped.zip(escaped).map { |x,y|
        "#{WEBrick::HTTPUtils.escape(x)}#{y}"
      }.join('')
    )

    # If the escaped string still fails to parse, escape it once more.
    begin
      uri = URI.parse(escaped_uri)
    rescue
      uri = URI.parse(WEBrick::HTTPUtils.escape(escaped_uri))
    end
  end

  # Relative URIs use the handler registered under 'relative'.
  scheme = uri.relative? ? 'relative' : uri.scheme.downcase
  uri = @scheme_handlers[scheme].call(uri, referer)

  # A relative URI with an empty path inherits the referer's path.
  if referer && referer.uri
    if uri.path.length == 0 && uri.relative?
      uri.path = referer.uri.path
    end
  end

  uri.path = '/' if uri.path.length == 0

  if uri.relative?
    raise ArgumentError, "absolute URL needed (not #{uri})" unless
      referer && referer.uri

    base = nil
    if referer.respond_to?(:bases) && referer.parser
      base = referer.bases.last
    end

    # Prefer the referer's last base URI when it is absolute.
    uri = ((base && base.uri && base.uri.absolute?) ?
           base.uri :
           referer.uri) + uri
    # NOTE(review): uri should already be absolute after the merge above, so
    # this second merge looks redundant — confirm before removing.
    uri = referer.uri + uri
    # Strip initial "/.." bits from the path
    uri.path.sub!(/^(\/\.\.)+(?=\/)/, '')
  end

  unless ['http', 'https', 'file'].include?(uri.scheme.downcase)
    raise ArgumentError, "unsupported scheme: #{uri.scheme}"
  end

  uri
end

#resolve_parameters(uri, method, parameters) ⇒ Object



439
440
441
442
443
444
445
446
447
448
449
450
451
452
# File 'lib/mechanize/http/agent.rb', line 439

# Decides where request +parameters+ travel and returns [uri, body_params].
#
# For :head, :get, :delete and :trace any parameters are appended to the
# query string of +uri+ (modified in place) and the second element is nil.
# For every other method +uri+ and +parameters+ are returned untouched.
def resolve_parameters uri, method, parameters
  unless [:head, :get, :delete, :trace].include?(method)
    return uri, parameters
  end

  if parameters && parameters.length > 0
    uri.query ||= ''
    uri.query << '&' unless uri.query.empty?
    uri.query << Mechanize::Util.build_query_string(parameters)
  end

  [uri, nil]
end

#response_authenticate(response, page, uri, request, headers, params, referer) ⇒ Object



621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
# File 'lib/mechanize/http/agent.rb', line 621

# Handles a response that demands authentication by recording the scheme to
# use for the host of +uri+ and repeating the request through #fetch.
#
# The scheme is :digest when the www-authenticate header mentions Digest
# (:iis_digest when the server header names Microsoft-IIS), :basic otherwise.
#
# Raises Mechanize::ResponseCodeError when neither a user nor a password is
# configured, or when a scheme was already recorded for the host (i.e. the
# credentials have been tried).
def response_authenticate(response, page, uri, request, headers, params,
                          referer)
  if !(@user || @password) || @auth_hash.has_key?(uri.host)
    raise Mechanize::ResponseCodeError, page
  end

  challenge = response['www-authenticate']

  if challenge =~ /Digest/i
    iis = response['server'] =~ /Microsoft-IIS/
    @auth_hash[uri.host] = iis ? :iis_digest : :digest
    @digest = challenge
  else
    @auth_hash[uri.host] = :basic
  end

  retry_method = request.method.downcase.to_sym

  fetch uri, retry_method, headers, params, referer
end

#response_content_encoding(response, body_io) ⇒ Object



454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
# File 'lib/mechanize/http/agent.rb', line 454

# Decodes the response body held in +body_io+ according to the response's
# Content-Encoding header and returns it as a String.
#
# * no header, 'none' or '7bit': the body is returned as is
# * 'deflate': zlib inflate, retried as a raw deflate stream on failure;
#   '' when both attempts fail
# * 'gzip' / 'x-gzip': gunzip; on a gzip-level error the first 10 bytes are
#   skipped and the rest is inflated as a raw deflate stream; '' when the
#   gunzip itself fails with Zlib::DataError
#
# For the compressed encodings nil is returned when the length (the
# Content-Length, or the body size when that is absent) is zero.
# Raises Mechanize::Error for any other encoding.
def response_content_encoding response, body_io
  length   = response.content_length || body_io.length
  encoding = response['Content-Encoding']

  return body_io.string if [nil, 'none', '7bit'].include?(encoding)

  if encoding == 'deflate'
    log.debug('deflate body') if log

    return if length.zero?

    begin
      Zlib::Inflate.inflate body_io.string
    rescue Zlib::BufError, Zlib::DataError
      log.error('Unable to inflate page, retrying with raw deflate') if log
      begin
        Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(body_io.string)
      rescue Zlib::BufError, Zlib::DataError => error
        log.error("unable to inflate page: #{error}") if log
        ''
      end
    end
  elsif encoding == 'gzip' || encoding == 'x-gzip'
    log.debug('gzip body') if log

    return if length.zero?

    reader = nil
    begin
      reader = Zlib::GzipReader.new body_io
      reader.read
    rescue Zlib::BufError, Zlib::GzipFile::Error
      log.error('Unable to gunzip body, trying raw inflate') if log
      # Skip what is taken to be the 10-byte gzip header.
      body_io.rewind
      body_io.read 10
      Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(body_io.read)
    rescue Zlib::DataError => error
      log.error("unable to gunzip page: #{error}") if log
      ''
    ensure
      reader.close if reader && !reader.closed?
    end
  else
    raise Mechanize::Error, "Unsupported Content-Encoding: #{encoding}"
  end
end

#response_cookies(response, uri, page) ⇒ Object



501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
# File 'lib/mechanize/http/agent.rb', line 501

# Stores the cookies delivered with a response in the cookie jar.
#
# Cookies come from two places: <meta http-equiv="Set-Cookie"> elements in
# the head of an HTML +page+ (only searched when the body mentions
# Set-Cookie), and the Set-Cookie fields of +response+.
def response_cookies response, uri, page
  # Shared by both sources: log the parsed cookie, then add it to the jar.
  save = Proc.new { |cookie|
    log.debug("saved cookie: #{cookie}") if log
    @cookie_jar.add(uri, cookie)
  }

  if Mechanize::Page === page && page.body =~ /Set-Cookie/n
    page.search('//head/meta[@http-equiv="Set-Cookie"]').each do |meta|
      Mechanize::Cookie.parse(uri, meta['content'], &save)
    end
  end

  header_cookies = response.get_fields 'Set-Cookie'

  return unless header_cookies

  header_cookies.each { |field| Mechanize::Cookie.parse(uri, field, &save) }
end

#response_follow_meta_refresh(response, uri, page, redirects) ⇒ Object



523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
# File 'lib/mechanize/http/agent.rb', line 523

# Follows a refresh instruction found in +page+ or +response+ and returns
# the page fetched from the refresh target. Returns nil when meta refresh
# handling is disabled or no refresh target is present.
#
# An HTML meta refresh on +page+ takes precedence over a Refresh response
# header. The current page is pushed onto the history before the target is
# fetched, and the refresh delay is honoured with sleep.
#
# redirects:: number of redirects already followed in this request chain
#
# Raises Mechanize::RedirectLimitReachedError when following the refresh
# would exceed the redirection limit. This applies to both sources, so a
# page that keeps refreshing to itself cannot recurse without bound.
# Raises Mechanize::Error when the Refresh header cannot be parsed.
def response_follow_meta_refresh response, uri, page, redirects
  return unless @follow_meta_refresh

  redirect_uri = nil
  referer      = page

  if page.respond_to?(:meta_refresh) and (redirect = page.meta_refresh.first)
    # Checked before sleeping, mirroring the Refresh header branch below.
    raise Mechanize::RedirectLimitReachedError.new(page, redirects) if
      redirects + 1 > @redirection_limit
    redirect_uri = Mechanize::Util.uri_unescape redirect.uri.to_s
    sleep redirect.node['delay'].to_f
    # The target is fetched with a fresh, empty HTML page as its referer.
    referer = Mechanize::Page.new(nil, {'content-type'=>'text/html'})
  elsif refresh = response['refresh']
    delay, redirect_uri = Mechanize::Page::MetaRefresh.parse refresh, uri
    raise Mechanize::Error, 'Invalid refresh http header' unless delay
    raise Mechanize::RedirectLimitReachedError.new(page, redirects) if
      redirects + 1 > @redirection_limit
    sleep delay.to_f
  end

  if redirect_uri
    @history.push(page, page.uri)
    fetch redirect_uri, :get, {}, [], referer, redirects + 1
  end
end

#response_log(response) ⇒ Object



547
548
549
550
551
552
553
554
555
556
# File 'lib/mechanize/http/agent.rb', line 547

# Logs +response+ when a logger is available: its class, HTTP version, code
# and message at info level, then one debug line per header.
def response_log response
  if log
    status = "status: #{response.class} #{response.http_version} " \
             "#{response.code} #{response.message}"
    log.info(status)

    response.each_header { |name, value|
      log.debug("response-header: #{name} => #{value}")
    }
  end
end

#response_parse(response, body, uri) ⇒ Object



558
559
560
# File 'lib/mechanize/http/agent.rb', line 558

# Delegates to the context's #parse with +uri+, +response+ and +body+ and
# returns its result, which #fetch treats as the page.
def response_parse response, body, uri
  @context.parse uri, response, body
end

#response_read(response, request) ⇒ Object



562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
# File 'lib/mechanize/http/agent.rb', line 562

# Reads the body of +response+ into a binary StringIO, which is returned
# rewound to its start.
#
# Raises Mechanize::ResponseReadError (carrying the partial body) when the
# connection fails mid-read, Mechanize::ResponseCodeError for an unknown
# response class, and EOFError when a Content-Length is present but differs
# from the number of bytes read. The length check is skipped for HEAD
# requests and for redirect responses.
def response_read response, request
  buffer = StringIO.new
  buffer.set_encoding Encoding::BINARY if buffer.respond_to? :set_encoding
  bytes_read = 0

  begin
    response.read_body do |chunk|
      bytes_read += chunk.length
      buffer.write(chunk)
      log.debug("Read #{chunk.length} bytes (#{bytes_read} total)") if log
    end
  rescue Net::HTTP::Persistent::Error => error
    buffer.rewind
    raise Mechanize::ResponseReadError.new(error, response, buffer)
  end

  buffer.rewind

  if Net::HTTPUnknownResponse === response
    raise Mechanize::ResponseCodeError, response
  end

  expected = response.content_length

  length_exempt = Net::HTTP::Head === request ||
                  Net::HTTPRedirection === response

  if !length_exempt && expected && expected != buffer.length
    raise EOFError, "Content-Length (#{expected}) does not match " \
                    "response body length (#{buffer.length})"
  end

  buffer
end

#response_redirect(response, method, page, redirects) ⇒ Object



594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
# File 'lib/mechanize/http/agent.rb', line 594

# Follows the redirect described by +response+ according to the redirect
# policy and returns the page finally fetched.
#
# +page+ itself is returned when the policy is false/nil, or when it is
# :permanent and the response is not a 301. Otherwise the Location is
# fetched (HEAD stays HEAD, every other method becomes GET) and the result
# is pushed onto the history under the URI of the redirecting page.
#
# Raises Mechanize::RedirectLimitReachedError when following would exceed
# the redirection limit.
def response_redirect response, method, page, redirects
  return page unless @redirect_ok

  if @redirect_ok == :permanent && !(Net::HTTPMovedPermanently === response)
    return page
  end

  log.info("follow redirect to: #{response['Location']}") if log

  from_uri = page.uri

  if redirects + 1 > @redirection_limit
    raise Mechanize::RedirectLimitReachedError.new(page, redirects)
  end

  redirect_method = :get
  redirect_method = :head if method == :head

  target = fetch(response['Location'].to_s, redirect_method, {}, [], page,
                 redirects + 1)

  @history.push(target, from_uri)

  target
end

#robots_allowed?(uri) ⇒ Boolean

Tests if this agent is allowed to access url, consulting the site’s robots.txt.

Returns:

  • (Boolean)


649
650
651
652
653
# File 'lib/mechanize/http/agent.rb', line 649

# Tells whether robots.txt permits access to +uri+. The robots.txt file
# itself is always allowed, without consulting the robots checker.
def robots_allowed? uri
  uri.request_uri == '/robots.txt' || webrobots.allowed?(uri)
end

#robots_disallowed?(url) ⇒ Boolean

Opposite of robots_allowed?

Returns:

  • (Boolean)


657
658
659
# File 'lib/mechanize/http/agent.rb', line 657

# Logical negation of robots_allowed? for +url+.
def robots_disallowed? url
  robots_allowed?(url) ? false : true
end

#robots_error(url) ⇒ Object

Returns an error object if there is an error in fetching or parsing robots.txt of the site url.



663
664
665
# File 'lib/mechanize/http/agent.rb', line 663

# Delegates to the robots checker's #error for +url+ and returns its result.
def robots_error(url)
  webrobots.error(url)
end

#robots_error!(url) ⇒ Object

Raises the error if there is an error in fetching or parsing robots.txt of the site url.



669
670
671
# File 'lib/mechanize/http/agent.rb', line 669

# Delegates to the robots checker's #error! for +url+.
def robots_error!(url)
  webrobots.error!(url)
end

#robots_reset(url) ⇒ Object

Removes robots.txt cache for the site url.



674
675
676
# File 'lib/mechanize/http/agent.rb', line 674

# Delegates to the robots checker's #reset for +url+.
def robots_reset(url)
  webrobots.reset(url)
end

#set_httpObject



678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
# File 'lib/mechanize/http/agent.rb', line 678

# Creates the persistent HTTP connection and applies the agent's proxy,
# keep-alive and SSL settings to it.
#
# A client certificate and private key are installed only when both are
# configured. Either may be given as an OpenSSL object or as a file path;
# a key file is loaded as an RSA key using the configured key password.
def set_http
  @http = Net::HTTP::Persistent.new 'mechanize', @proxy_uri

  @http.keep_alive      = @keep_alive_time
  @http.ca_file         = @ca_file
  @http.verify_callback = @verify_callback

  return unless @cert && @key

  certificate = @cert
  unless OpenSSL::X509::Certificate === certificate
    certificate = OpenSSL::X509::Certificate.new ::File.read(certificate)
  end

  private_key = @key
  unless OpenSSL::PKey::PKey === private_key
    private_key = OpenSSL::PKey::RSA.new ::File.read(private_key), @pass
  end

  @http.certificate = certificate
  @http.private_key = private_key
end

#set_proxy(addr, port, user = nil, pass = nil) ⇒ Object

Sets the proxy address, port, user, and password. addr should be a host, with no “http://”.



706
707
708
709
710
711
712
713
714
# File 'lib/mechanize/http/agent.rb', line 706

# Builds the proxy URI from +addr+ (a bare host name) and +port+, adding
# +user+ and +pass+ when given, and returns it.
#
# Does nothing and returns nil unless both +addr+ and +port+ are given; a
# previously configured proxy is left in place in that case.
def set_proxy(addr, port, user = nil, pass = nil)
  if addr && port
    proxy = (@proxy_uri = URI("http://#{addr}"))

    proxy.port     = port
    proxy.user     = user if user
    proxy.password = pass if pass

    proxy
  end
end

#visited_page(url) ⇒ Object

Returns a visited page for the url passed in, otherwise nil



722
723
724
# File 'lib/mechanize/http/agent.rb', line 722

# Resolves +url+ and asks the history for a page visited at that address;
# returns whatever the history returns for it.
def visited_page url
  resolved = resolve(url)

  @history.visited_page(resolved)
end

#webrobotsObject



733
734
735
# File 'lib/mechanize/http/agent.rb', line 733

# Returns the robots.txt checker, creating it on first use with the agent's
# user agent string and get_robots as its fetcher.
def webrobots
  return @webrobots if @webrobots

  @webrobots = WebRobots.new(@user_agent, :http_get => method(:get_robots))
end