Class: Digger::HTTP

Inherits:
Object
  • Object
show all
Defined in:
lib/digger/http.rb

Constant Summary collapse

REDIRECT_LIMIT =

Maximum number of redirects to follow on each get_response

5
# Exception classes that fetch_pages rescues and reports through an error
# Page instead of letting them propagate. Frozen so that the list splatted
# into the rescue clause cannot be changed by accident at runtime.
RESCUABLE_ERRORS =
[
  EOFError,
  Errno::ECONNREFUSED,
  Errno::ECONNRESET,
  Errno::EHOSTUNREACH,
  Errno::EINVAL,
  Errno::EPIPE,
  Errno::ETIMEDOUT,
  Net::HTTPBadResponse,
  Net::HTTPHeaderSyntaxError,
  Net::ProtocolError,
  SocketError,
  Timeout::Error,
  Zlib::DataError,
  Zlib::GzipFile::Error
].freeze

Instance Method Summary collapse

Constructor Details

#initialize(opts = {}) ⇒ HTTP

Returns a new instance of HTTP.



30
31
32
33
34
# File 'lib/digger/http.rb', line 30

# Sets up a client configured by +opts+.
#
# @param opts [Hash] option hash consulted by the accessor methods of this
#   class (for example :read_timeout or :proxy_host); stored by reference,
#   not copied
def initialize(opts = {})
  # Two separate, initially empty tables for connection bookkeeping.
  @connections, @connections_hits = {}, {}
  @opts = opts
end

Instance Method Details

#accept_cookies? ⇒ Boolean

Does this HTTP client accept cookies from the server?

Returns:

  • (Boolean)


152
153
154
# File 'lib/digger/http.rb', line 152

# Tells whether this client accepts cookies from the server.
#
# @return [Object] the value stored under :accept_cookies in the options,
#   intended to be used for its truthiness
def accept_cookies?
  cookie_setting = @opts[:accept_cookies]
  cookie_setting
end


156
157
158
159
# File 'lib/digger/http.rb', line 156

# Returns the cookie jar held in the options, creating a fresh
# ::HTTP::CookieJar and storing it under :cookie_jar on first use.
#
# @return [Object] the configured or newly created cookie jar
def cookie_jar
  # `||=` already evaluates to the stored jar, so no second lookup is needed.
  @opts[:cookie_jar] ||= ::HTTP::CookieJar.new
end

#fetch_page(url, referer = nil, depth = nil) ⇒ Object

Fetch a single Page from the response of an HTTP request to url. Just gets the final destination page.



40
41
42
# File 'lib/digger/http.rb', line 40

# Fetches +url+ and returns only the last Page produced by fetch_pages,
# i.e. the final destination after any redirects.
#
# @param url [String, URI] address to request
# @param referer [Object, nil] passed through to fetch_pages
# @param depth [Object, nil] passed through to fetch_pages
# @return [Object] the last element of the array returned by fetch_pages
def fetch_page(url, referer = nil, depth = nil)
  all_pages = fetch_pages(url, referer, depth)
  all_pages.last
end

#fetch_pages(url, referer = nil, depth = nil) ⇒ Object

Create new Pages from the response of an HTTP request to url, including redirects



48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# File 'lib/digger/http.rb', line 48

# Requests +url+ and wraps every response yielded by +get+ (redirect hops as
# well as the final response) in a Page.
#
# @param url [String, URI] address to request; converted with Kernel#URI
# @param referer [Object, nil] forwarded to +get+ and recorded on each Page
# @param depth [Object, nil] recorded on each Page
# @return [Array<Page>] pages in the order they were yielded; if one of
#   RESCUABLE_ERRORS is raised, a one-element array whose Page carries the
#   error instead
def fetch_pages(url, referer = nil, depth = nil)
  url = URI(url)
  collected = []

  get(url, referer) do |response, code, location, redirect_to, response_time|
    handle_compression(response) if handle_compression?

    attributes = {
      body: response.body,
      code: code,
      headers: response.to_hash,
      referer: referer,
      depth: depth,
      redirect_to: redirect_to,
      response_time: response_time,
      fetched_at: Time.now.to_i # wall-clock time as integer epoch seconds
    }
    collected.push(Page.new(location, **attributes))
  end

  collected
rescue *RESCUABLE_ERRORS => error
  # Only report the failure on stdout when verbose mode is on.
  if verbose?
    puts error.inspect
    puts error.backtrace
  end

  [Page.new(url, error: error, referer: referer, depth: depth)]
end

#handle_compression? ⇒ Boolean

Whether to accept compressed responses; enabling this may cause encoding errors.

Returns:

  • (Boolean)


76
77
78
# File 'lib/digger/http.rb', line 76

# Tells whether compressed responses should be handled by this client.
#
# @return [Object] the value stored under :handle_compression in the
#   options, intended to be used for its truthiness
def handle_compression?
  compression_setting = @opts[:handle_compression]
  compression_setting
end

#open_timeout ⇒ Object

HTTP open timeout in seconds



146
147
148
# File 'lib/digger/http.rb', line 146

# HTTP open timeout, in seconds, as configured in the options.
#
# @return [Object] the value stored under :open_timeout
def open_timeout
  return @opts[:open_timeout]
end

#proxy_host ⇒ Object

The proxy address string



102
103
104
# File 'lib/digger/http.rb', line 102

# The proxy address. The :proxy_host option may be a plain value or a
# callable; a callable is invoked with this client and its result returned.
#
# @return [Object] the configured value, or the callable's result
def proxy_host
  host = @opts[:proxy_host]
  return host.call(self) if host.respond_to?(:call)

  host
end

#proxy_host_port ⇒ Object

Shorthand to get proxy info with a single call. It returns an array of [‘addr’, port, ‘user’, ‘pass’].



132
133
134
# File 'lib/digger/http.rb', line 132

# Shorthand for the whole proxy configuration in one call; documented as an
# array of [addr, port, user, pass]. The :proxy_host_port option may be a
# plain value or a callable; a callable is invoked with this client.
#
# @return [Object] the configured value, or the callable's result
def proxy_host_port
  combined = @opts[:proxy_host_port]
  return combined.call(self) if combined.respond_to?(:call)

  combined
end

#proxy_pass ⇒ Object

The proxy password



123
124
125
126
# File 'lib/digger/http.rb', line 123

# The proxy password. The :proxy_pass option may be a plain value or a
# callable; a callable is invoked with this client and its result returned.
#
# NOTE: only :proxy_pass is consulted; the :proxy_host_port shorthand is not
# read here.
#
# @return [Object] the configured value, or the callable's result
def proxy_pass
  password = @opts[:proxy_pass]
  return password.call(self) if password.respond_to?(:call)

  password
end

#proxy_port ⇒ Object

The proxy port



109
110
111
# File 'lib/digger/http.rb', line 109

# The proxy port. The :proxy_port option may be a plain value or a callable;
# a callable is invoked with this client and its result returned.
#
# @return [Object] the configured value, or the callable's result
def proxy_port
  port = @opts[:proxy_port]
  return port.call(self) if port.respond_to?(:call)

  port
end

#proxy_user ⇒ Object

The proxy username



116
117
118
# File 'lib/digger/http.rb', line 116

# The proxy username. The :proxy_user option may be a plain value or a
# callable; a callable is invoked with this client and its result returned.
#
# @return [Object] the configured value, or the callable's result
def proxy_user
  username = @opts[:proxy_user]
  return username.call(self) if username.respond_to?(:call)

  username
end

#read_timeout ⇒ Object

HTTP read timeout in seconds



139
140
141
# File 'lib/digger/http.rb', line 139

# HTTP read timeout, in seconds, as configured in the options.
#
# @return [Object] the value stored under :read_timeout
def read_timeout
  return @opts[:read_timeout]
end

#redirect_limit ⇒ Object

The maximum number of redirects to follow



83
84
85
# File 'lib/digger/http.rb', line 83

# Maximum number of redirects to follow.
#
# @return [Object] the :redirect_limit option when it is set to a truthy
#   value, otherwise the REDIRECT_LIMIT constant
def redirect_limit
  configured = @opts[:redirect_limit]
  return configured if configured

  REDIRECT_LIMIT
end

#user_agent ⇒ Object

The user-agent string which will be sent with each request, or nil if no such option is set



91
92
93
94
95
96
97
# File 'lib/digger/http.rb', line 91

# The user-agent string sent with each request, or nil when the option is
# not set. If the :user_agent option responds to +sample+ (e.g. an Array),
# one entry is picked via +sample+ on every call.
#
# @return [Object, nil] the configured value or a sampled entry
def user_agent
  agent = @opts[:user_agent]
  agent.respond_to?(:sample) ? agent.sample : agent
end