Class: TinyGrabber::Agent

Inherits:
Object
  • Object
show all
Defined in:
lib/tiny_grabber/agent.rb

Constant Summary collapse

AGENT_ALIASES =

User-agent strings taken from www.useragentstring.com/pages/Chrome/

# Pool of browser User-Agent strings, grouped by browser family.
# The constructor assigns one of these entries, chosen at random, to each new agent.
[
  # Chrome
  'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36',
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36',
  'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36',
  # Firefox
  'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1',
  'Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0',
  'Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0',
  # Internet Explorer
  'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko',
  'Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0',
  'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 7.0; InfoPath.3; .NET CLR 3.1.40767; Trident/6.0; en-IN)',
  'Mozilla/5.0 (compatible; MSIE 10.0; Macintosh; Intel Mac OS X 10_7_3; Trident/6.0)',
  # Opera
  'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
  'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14',
  'Mozilla/5.0 (Windows NT 6.0; rv:2.0) Gecko/20100101 Firefox/4.0 Opera 12.14',
  'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52'
].freeze

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Agent

Initializes the agent with its default settings



52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# File 'lib/tiny_grabber/agent.rb', line 52

# Builds an agent with default settings.
#
# A User-Agent string is picked at random from AGENT_ALIASES; every other
# attribute starts at the default shown below and can be changed through
# the corresponding writer before calling #fetch.
def initialize
  @debug = Debug.new

  # Request attributes.
  # Array#sample picks uniformly from the list. The earlier
  # `rand(count) - 1` indexing only covered every entry because index -1
  # wraps around to the last element.
  @user_agent = AGENT_ALIASES.sample
  @proxy = []
  @basic_auth = {}
  @headers = {}
  @cookies = nil
  @follow_location = false
  @read_timeout = 10
  # URI object of the current request (set by #convert_to_uri)
  @uri = nil
  # Object used to open connections; #send_request calls +start+ on it
  @http = Net::HTTP
  # Response of the most recent request
  @response = nil
  # Certificate verification is disabled by default
  @verify_mode = OpenSSL::SSL::VERIFY_NONE
end

Instance Attribute Details

#basic_auth=(basic_auth) ⇒ Object (writeonly)

Set BASIC_AUTH agent attribute

Parameters:

  • basic_auth

    Authentication configuration



16
17
18
# File 'lib/tiny_grabber/agent.rb', line 16

# Sets the HTTP basic-auth credentials used by subsequent requests.
#
# @param value [Hash] credentials; request setup reads the +:username+ and
#   +:password+ keys, and skips basic auth entirely when the value is empty
def basic_auth=(value)
  @basic_auth = value
end

#cookies ⇒ Object

Cookies



20
21
22
# File 'lib/tiny_grabber/agent.rb', line 20

# Returns the cookies currently held by the agent.
#
# @return [Object, nil] the cookies stored from the last saved response, or
#   nil when none are stored (the initial state, and the state after #reset)
def cookies
  @cookies
end

#debug=(debug) ⇒ Object (writeonly)

Set debug configuration

Parameters:

  • debug


8
9
10
# File 'lib/tiny_grabber/agent.rb', line 8

# Replaces the debug helper the agent logs through.
#
# @param value [Object] helper object; the agent queries it via +active+ and
#   +save_html+ and writes to it via +save+ and +save_to_file+
def debug=(value)
  @debug = value
end

#follow_location=(follow_location) ⇒ Object (writeonly)

Enable or disable following redirects

Parameters:

  • follow_location

    Follow location flag



24
25
26
# File 'lib/tiny_grabber/agent.rb', line 24

# Turns redirect following on or off.
#
# @param value [Boolean] when truthy, #fetch follows the +Location+ header of
#   3xx responses and the target of an HTML meta-refresh tag in 2xx responses
def follow_location=(value)
  @follow_location = value
end

#headers ⇒ Object

Headers



18
19
20
# File 'lib/tiny_grabber/agent.rb', line 18

# Returns the headers the agent will apply to its next request.
#
# @return [Object] starts as an empty Hash; after a response has been saved
#   it holds that response's headers with +transfer-encoding+ removed
def headers
  @headers
end

#proxy ⇒ Object

Remote proxy configuration



14
15
16
# File 'lib/tiny_grabber/agent.rb', line 14

# Returns the remote proxy configuration.
#
# NOTE(review): in the code shown here this value is only written to the
# debug log by #fetch; confirm where it is applied to the connection.
#
# @return [Array] proxy settings; an empty Array by default
def proxy
  @proxy
end

#read_timeout=(read_timeout) ⇒ Object (writeonly)

Set READ_TIMEOUT agent attribute

Parameters:

  • read_timeout

    Time to wait for a read before timing out



10
11
12
# File 'lib/tiny_grabber/agent.rb', line 10

# Sets the read timeout handed to the connection as +read_timeout+ when a
# request is sent. The constructor default is 10.
#
# @param value [Numeric] read timeout
def read_timeout=(value)
  @read_timeout = value
end

#user_agent=(user_agent) ⇒ Object (writeonly)

Set USER_AGENT agent attribute

Parameters:

  • user_agent

    Web browser name



12
13
14
# File 'lib/tiny_grabber/agent.rb', line 12

# Overrides the randomly chosen User-Agent string.
#
# @param value [String, nil] User-Agent to send; a falsy value makes #fetch
#   skip setting the User-Agent header itself
def user_agent=(value)
  @user_agent = value
end

#verify_mode=(value) ⇒ Object (writeonly)

Set verify mode



22
23
24
# File 'lib/tiny_grabber/agent.rb', line 22

# Sets the SSL verification mode handed to the connection as +verify_mode+.
# The constructor default is OpenSSL::SSL::VERIFY_NONE.
#
# @param value [Integer] one of the OpenSSL::SSL::VERIFY_* constants
def verify_mode=(value)
  @verify_mode = value
end

Instance Method Details

#convert_to_uri(url) ⇒ Object

Initialize URI object from request url

Parameters:

  • url

    Request link



248
249
250
251
252
253
254
# File 'lib/tiny_grabber/agent.rb', line 248

# Builds the URI object for a request URL and stores it in @uri.
#
# The fragment ("#...") is dropped, then the URL is unescaped and escaped
# again so that already-encoded and raw URLs both end up encoded exactly once.
#
# URI.escape / URI.unescape were removed in Ruby 3.0, so the RFC 2396 parser
# is used directly; it implements the same escaping and exists in old and
# new Ruby versions alike.
#
# @param url [String] request link
def convert_to_uri(url)
  # Remove anchor
  url = url.gsub(/#.*\Z/, '')
  # Normalise percent-encoding: decode first so nothing is encoded twice
  @uri_parser ||= URI::RFC2396_Parser.new
  @uri = URI(@uri_parser.escape(@uri_parser.unescape(url)))
  @debug.save "-> [uri] = #{@uri}" if @debug.active
end

#fetch(url, method = :get, headers = {}, params = {}) ⇒ Object

Fetch a URL with the GET or POST HTTP method. Sets the USER_AGENT, BASIC_AUTH, HEADERS and COOKIES request attributes, performs the request, and saves the response COOKIES for subsequent requests.

Parameters:

  • url

    Resource link

  • method (defaults to: :get)

    Request method

  • headers (defaults to: {})

    Request header

  • params (defaults to: {})

    Request additional params



186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
# File 'lib/tiny_grabber/agent.rb', line 186

# Performs a GET or POST request and returns the response.
#
# Request setup applies, in order: User-Agent, basic auth, headers, cookies.
# Response headers and cookies are stored on the agent for later requests
# after a 2xx response, and after a 3xx response when redirects are not
# followed. With follow_location enabled, the +Location+ header of a 3xx
# response and the target of an HTML meta-refresh tag in a 2xx response are
# fetched with a further GET request.
#
# @param url [String] resource link
# @param method [Symbol] request method, +:get+ or +:post+
# @param headers [Hash] request headers; when non-empty they replace the
#   headers currently stored on the agent
# @param params [Hash] form data sent with a POST request
# @return [Object] response of the last request performed
# @raise [ArgumentError] if +method+ is neither +:get+ nor +:post+
def fetch(url, method = :get, headers = {}, params = {})
  if @debug.active
    @debug.save '=============================='
    @debug.save "#{method.upcase} #{url}"
    @debug.save "-> [proxy] = #{@proxy}" if @proxy
    @debug.save "-> [params] = #{params}"
    @debug.save '------------------------------'
  end
  convert_to_uri url
  case method
  when :get
    @request = Net::HTTP::Get.new(@uri.request_uri)
  when :post
    @request = Net::HTTP::Post.new(@uri.request_uri)
    @request.set_form_data(params)
  else
    # Without this branch @request would stay nil, or worse, still hold the
    # request object of the previous call and be re-sent to the new host.
    raise ArgumentError, "unsupported HTTP method: #{method.inspect}"
  end
  set_user_agent if @user_agent
  set_basic_auth unless @basic_auth.empty?
  @headers = headers unless headers.empty?
  set_headers if @headers
  set_cookies if @cookies
  @response = send_request
  case @response
  # HTTP response code 1xx
  when Net::HTTPInformation
    @debug.save '<- [response] = Net::HTTPInformation' if @debug.active
  # HTTP response code 2xx
  when Net::HTTPSuccess
    save_headers
    save_cookies
    @debug.save "<- [response] = #{@response.code} Net::HTTPSuccess" if @debug.active
    # Follow meta refresh
    if @follow_location
      refresh = @response.ng.at_css('meta[http-equiv="refresh"]')
      # The content attribute looks like "0; url=http://..."; keep only the URL
      @response = fetch refresh.attr('content').gsub(/\A.*?(http)/, 'http') if refresh
    end
  # HTTP response code 3xx
  when Net::HTTPRedirection
    @debug.save "<- [response] = #{@response.code} Net::HTTPRedirection" if @debug.active
    @debug.save 'try curl user_agent: tg.user_agent=\'curl\'' if @debug.active
    # Follow location
    if @follow_location
      @response = fetch @response.header['Location']
    else
      save_headers
      save_cookies
    end
  # HTTP response code 4xx
  when Net::HTTPClientError
    @debug.save "<- [response] = #{@response.code} Net::HTTPClientError" if @debug.active
  # HTTP response code 5xx
  when Net::HTTPServerError
    @debug.save "<- [response] = #{@response.code} Net::HTTPServerError" if @debug.active
  end
  @debug.save_to_file @response.body if @debug.save_html
  @response
end

#reset ⇒ Object

Clears headers and cookies



323
324
325
326
# File 'lib/tiny_grabber/agent.rb', line 323

# Clears the headers and cookies stored on the agent, so the next request
# carries no state left over from earlier responses.
def reset
  @headers = {}
  @cookies = nil
end

#save_cookies ⇒ Object

Save response cookies in agent attribute



314
315
316
317
318
319
# File 'lib/tiny_grabber/agent.rb', line 314

# Copies the cookies of the current response onto the agent so that later
# requests can send them back. Leaves the stored cookies untouched when the
# response exposes none. The value comes from the response's +cookies+
# method, not from the raw Set-Cookie header.
def save_cookies
  return unless @response.cookies

  @cookies = @response.cookies
  return unless @debug.active

  @debug.save("<- [cookies] = #{@cookies}")
end

#save_headers ⇒ Object

Save response headers in agent attribute



304
305
306
307
308
309
310
# File 'lib/tiny_grabber/agent.rb', line 304

# Keeps the headers of the current response on the agent so that they are
# applied to the next request. Does nothing when the response exposes no
# header object.
def save_headers
  return unless @response.header

  @headers = @response.header
  # transfer-encoding describes the response just received and must not be
  # carried over into the next request of a chain
  @headers.delete('transfer-encoding')
  return unless @debug.active

  @debug.save("<- [headers] = #{@headers}")
end

#send_request ⇒ Object

Send the request and return the response. Uses an SSL connection when the link scheme is HTTPS.



295
296
297
298
299
300
# File 'lib/tiny_grabber/agent.rb', line 295

# Opens a connection to the host and port of the current URI, sends the
# prepared request over it, and returns whatever the connection's +request+
# call returns. SSL is switched on when the URI scheme is "https".
def send_request
  connection_options = {
    use_ssl: @uri.scheme == 'https',
    verify_mode: @verify_mode,
    read_timeout: @read_timeout
  }

  @http.start(@uri.host, @uri.port, **connection_options) do |connection|
    @debug.save("-> [read_timeout] = #{@read_timeout}") if @debug.active
    connection.request(@request)
  end
end

#set_basic_auth ⇒ Object

Set BASIC_AUTH request authentication



265
266
267
268
# File 'lib/tiny_grabber/agent.rb', line 265

# Applies the agent's basic-auth credentials to the pending request, taking
# the user name and password from the +:username+ and +:password+ keys.
def set_basic_auth
  username = @basic_auth[:username]
  password = @basic_auth[:password]
  @request.basic_auth(username, password)
  return unless @debug.active

  @debug.save("-> [basic_auth] = #{@basic_auth}")
end

#set_cookies ⇒ Object

Set request COOKIES



287
288
289
290
# File 'lib/tiny_grabber/agent.rb', line 287

# Sends the cookies stored on the agent back to the server by writing them
# into the +Cookie+ header of the pending request.
def set_cookies
  @request['Cookie'] = @cookies
  return unless @debug.active

  @debug.save("-> [cookies] = #{@cookies}")
end

#set_headers ⇒ Object

Set request HEADERS



272
273
274
275
276
277
278
279
280
281
282
283
# File 'lib/tiny_grabber/agent.rb', line 272

# Copies the agent's stored headers onto the pending request.
#
# Net::HTTP request objects start out with default +Accept+ and
# +User-Agent+ headers. +add_field+ appends to an existing header, which
# would send e.g. "User-Agent: Ruby, <configured agent>". Those two headers
# are therefore replaced (matched case-insensitively), while every other
# header is added with +add_field+ as before.
def set_headers
  replaced = %w[accept user-agent]
  @headers.each do |k, v|
    name = String(k)
    if replaced.include?(name.downcase)
      # Overwrite the Net::HTTP default instead of appending to it
      @request[name] = v
    else
      @request.add_field(name, v)
    end
  end
  @debug.save "-> [headers] = #{@headers}" if @debug.active
end

#set_user_agent ⇒ Object

Set USER_AGENT request attribute



258
259
260
261
# File 'lib/tiny_grabber/agent.rb', line 258

# Records the agent's User-Agent string in the stored headers under the
# +User-Agent+ key; it reaches the request when the headers are applied.
def set_user_agent
  @headers['User-Agent'] = @user_agent
  return unless @debug.active

  @debug.save("-> [user_agent] = #{@user_agent}")
end

#var_to_sym(var, str_to_sym = false) ⇒ Object

Convert a variable and its contents to symbols

Parameters:

  • var

    Variable to convert



332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
# File 'lib/tiny_grabber/agent.rb', line 332

# Recursively symbolizes a nested structure.
#
# Hash keys are always converted with +to_sym+; hash values and array
# elements are processed recursively. String values are converted to
# symbols only when +str_to_sym+ is truthy. Anything else is returned
# unchanged.
#
# @param var [Object] value to convert
# @param str_to_sym [Boolean] also convert String values to symbols
# @return [Object] a new Hash/Array for Hash/Array input, otherwise the
#   converted or original value
def var_to_sym(var, str_to_sym = false)
  case var
  when Hash
    var.each_with_object({}) do |(key, value), converted|
      converted[key.to_sym] = var_to_sym(value, str_to_sym)
    end
  when Array
    var.map { |item| var_to_sym(item, str_to_sym) }
  when String
    str_to_sym ? var.to_sym : var
  else
    var
  end
end