Class: TinyGrabber::Agent

Inherits:
Object
  • Object
show all
Defined in:
lib/tiny_grabber/agent.rb

Constant Summary collapse

AGENT_ALIASES =

User-agent strings taken from www.useragentstring.com/pages/Chrome/

# Browser User-Agent strings, grouped by browser family.
# #initialize picks one of these entries at random for @user_agent.
[
  # Chrome
  'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36',
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36',
  'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36',
  # Firefox
  'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1',
  'Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0',
  'Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0',
  # Internet Explorer
  'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko',
  'Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0',
  'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 7.0; InfoPath.3; .NET CLR 3.1.40767; Trident/6.0; en-IN)',
  'Mozilla/5.0 (compatible; MSIE 10.0; Macintosh; Intel Mac OS X 10_7_3; Trident/6.0)',
  # Opera
  'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
  'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14',
  'Mozilla/5.0 (Windows NT 6.0; rv:2.0) Gecko/20100101 Firefox/4.0 Opera 12.14',
  'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52'
].freeze

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initializeAgent

Initializes the agent object with its default settings



52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# File 'lib/tiny_grabber/agent.rb', line 52

# Builds an agent with its default request settings.
#
# Creates the debug helper, picks a random User-Agent string from
# AGENT_ALIASES and sets proxy, basic auth, headers, cookies, redirect
# handling, read timeout and SSL verify mode to their initial values.
def initialize
  @debug = Debug.new

  # Initialize variables agent attributes
  # Array#sample draws one entry uniformly; the former
  # `AGENT_ALIASES[rand(count) - 1]` reached the last entry only via index -1.
  @user_agent = AGENT_ALIASES.sample
  # NOTE(review): an empty Array is truthy, so the `if @proxy` check in #fetch
  # always passes - confirm whether nil was intended here.
  @proxy = []
  @basic_auth = {}
  @headers = {}
  @cookies = nil
  @follow_location = false
  @read_timeout = 10
  # Initialize variable for URI object
  @uri = nil
  # Initialize variable for Net::HTTP request object
  @http = Net::HTTP
  # Initialize variable for Net::HTTP response object
  @response = nil
  # NOTE(review): certificate verification is switched off by default;
  # callers can change it through #verify_mode=.
  @verify_mode = OpenSSL::SSL::VERIFY_NONE
end

Instance Attribute Details

#basic_auth=(basic_auth) ⇒ Object (writeonly)

Set BASIC_AUTH agent attribute

Parameters:

  • basic_auth

    Authentication configuration



16
17
18
# File 'lib/tiny_grabber/agent.rb', line 16

# Stores the basic authentication settings in @basic_auth.
#
# #fetch applies them through #set_basic_auth unless the stored value is
# empty; #set_basic_auth reads the :username and :password entries.
#
# @param value authentication configuration
def basic_auth=(value)
  @basic_auth = value
end

#cookies=(cookies) ⇒ Object (writeonly)

Set COOKIES agent attribute

Parameters:

  • cookies

    Request cookies



20
21
22
# File 'lib/tiny_grabber/agent.rb', line 20

# Stores the request cookies in @cookies.
#
# When the stored value is truthy, #fetch sends it as the 'Cookie' request
# header through #set_cookies. #save_cookies overwrites it with the response
# cookies after a successful (2xx) response.
#
# @param value request cookies
def cookies=(value)
  @cookies = value
end

#debug=(debug) ⇒ Object (writeonly)

Set debug configuration

Parameters:

  • debug


8
9
10
# File 'lib/tiny_grabber/agent.rb', line 8

# Replaces the debug helper stored in @debug.
#
# The agent calls #active, #save, #save_html and #save_to_file on this object.
#
# @param value debug configuration object
def debug=(value)
  @debug = value
end

#follow_location=(follow_location) ⇒ Object (writeonly)

Enable or disable following of redirects

Parameters:

  • follow_location

    Follow location flag



24
25
26
# File 'lib/tiny_grabber/agent.rb', line 24

# Stores the redirect-following flag in @follow_location.
#
# When the flag is truthy, #fetch follows meta-refresh tags in 2xx responses
# and the Location header of 3xx responses.
#
# @param value follow location flag
def follow_location=(value)
  @follow_location = value
end

#headers=(headers) ⇒ Object (writeonly)

Set HEADERS agent attribute

Parameters:

  • headers

    Request headers



18
19
20
# File 'lib/tiny_grabber/agent.rb', line 18

# Stores the request headers in @headers.
#
# #set_headers copies them onto the request. A non-empty headers argument
# passed to #fetch replaces them, and #save_headers overwrites them with the
# response headers after a successful (2xx) response.
#
# @param value request headers
def headers=(value)
  @headers = value
end

#proxyObject

Remote proxy configuration



14
15
16
# File 'lib/tiny_grabber/agent.rb', line 14

# Returns the remote proxy configuration held in @proxy.
#
# #initialize sets it to an empty Array.
def proxy
  @proxy
end

#read_timeout=(read_timeout) ⇒ Object (writeonly)

Set READ_TIMEOUT agent attribute

Parameters:

  • read_timeout

    Time to wait for a response to be read



10
11
12
# File 'lib/tiny_grabber/agent.rb', line 10

# Stores the read timeout in @read_timeout.
#
# #send_request passes it as the read_timeout: option when opening the
# connection.
#
# @param value time to wait while reading the response
def read_timeout=(value)
  @read_timeout = value
end

#user_agent=(user_agent) ⇒ Object (writeonly)

Set USER_AGENT agent attribute

Parameters:

  • user_agent

    Web browser name



12
13
14
# File 'lib/tiny_grabber/agent.rb', line 12

# Stores the User-Agent string in @user_agent.
#
# When the stored value is truthy, #fetch calls #set_user_agent, which copies
# it into @headers under the 'User-Agent' key.
#
# @param value web browser identification string
def user_agent=(value)
  @user_agent = value
end

#verify_mode=(value) ⇒ Object (writeonly)

Set verify mode



22
23
24
# File 'lib/tiny_grabber/agent.rb', line 22

# Stores the SSL verify mode in @verify_mode.
#
# #send_request passes it as the verify_mode: option when opening the
# connection; #initialize sets it to OpenSSL::SSL::VERIFY_NONE.
#
# @param value verify mode handed to the HTTP connection
def verify_mode=(value)
  @verify_mode = value
end

Instance Method Details

#convert_to_uri(url) ⇒ Object

Initialize URI object from request url

Parameters:

  • url

    Request link



243
244
245
246
247
# File 'lib/tiny_grabber/agent.rb', line 243

# Builds the URI object for a request URL and stores it in @uri.
#
# The URL is first unescaped and then escaped again, so a raw link and an
# already percent-encoded link both end up in encoded form.
#
# @param url [String] request link
def convert_to_uri(url)
  # URI.escape / URI.unescape were removed in Ruby 3.0. The default parser
  # object offers the same escaping on both older and newer Rubies.
  parser = URI::DEFAULT_PARSER
  @uri = URI(parser.escape(parser.unescape(url)))
  @debug.save "-> [uri] = #{@uri}" if @debug.active
end

#fetch(url, method = :get, headers = {}, params = {}) ⇒ Object

Performs a GET or POST request. Sets the USER_AGENT, BASIC_AUTH, HEADERS and COOKIES request attributes, sends the request and saves the response COOKIES for subsequent requests.

Parameters:

  • url

    Resource link

  • method (defaults to: :get)

    Request method

  • headers (defaults to: {})

    Request header

  • params (defaults to: {})

    Request additional params



186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
# File 'lib/tiny_grabber/agent.rb', line 186

# Performs a GET or POST request and returns the response.
#
# Applies the configured user agent, basic auth, headers and cookies to the
# request, sends it, stores the response headers and cookies after a 2xx
# answer and, when @follow_location is set, follows meta-refresh tags and
# Location redirects with a new GET request.
#
# NOTE(review): #set_user_agent writes into @headers before a non-empty
# +headers+ argument replaces @headers, so the configured user agent appears
# to be dropped for calls that pass custom headers - confirm the intent.
# NOTE(review): redirects are followed without a depth limit.
#
# @param url [String] resource link
# @param method [Symbol] request method, :get or :post
# @param headers [Hash] request headers; replace the stored headers when not empty
# @param params [Hash] form data sent with a POST request
# @return the response of the last request that was sent
# @raise [ArgumentError] if +method+ is neither :get nor :post
def fetch(url, method = :get, headers = {}, params = {})
  if @debug.active
    @debug.save '=============================='
    @debug.save "#{method.upcase} #{url}"
    @debug.save "-> [proxy] = #{@proxy}" if @proxy
    @debug.save "-> [params] = #{params}"
    @debug.save '------------------------------'
  end
  convert_to_uri url
  case method
  when :get
    @request = Net::HTTP::Get.new(@uri.request_uri)
  when :post
    @request = Net::HTTP::Post.new(@uri.request_uri)
    @request.set_form_data(params)
  else
    # Without this branch the @request left over from an earlier call (or nil)
    # would be sent to the new URI.
    raise ArgumentError, "unsupported request method: #{method.inspect}"
  end
  set_user_agent if @user_agent
  set_basic_auth unless @basic_auth.empty?
  @headers = headers unless headers.empty?
  set_headers unless @headers.empty?
  set_cookies if @cookies
  @response = send_request
  case @response
  # HTTP response code 1xx
  when Net::HTTPInformation
    @debug.save '<- [response] = Net::HTTPInformation' if @debug.active
  # HTTP response code 2xx
  when Net::HTTPSuccess
    save_headers if @response.header
    save_cookies if @response.cookies
    @debug.save "<- [response] = #{@response.code} Net::HTTPSuccess" if @debug.active
    # Follow meta refresh
    if @follow_location
      refresh = @response.ng.at_css('meta[http-equiv="refresh"]')
      @response = fetch refresh.attr('content').gsub(/\A.*?(http)/, 'http') if refresh
    end
  # HTTP response code 3xx
  when Net::HTTPRedirection
    # Both messages are debug output, so both stay behind the active check.
    if @debug.active
      @debug.save "<- [response] = #{@response.code} Net::HTTPRedirection"
      @debug.save 'try curl user_agent: tg.user_agent=\'curl\''
    end
    # Follow location; a redirect without a Location header is returned as is.
    if @follow_location && (location = @response.header['Location'])
      # Location may be relative, so resolve it against the current URI.
      # Values the URI parser rejects are passed on unchanged.
      target = begin
        URI.join(@uri.to_s, location).to_s
      rescue URI::Error
        location
      end
      @response = fetch target
    end
  # HTTP response code 4xx
  when Net::HTTPClientError
    @debug.save "<- [response] = #{@response.code} Net::HTTPClientError" if @debug.active
  # HTTP response code 5xx
  when Net::HTTPServerError
    @debug.save "<- [response] = #{@response.code} Net::HTTPServerError" if @debug.active
  end
  @debug.save_to_file @response.body if @debug.save_html
  @response
end

#resetObject

Clears headers and cookies



313
314
315
316
# File 'lib/tiny_grabber/agent.rb', line 313

# Clears the stored request state: @headers becomes an empty Hash and
# @cookies becomes nil. Other agent settings are left untouched.
def reset
  @headers = {}
  @cookies = nil
end

#save_cookiesObject

Save response cookies in agent attribute



306
307
308
309
# File 'lib/tiny_grabber/agent.rb', line 306

# Copies the cookies of the current response into @cookies and writes them
# to the debug log when debugging is active.
def save_cookies
  @cookies = @response.cookies
  return unless @debug.active

  @debug.save "<- [cookies] = #{@cookies}"
end

#save_headersObject

Save response headers in agent attribute



297
298
299
300
301
302
# File 'lib/tiny_grabber/agent.rb', line 297

# Keeps the headers of the current response in @headers for later requests
# and writes them to the debug log when debugging is active.
def save_headers
  response_headers = @response.headers
  @headers = response_headers
  # transfer-encoding must not be carried over into a chain of requests
  response_headers.delete('transfer-encoding')
  return unless @debug.active

  @debug.save "<- [headers] = #{@headers}"
end

#send_requestObject

Sends the request and returns the response. Uses an SSL connection for the HTTPS scheme.



288
289
290
291
292
293
# File 'lib/tiny_grabber/agent.rb', line 288

# Opens a connection to the host and port of @uri, sends @request over it and
# returns the result. SSL is switched on when the URI scheme is 'https'.
def send_request
  connection_options = {
    use_ssl: @uri.scheme == 'https',
    verify_mode: @verify_mode,
    read_timeout: @read_timeout
  }
  @http.start(@uri.host, @uri.port, **connection_options) do |connection|
    @debug.save "-> [read_timeout] = #{@read_timeout}" if @debug.active
    connection.request(@request)
  end
end

#set_basic_authObject

Set BASIC_AUTH request authentication



258
259
260
261
# File 'lib/tiny_grabber/agent.rb', line 258

# Applies the :username and :password entries of @basic_auth to the current
# request and writes the settings to the debug log when debugging is active.
def set_basic_auth
  username = @basic_auth[:username]
  password = @basic_auth[:password]
  @request.basic_auth(username, password)
  return unless @debug.active

  @debug.save "-> [basic_auth] = #{@basic_auth}"
end

#set_cookiesObject

Set request COOKIES



280
281
282
283
# File 'lib/tiny_grabber/agent.rb', line 280

# Sends the stored cookies with the current request through its 'Cookie'
# header and writes them to the debug log when debugging is active.
def set_cookies
  @request['Cookie'] = @cookies
  return unless @debug.active

  @debug.save "-> [cookies] = #{@cookies}"
end

#set_headersObject

Set request HEADERS



265
266
267
268
269
270
271
272
273
274
275
276
# File 'lib/tiny_grabber/agent.rb', line 265

# Copies the stored headers onto the current request.
#
# Header names are converted with String(). A header that is already present
# on the request is replaced instead of appended: Net::HTTP requests start
# out with defaults such as Accept and User-Agent, and appending to those
# would send combined values like "Ruby, <configured agent>". Headers that
# are not yet present are added with add_field.
def set_headers
  @headers.each do |name, value|
    name = String(name)
    # key? compares names case-insensitively, so 'accept' also replaces the
    # default 'Accept' entry.
    if @request.key?(name)
      @request[name] = value
    else
      @request.add_field(name, value)
    end
  end
  @debug.save "-> [headers] = #{@headers}" if @debug.active
end

#set_user_agentObject

Set USER_AGENT request attribute



251
252
253
254
# File 'lib/tiny_grabber/agent.rb', line 251

# Puts the configured user agent into the stored headers under the
# 'User-Agent' key and writes it to the debug log when debugging is active.
def set_user_agent
  @headers['User-Agent'] = @user_agent
  return unless @debug.active

  @debug.save "-> [user_agent] = #{@user_agent}"
end

#var_to_sym(var, str_to_sym = false) ⇒ Object

Recursively converts hash keys (and, optionally, string values) to symbols

Parameters:

  • var

    Variable need to convert



322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
# File 'lib/tiny_grabber/agent.rb', line 322

# Recursively converts the keys of nested Hashes to symbols.
#
# Hashes and Arrays are walked recursively and returned as new objects.
# Strings are converted to symbols only when +str_to_sym+ is true. Any other
# value is returned unchanged.
#
# @param var variable to convert
# @param str_to_sym [Boolean] also convert String values to symbols
# @return the converted copy of +var+
def var_to_sym(var, str_to_sym = false)
  case var
  when Hash
    var.each_with_object({}) do |(key, value), converted|
      # Keys that cannot be symbolized (e.g. Integers) are kept as they are
      # instead of raising NoMethodError.
      key = key.to_sym if key.respond_to?(:to_sym)
      converted[key] = var_to_sym(value, str_to_sym)
    end
  when Array
    var.map { |item| var_to_sym(item, str_to_sym) }
  when String
    str_to_sym ? var.to_sym : var
  else
    var
  end
end