Class: TinyGrabber::Agent

Inherits:
Object
  • Object
show all
Defined in:
lib/tiny_grabber/agent.rb

Constant Summary collapse

AGENT_ALIASES =

User-agent strings taken from www.useragentstring.com/pages/Chrome/

# Pool of desktop browser User-Agent strings, grouped by browser family.
# The array is frozen so the shared list cannot be modified at runtime.
[
  # Chrome 41 (Windows, macOS, Linux)
  'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36',
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36',
  'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36',
  # Firefox 31 to 40 (Windows, macOS, Linux)
  'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1',
  'Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0',
  'Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0',
  # Internet Explorer 10 and 11
  'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko',
  'Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0',
  'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 7.0; InfoPath.3; .NET CLR 3.1.40767; Trident/6.0; en-IN)',
  'Mozilla/5.0 (compatible; MSIE 10.0; Macintosh; Intel Mac OS X 10_7_3; Trident/6.0)',
  # Opera 11 and 12
  'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
  'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14',
  'Mozilla/5.0 (Windows NT 6.0; rv:2.0) Gecko/20100101 Firefox/4.0 Opera 12.14',
  'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52'
].freeze

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Agent

Initializes the agent object.



54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# File 'lib/tiny_grabber/agent.rb', line 54

# Builds an agent with its default request settings.
#
# A User-Agent string is drawn at random from AGENT_ALIASES; the remaining
# attributes start empty or disabled.
def initialize
  @debug = Debug.new

  # Default request attributes.
  # Array#sample picks one entry uniformly. The previous
  # `AGENT_ALIASES[rand(count) - 1]` reached every entry only because
  # index -1 wraps around to the last element.
  @user_agent = AGENT_ALIASES.sample
  @proxy = []
  @basic_auth = {}
  @headers = {}
  @cookies = nil
  @follow_location = false
  @read_timeout = 10
  # URI object of the current request, set by convert_to_uri
  @uri = nil
  # Class used to open connections (send_request calls @http.start)
  @http = Net::HTTP
  # Last response returned by send_request
  @response = nil
  # NOTE(review): TLS certificate verification is disabled by default;
  # callers who need it must assign verify_mode explicitly.
  @verify_mode = OpenSSL::SSL::VERIFY_NONE
end

Instance Attribute Details

#basic_auth=(basic_auth) ⇒ Object (writeonly)

Set BASIC_AUTH agent attribute

Parameters:

  • basic_auth

    Authentication configuration



16
17
18
# File 'lib/tiny_grabber/agent.rb', line 16

# Replaces the basic-auth settings stored in @basic_auth.
#
# fetch applies them only when the value is not empty, and set_basic_auth
# reads its :username and :password entries.
#
# @param value authentication configuration (initialised to an empty Hash)
def basic_auth=(value)
  @basic_auth = value
end

#cookies ⇒ Object

Cookies



20
21
22
# File 'lib/tiny_grabber/agent.rb', line 20

# Returns the cookies currently held by the agent.
#
# The value is nil until save_cookies stores cookies from a response, and
# becomes nil again after reset.
def cookies
  @cookies
end

#debug=(debug) ⇒ Object (writeonly)

Set debug configuration

Parameters:

  • debug


8
9
10
# File 'lib/tiny_grabber/agent.rb', line 8

# Replaces the debug helper stored in @debug.
#
# The agent's methods call +active+, +save+, +save_html+ and +save_to_file+
# on this object, so a replacement must respond to those.
#
# @param value debug configuration object
def debug=(value)
  @debug = value
end

#follow_location=(follow_location) ⇒ Object (writeonly)

Sets whether redirects are followed

Parameters:

  • follow_location

    Follow location flag



24
25
26
# File 'lib/tiny_grabber/agent.rb', line 24

# Sets the flag that fetch checks before following a 3xx Location header
# or an HTML meta refresh.
#
# @param value truthy to follow redirects (initialised to false)
def follow_location=(value)
  @follow_location = value
end

#headers ⇒ Object

Headers



18
19
20
# File 'lib/tiny_grabber/agent.rb', line 18

# Returns the headers currently held by the agent.
#
# This is the value set_headers copies onto the next request; save_headers
# overwrites it with the headers of the latest response.
def headers
  @headers
end

#proxy ⇒ Object

Remote proxy configuration



14
15
16
# File 'lib/tiny_grabber/agent.rb', line 14

# Returns the remote proxy configuration stored in @proxy
# (initialised to an empty Array).
def proxy
  @proxy
end

#read_timeout=(read_timeout) ⇒ Object (writeonly)

Set READ_TIMEOUT agent attribute

Parameters:

  • read_timeout

    Time to wait when reading the response



10
11
12
# File 'lib/tiny_grabber/agent.rb', line 10

# Sets the read timeout that send_request forwards as the +read_timeout+
# option when opening the connection (initialised to 10).
#
# @param value waiting time for reading
def read_timeout=(value)
  @read_timeout = value
end

#uri ⇒ Object

URI



26
27
28
# File 'lib/tiny_grabber/agent.rb', line 26

# Returns the URI object built by convert_to_uri for the most recent
# request, or nil before the first request.
def uri
  @uri
end

#user_agent=(user_agent) ⇒ Object (writeonly)

Set USER_AGENT agent attribute

Parameters:

  • user_agent

    Web browser name



12
13
14
# File 'lib/tiny_grabber/agent.rb', line 12

# Overrides the User-Agent string chosen at construction time.
#
# set_user_agent copies this value into the 'User-Agent' header; fetch
# skips that step when the value is nil or false.
#
# @param value web browser identification string
def user_agent=(value)
  @user_agent = value
end

#verify_mode=(value) ⇒ Object (writeonly)

Set verify mode



22
23
24
# File 'lib/tiny_grabber/agent.rb', line 22

# Sets the value send_request forwards as the +verify_mode+ option when
# opening the connection (initialised to OpenSSL::SSL::VERIFY_NONE).
#
# @param value certificate verification mode
def verify_mode=(value)
  @verify_mode = value
end

Instance Method Details

#convert_to_uri(url) ⇒ Object

Initialize URI object from request url

Parameters:

  • url

    Request link



251
252
253
254
255
256
257
# File 'lib/tiny_grabber/agent.rb', line 251

# Parses +url+ into a URI object and stores it in @uri.
#
# The fragment ("#anchor") is dropped first. The remainder is unescaped and
# then escaped again, so input that is already percent-encoded is not encoded
# twice while raw characters such as spaces become valid escapes.
#
# URI.escape and URI.unescape were removed in Ruby 3.0; the RFC 2396 parser
# methods they delegated to are called directly instead.
#
# @param url [String] request link
# @return the result of the debug log call, or nil when debugging is inactive
def convert_to_uri(url)
  # Remove anchor
  url = url.gsub(/#.*\Z/, '')
  # Building the parser compiles several regexps, so keep one per agent
  @uri_parser ||= URI::RFC2396_Parser.new
  # Normalise escaping: decode whatever is encoded, then encode everything unsafe
  @uri = URI(@uri_parser.escape(@uri_parser.unescape(url)))
  @debug.save "-> [uri] = #{@uri}" if @debug.active
end

#fetch(url, method = :get, headers = {}, params = {}) ⇒ Object

Fetches a resource with the GET or POST HTTP method. Sets the USER_AGENT, BASIC_AUTH, HEADERS and COOKIES request attributes, sends the request, and saves the response COOKIES for subsequent requests.

Parameters:

  • url

    Resource link

  • method (defaults to: :get)

    Request method

  • headers (defaults to: {})

    Request header

  • params (defaults to: {})

    Request additional params



188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
# File 'lib/tiny_grabber/agent.rb', line 188

# Performs a GET or POST request and returns the response.
#
# Before sending, the request receives (in this order) the User-Agent when
# @user_agent is set, basic auth when configured, the headers and the cookies.
# The response is then dispatched on its class: for 2xx, and for 3xx that is
# not followed, the response headers and cookies are saved on the agent.
#
# When follow_location is enabled, a 3xx response carrying a Location header
# is followed with a new GET, and so is an HTML meta refresh found in a 2xx
# body.
#
# NOTE(review): set_user_agent writes into @headers before a non-empty
# +headers+ argument replaces @headers, so the configured User-Agent appears
# to be dropped whenever custom headers are passed. Confirm intent.
# NOTE(review): redirects are followed recursively without an upper bound.
#
# @param url [String] resource link
# @param method [Symbol] request method, :get or :post
# @param headers [Hash] request headers; when non-empty they replace @headers
# @param params [Hash] form data, used only for :post
# @return the response object, with its +uri+ set to @uri
# @raise [ArgumentError] if +method+ is neither :get nor :post
def fetch(url, method = :get, headers = {}, params = {})
  if @debug.active
    @debug.save '=============================='
    @debug.save "#{method.upcase} #{url}"
    @debug.save "-> [proxy] = #{@proxy}" if @proxy
    @debug.save "-> [params] = #{params}"
    @debug.save '------------------------------'
  end
  convert_to_uri url
  case method
  when :get
    @request = Net::HTTP::Get.new(@uri.request_uri)
  when :post
    @request = Net::HTTP::Post.new(@uri.request_uri)
    @request.set_form_data(params)
  else
    # Without this branch @request would keep the previous request object
    # (or nil) and be sent to the new host.
    raise ArgumentError, "unsupported HTTP method: #{method.inspect}"
  end
  set_user_agent if @user_agent
  set_basic_auth unless @basic_auth.empty?
  @headers = headers unless headers.empty?
  set_headers if @headers
  set_cookies if @cookies
  @response = send_request
  case @response
  # HTTP response code 1xx
  when Net::HTTPInformation
    @debug.save '<- [response] = Net::HTTPInformation' if @debug.active
  # HTTP response code 2xx
  when Net::HTTPSuccess
    save_headers
    save_cookies
    @debug.save "<- [response] = #{@response.code} Net::HTTPSuccess" if @debug.active
    # Follow meta refresh
    if @follow_location
      # `ng` is not a Net::HTTPResponse method; presumably a project extension
      # returning a parsed HTML document (TODO confirm)
      refresh = @response.ng.at_css('meta[http-equiv="refresh"]')
      # Drop everything in front of the first "http" in the content attribute
      @response = fetch refresh.attr('content').gsub(/\A.*?(http)/, 'http') if refresh
    end
  # HTTP response code 3xx
  when Net::HTTPRedirection
    @debug.save "<- [response] = #{@response.code} Net::HTTPRedirection" if @debug.active
    @debug.save 'try curl user_agent: tg.user_agent=\'curl\'' if @debug.active
    location = @response['Location']
    # Follow location. A 3xx without Location (e.g. 304) has nothing to
    # follow and is handled like an unfollowed redirect.
    if @follow_location && location
      # Location may be a relative reference; resolve it against the URI
      # that was just requested so the next request has a host.
      location = (@uri + location).to_s unless location =~ /\A[a-z][a-z\d+.\-]*:/i
      @response = fetch location
    else
      save_headers
      save_cookies
    end
  # HTTP response code 4xx
  when Net::HTTPClientError
    @debug.save "<- [response] = #{@response.code} Net::HTTPClientError" if @debug.active
  # HTTP response code 5xx
  when Net::HTTPServerError
    @debug.save "<- [response] = #{@response.code} Net::HTTPServerError" if @debug.active
  end
  @response.uri = @uri
  @debug.save_to_file @response.body if @debug.save_html
  @response
end

#reset ⇒ Object

Clears headers and cookies



329
330
331
332
# File 'lib/tiny_grabber/agent.rb', line 329

# Clears the state carried between requests: @headers becomes an empty Hash
# and @cookies becomes nil. Other settings are left untouched.
#
# @return [nil]
def reset
  @headers = {}
  @cookies = nil
end

#save_cookies ⇒ Object

Save response cookies in agent attribute



317
318
319
320
321
322
323
324
325
# File 'lib/tiny_grabber/agent.rb', line 317

# Remembers the cookies of the latest response in @cookies.
#
# The response's own +cookies+ method is preferred when it has one; otherwise
# the raw 'Set-Cookie' header is used. @cookies is left unchanged when the
# response carries no cookies.
#
# @return the stored cookies, or nil when nothing was stored
def save_cookies
  received =
    if @response.respond_to?(:cookies)
      @response.cookies
    else
      @response['Set-Cookie']
    end
  @cookies = received if received
end

#save_headers ⇒ Object

Save response headers in agent attribute



307
308
309
310
311
312
313
# File 'lib/tiny_grabber/agent.rb', line 307

# Remembers the headers of the latest response in @headers.
#
# Nothing happens when the response exposes no header object.
#
# @return the result of the debug log call, or nil when debugging is inactive
def save_headers
  received = @response.header
  if received
    @headers = received
    # Drop TRANSFER_ENCODING so it is not replayed on the next request in a chain
    received.delete('transfer-encoding')
    @debug.save "<- [headers] = #{@headers}" if @debug.active
  end
end

#send_request ⇒ Object

Sends the request and returns the response. Uses an SSL connection when the link scheme is HTTPS.



298
299
300
301
302
303
# File 'lib/tiny_grabber/agent.rb', line 298

# Opens a connection to the host and port of @uri and sends @request.
#
# SSL is enabled when the URI scheme is 'https'; the configured verify mode
# and read timeout are passed along as connection options.
#
# @return the value of the block, i.e. whatever +request+ returns
def send_request
  connection_options = {
    use_ssl: @uri.scheme == 'https',
    verify_mode: @verify_mode,
    read_timeout: @read_timeout
  }
  @http.start(@uri.host, @uri.port, **connection_options) do |connection|
    @debug.save "-> [read_timeout] = #{@read_timeout}" if @debug.active
    connection.request(@request)
  end
end

#set_basic_auth ⇒ Object

Set BASIC_AUTH request authentication



268
269
270
271
# File 'lib/tiny_grabber/agent.rb', line 268

# Applies HTTP basic authentication to the pending request, using the
# :username and :password entries of @basic_auth.
#
# NOTE(review): when debugging is active the whole @basic_auth value,
# including the password, is written to the debug log.
#
# @return the result of the debug log call, or nil when debugging is inactive
def set_basic_auth
  username = @basic_auth[:username]
  password = @basic_auth[:password]
  @request.basic_auth(username, password)
  return unless @debug.active

  @debug.save "-> [basic_auth] = #{@basic_auth}"
end

#set_cookies ⇒ Object

Set request COOKIES



290
291
292
293
# File 'lib/tiny_grabber/agent.rb', line 290

# Copies the agent's stored cookies into the 'Cookie' header of the pending
# request.
#
# @return the result of the debug log call, or nil when debugging is inactive
def set_cookies
  @request['Cookie'] = @cookies
  return unless @debug.active

  @debug.save "-> [cookies] = #{@cookies}"
end

#set_headers ⇒ Object

Set request HEADERS



275
276
277
278
279
280
281
282
283
284
285
286
# File 'lib/tiny_grabber/agent.rb', line 275

# Copies every entry of @headers onto the pending request.
#
# Net::HTTP pre-populates some single-valued headers on a new request
# (Accept: */*, User-Agent: Ruby, and Host when built from a URI). Those are
# replaced rather than appended to; previously only the exact key 'Accept'
# was replaced, so a custom User-Agent went out as "Ruby, <custom value>".
# Header names are matched case-insensitively. All other headers are appended
# with add_field.
#
# @return the result of the debug log call, or nil when debugging is inactive
def set_headers
  @headers.each do |k, v|
    k = String(k)
    case k.downcase
    when 'accept', 'user-agent', 'host'
      # Overwrite the default value Net::HTTP put there
      @request[k] = v
    else
      @request.add_field(k, v)
    end
  end
  @debug.save "-> [headers] = #{@headers}" if @debug.active
end

#set_user_agent ⇒ Object

Set USER_AGENT request attribute



261
262
263
264
# File 'lib/tiny_grabber/agent.rb', line 261

# Stores the configured User-Agent string under the 'User-Agent' key of
# @headers.
#
# @return the result of the debug log call, or nil when debugging is inactive
def set_user_agent
  @headers['User-Agent'] = @user_agent
  return unless @debug.active

  @debug.save "-> [user_agent] = #{@user_agent}"
end

#var_to_sym(var, str_to_sym = false) ⇒ Object

Converts a variable and its contents to symbols

Parameters:

  • var

    Variable to convert



338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
# File 'lib/tiny_grabber/agent.rb', line 338

# Recursively symbolizes a nested structure.
#
# Hash keys are always converted with +to_sym+; Hash values and Array elements
# are processed recursively. Strings are converted to symbols only when
# +str_to_sym+ is true. Any other object is returned as is. The input is not
# modified; new Hashes and Arrays are built.
#
# @param var the value to convert
# @param str_to_sym [Boolean] whether String values become Symbols too
# @return the converted copy
def var_to_sym(var, str_to_sym = false)
  case var
  when Hash
    var.each_with_object({}) do |(key, value), converted|
      converted[key.to_sym] = var_to_sym(value, str_to_sym)
    end
  when Array
    var.map { |element| var_to_sym(element, str_to_sym) }
  when String
    str_to_sym ? var.to_sym : var
  else
    var
  end
end