Class: RequestManager

Inherits:
Object
  • Object
show all
Defined in:
lib/requestmanager.rb

Instance Method Summary collapse

Constructor Details

#initialize(proxy_list, request_interval) ⇒ RequestManager

Returns a new instance of RequestManager.



7
8
9
10
11
# File 'lib/requestmanager.rb', line 7

# Set up a manager with an optional proxy pool and a throttle interval.
#
# @param proxy_list path passed straight to parse_proxy_list; a falsy value
#   leaves @proxy_list nil
# @param request_interval indexed as [0] and [1] by other methods of this
#   class (presumably a [min, max] pair in seconds -- confirm with callers)
def initialize(proxy_list, request_interval)
  @proxy_list = parse_proxy_list(proxy_list)
  @request_interval = request_interval
  # Per-proxy usage records; starts empty and is filled in as proxies are picked.
  @used_proxies = {}
end

Instance Method Details

#gen_driver(chosen_proxy) ⇒ Object

Generate driver for searches



35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# File 'lib/requestmanager.rb', line 35

# Build a Firefox WebDriver whose accepted language is English.
#
# With a proxy, HTTP and SSL traffic are both routed through it. Without
# one, the call instead pauses for a random duration drawn from
# @request_interval[0]..@request_interval[1] before creating the driver.
#
# @param chosen_proxy proxy address to use, or a falsy value for no proxy
#   (get_random_proxy in this class produces "host:port" strings)
# @return whatever Selenium::WebDriver.for returns for :firefox
def gen_driver(chosen_proxy)
  firefox_profile = Selenium::WebDriver::Firefox::Profile.new
  firefox_profile['intl.accept_languages'] = 'en'

  if chosen_proxy
    firefox_profile.proxy =
      Selenium::WebDriver::Proxy.new(http: chosen_proxy, ssl: chosen_proxy)
  else
    # No proxy to hide behind, so throttle by waiting instead.
    pause = rand(@request_interval[0]..@request_interval[1])
    sleep(pause)
  end

  Selenium::WebDriver.for(:firefox, profile: firefox_profile)
end

#get_page(url, form_input = nil) ⇒ Object

Get the page requested



14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# File 'lib/requestmanager.rb', line 14

# Fetch a page through a fresh browser session and return its HTML.
#
# A proxy is picked with get_random_proxy when a proxy list is loaded;
# otherwise gen_driver is called with nil. The driver is always shut down,
# even when navigation or form handling raises, so a failed request does not
# leave a browser running.
#
# @param url [String] address to navigate to
# @param form_input optional text typed into the element named "q" and
#   submitted after the page loads; skipped when falsy
# @return the driver's page source, read after a fixed 7 second wait
def get_page(url, form_input = nil)
  chosen_proxy = @proxy_list ? get_random_proxy(url) : nil
  driver = gen_driver(chosen_proxy)

  begin
    driver.navigate.to url
    puts "Getting page #{url}"

    # Handle form input if there is any
    if form_input
      element = driver.find_element(name: "q")
      element.send_keys form_input
      element.submit
    end

    # Give the page time to finish loading before grabbing the source
    sleep(7)
    driver.page_source
  ensure
    # Runs on both success and failure so the browser is never leaked
    driver.quit
  end
end

#get_random_proxy(url) ⇒ Object

Choose a random proxy that hasn’t been used recently



52
53
54
55
56
57
58
59
60
61
62
63
64
# File 'lib/requestmanager.rb', line 52

# Pick a random proxy that is currently allowed for the host of +url+.
#
# Candidates are drawn at random and checked with is_not_used?; while the
# drawn proxy is still blocked the method polls every 5 ms. Polling is done
# in a loop rather than by recursion, so a long wait cannot exhaust the call
# stack. The accepted proxy is recorded in @used_proxies together with the
# current time and the URL's host.
#
# @param url [String] target URL; only its host is used
# @return [String] the chosen proxy as "host:port" (first two fields of the entry)
# @raise [ArgumentError] if the proxy list is empty, since no proxy could
#   ever be chosen
def get_random_proxy(url)
  raise ArgumentError, "proxy list is empty" if @proxy_list.empty?

  loop do
    chosen = @proxy_list.sample

    # Only use proxy if it hasn't been used recently on the same host
    if is_not_used?(chosen, url)
      @used_proxies[chosen] = [Time.now, URI.parse(url).host]
      return chosen[0] + ":" + chosen[1]
    end

    sleep(0.005)
  end
end

#is_not_used?(chosen, url) ⇒ Boolean

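Checks whether a proxy may be used for the URL's host: true if the proxy has never been used, was last used at least @request_interval[0] seconds ago, or was last used on a different host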

Returns:

  • (Boolean)


67
68
69
70
71
# File 'lib/requestmanager.rb', line 67

# Decide whether +chosen+ may be used for the host of +url+ right now.
#
# A proxy is free when it has no usage record, when its last use is at
# least @request_interval[0] seconds old, or when its last use was on a
# different host than the one in +url+.
#
# @param chosen key into @used_proxies (the proxy entry)
# @param url [String] target URL; parsed only if the earlier checks fail
# @return [Boolean]
def is_not_used?(chosen, url)
  last_use = @used_proxies[chosen]
  return true unless last_use

  used_at, used_host = last_use
  return true if used_at <= Time.now - @request_interval[0]

  used_host != URI.parse(url).host
end

#parse_proxy_list(proxy_file) ⇒ Object

Parse the proxy list



74
75
76
77
78
# File 'lib/requestmanager.rb', line 74

# Read a proxy file into a list of colon-separated field arrays.
#
# Each non-blank line is stripped of surrounding whitespace and split on
# ":" (so "host:port" becomes ["host", "port"]). Blank and whitespace-only
# lines are skipped; otherwise they would become empty entries that cannot
# be turned into a "host:port" string later. File.foreach is used so the
# argument is always treated as a file path.
#
# @param proxy_file [String, nil] path to the proxy file
# @return [Array<Array<String>>, nil] parsed entries, or nil when no file
#   was given
def parse_proxy_list(proxy_file)
  return unless proxy_file

  File.foreach(proxy_file)
      .map(&:strip)
      .reject(&:empty?)
      .map { |line| line.split(":") }
end