Class: HttpProxyPool::ProxyPool

Inherits:
Object
  • Object
show all
Defined in:
lib/http_proxy_pool/proxy_pool.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(args = {}) ⇒ ProxyPool

Returns a new instance of ProxyPool.



7
8
9
10
11
12
13
14
15
16
17
# File 'lib/http_proxy_pool/proxy_pool.rb', line 7

# Builds a proxy pool and loads the saved proxy list when the data file exists.
#
# @param args [Hash] optional settings
# @option args [String] :data_path file the proxy list is read from;
#   defaults to 'ips.yaml' under HttpProxyPool.home
# @option args [Array<String>] :script crawl script files;
#   defaults to every '*.site' file under "#{HttpProxyPool.home}/script"
# @option args [Object] :logger logger to write to; defaults to HttpProxyPool.logger
def initialize(args = {})
  @data_path  = args[:data_path] || File.join(HttpProxyPool.home, 'ips.yaml')
  @script     = args[:script]    || Dir["#{HttpProxyPool.home}/script/*.site"]
  @logger     = args[:logger]    || HttpProxyPool.logger
  @proxys     = []

  @agent      = Mechanize.new
  @agent.user_agent_alias = get_agent_alias

  # File.exists? was deprecated and later removed (Ruby 3.2); File.exist?
  # is available on every Ruby version.
  load_proxy if File.exist?(@data_path)
end

Instance Attribute Details

#logger ⇒ Object

Returns the value of attribute logger.



5
6
7
# File 'lib/http_proxy_pool/proxy_pool.rb', line 5

# Reader for the pool's logger.
#
# @return [Object] whatever is currently stored in @logger
def logger
  @logger
end

#proxys ⇒ Object

Returns the value of attribute proxys.



5
6
7
# File 'lib/http_proxy_pool/proxy_pool.rb', line 5

# Reader for the pool's proxy list.
#
# @return [Object] whatever is currently stored in @proxys
def proxys
  @proxys
end

Instance Method Details

#build_query_parameter(prefix = 'proxy', args) ⇒ Object



40
41
42
43
44
45
46
47
48
49
50
51
52
# File 'lib/http_proxy_pool/proxy_pool.rb', line 40

# Builds the boolean expression string used to filter proxies.
#
# Every key/expression pair that survives query_key_filter becomes a clause
# of the form "<prefix>.<key> <express>" (e.g. "proxy.port == 80"); clauses
# are joined with " && ".
#
# @param prefix [String] receiver name placed in front of every key
# @param args [Hash] attribute name => expression fragment
# @return [String] the combined condition, or '' when no clause remains
def build_query_parameter(prefix = 'proxy', args)
  clauses = query_key_filter(args).map do |key, express|
    "#{prefix}.#{key} #{express}"
  end

  # Joining puts the separator only between clauses. The previous approach
  # appended " && " after each clause and stripped it with a `$`-anchored
  # regex, which could remove a "&&" at the end of an earlier line when an
  # expression spanned several lines.
  clauses.join(' && ')
end

#checker(proxy) ⇒ Object



154
155
156
157
158
159
160
# File 'lib/http_proxy_pool/proxy_pool.rb', line 154

# Checks one proxy or a list of proxies.
#
# An Array is handed to checker_batch; anything else goes to checker_single.
#
# @param proxy [Object, Array] a single proxy or an array of proxies
# @return [Object] whatever the delegated checker returns
def checker(proxy)
  return checker_batch(proxy) if proxy.is_a?(Array)

  checker_single(proxy)
end

#checker_batch(proxys, task_count = 5) ⇒ Object



162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
# File 'lib/http_proxy_pool/proxy_pool.rb', line 162

# Checks a list of proxies concurrently, task_count proxies per thread.
#
# @param proxys [Array] proxies to check
# @param task_count [Integer] number of proxies handled by each thread
# @return [Array] the proxies for which checker_single returned a truthy
#   value, in the same order as the input
def checker_batch(proxys, task_count = 5)
  # Snapshot the chunks before any thread starts: checker_single may delete
  # entries from @proxys, which can be the very array being checked.
  # each_slice also yields non-overlapping chunks; the previous inclusive
  # range (start..end) checked each boundary element twice.
  chunks = proxys.each_slice(task_count).to_a

  # Start every worker first and collect afterwards. Joining inside the
  # spawning loop made the checks run one thread at a time.
  workers = chunks.map do |chunk|
    Thread.new do
      chunk.map { |proxy| checker_single(proxy) }.select { |alive| alive }
    end
  end

  # Thread#value joins the thread and returns its block result, so the
  # output keeps input order without shared state or a mutex.
  workers.flat_map(&:value)
end

#checker_single(proxy, timeout = 0.05) ⇒ Object



186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
# File 'lib/http_proxy_pool/proxy_pool.rb', line 186

# Checks a single proxy by requesting 'http://baidu.com/' through it.
#
# A response whose status code starts with 1-4 counts as success. If the
# request raises, the proxy is removed from @proxys and the failure is
# logged. Any other response (e.g. 5xx) returns false without removing it.
#
# @param proxy [Object] object responding to #ip and #port
# @param timeout [Numeric] open timeout in seconds; the read timeout is
#   ten times this value
# @return [Object, false] the proxy itself when usable, otherwise false
def checker_single(proxy, timeout = 0.05)
  http = Net::HTTP.new('baidu.com', 80, proxy.ip, proxy.port)
  http.open_timeout = timeout
  http.read_timeout = timeout * 10

  begin
    # \A[1-4] anchors at the start of the string. The previous class
    # [1|2|3|4] also accepted a literal "|" as the first character.
    return proxy if http.get('/').code =~ /\A[1-4]/
  rescue => e
    @logger.info("can not connect proxy.[#{proxy}].#{e}")
    @proxys.delete(proxy)
    @logger.info("delete disabled proxy [#{proxy}].")
  end

  false
end

#crawling(lastest = true, check = false) ⇒ Object



91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# File 'lib/http_proxy_pool/proxy_pool.rb', line 91

# Runs every crawl script and adds the proxies it yields to the pool.
#
# Each script file is evaluated inside a fresh Basetask; the field sets the
# task yields from #ips are turned into Proxy objects. A proxy is skipped
# when checking is enabled and the check fails, or when a proxy with the
# same ip is already in the pool. The pool is saved after every script,
# including when the script raised (the error is logged, not re-raised).
#
# @param lastest [Boolean] passed through unchanged to Basetask#ips
# @param check [Boolean] when true, only proxies passing checker are added
# @return [Object] the result of iterating @script
def crawling(lastest = true, check = false)
  @script.each do |file|
    begin
      # `file` and `task` keep their names: the script text is evaluated
      # with these locals in scope.
      task = Basetask.new(agent: @agent, logger: @logger)
      task.instance_eval(read_taskfile(file))

      task.ips(lastest) do |fields|
        candidate = Proxy.new(fields)
        next if check && !checker(candidate)

        @proxys.push(candidate) unless include?(candidate)
      end
    rescue => e
      @logger.error(e)
    ensure
      save_proxy
    end
  end
end

#get_agent_alias ⇒ Object



135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
# File 'lib/http_proxy_pool/proxy_pool.rb', line 135

# Picks one user-agent alias at random from a fixed list.
#
# @return [String] one of the alias names listed below
def get_agent_alias
  aliases = [
    'Linux Firefox', 'Linux Mozilla',
    'Mac Firefox', 'Mac Mozilla', 'Mac Safari',
    'Windows Chrome',
    'Windows IE 7', 'Windows IE 8', 'Windows IE 9',
    'Windows Mozilla',
    'iPhone', 'iPad', 'Android'
  ]

  pick = rand(aliases.size)
  aliases[pick]
end

#get_random_proxy(check = true, thread_num = 10) ⇒ Object



59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# File 'lib/http_proxy_pool/proxy_pool.rb', line 59

# Returns a random proxy from the pool, optionally verified by checker.
#
# thread_num threads repeatedly pick a random entry until one of them finds
# a usable Proxy. The pool is saved before returning, because failed checks
# may have removed entries.
#
# @param check [Boolean] when true, a candidate must pass checker
# @param thread_num [Integer] number of threads searching in parallel
# @return [Proxy, nil] a usable proxy, or nil when none could be found
def get_random_proxy(check = true, thread_num = 10)
  mutex  = Mutex.new
  result = nil

  begin
    workers = thread_num.times.map do
      Thread.new do
        until result
          proxy = @proxys.sample
          # An empty pool (from the start, or drained by failed checks)
          # yields nil. Stop here: the loop would otherwise spin forever
          # when check is false, or crash on nil when check is true.
          break if proxy.nil?

          @logger.info("using #{proxy}.")
          proxy = checker(proxy) if check

          mutex.synchronize { result = proxy } if proxy.is_a? Proxy
        end
      end
    end

    workers.each(&:join)
    @logger.error('find proxy error. no usable proxy in the pool.') unless result
  rescue => e
    @logger.error("find proxy error. #{e}")
  ensure
    save_proxy
  end

  result
end

#include?(proxy) ⇒ Boolean

Returns:

  • (Boolean)


110
111
112
# File 'lib/http_proxy_pool/proxy_pool.rb', line 110

# Tells whether the pool already holds a proxy with the same ip.
#
# Only the ip is compared; the port is ignored.
#
# @param proxy [Object] object responding to #ip
# @return [Boolean] true when some pooled proxy has an equal ip
def include?(proxy)
  # any? stops at the first match and builds no intermediate array.
  @proxys.any? { |known| known.ip == proxy.ip }
end

#load_proxy ⇒ Object



120
121
122
# File 'lib/http_proxy_pool/proxy_pool.rb', line 120

# Replaces the in-memory proxy list with the contents of the data file.
#
# NOTE(review): this deserializes arbitrary Ruby objects from @data_path,
# so the file must only ever be written by this program (see save_proxy).
# Do not point @data_path at untrusted input.
#
# @return [Object] the loaded list, or [] when the file is empty
def load_proxy
  # Psych 4 (Ruby 3.1+) made YAML.load_file a safe load that rejects
  # serialized custom objects. unsafe_load_file restores the old behaviour;
  # older Psych versions only have load_file.
  loaded = if YAML.respond_to?(:unsafe_load_file)
             YAML.unsafe_load_file(@data_path)
           else
             YAML.load_file(@data_path)
           end

  # An empty file parses to false; keep @proxys an array in that case.
  @proxys = loaded || []
end

#query(args = {}) ⇒ Object

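Query interface: selects the proxies in the pool that satisfy the given conditions, returning them or yielding each one to the block.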



24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# File 'lib/http_proxy_pool/proxy_pool.rb', line 24

# Selects the proxies that satisfy the given conditions.
#
# The conditions are turned into a Ruby expression by build_query_parameter
# and evaluated once per proxy.
#
# NOTE(review): the expression fragments in args are evaluated as Ruby code.
# Never pass values that come from untrusted input.
#
# @param args [Hash] attribute name => expression fragment (e.g. port: '== 80')
# @yield [proxy] each selected proxy, when a block is given
# @return [Array] the selected proxies; all proxies when no condition remains
# @raise [QueryError] when the condition cannot be built or evaluated
def query(args = {})
  begin
    # Build the condition once instead of once per proxy.
    condition = build_query_parameter('proxy', args)

    selected_proxy = if condition.empty?
                       # No condition means no restriction. Evaluating ''
                       # yields nil, which used to select nothing.
                       @proxys.dup
                     else
                       # The block parameter must be named `proxy`: the
                       # condition refers to it by that name.
                       @proxys.select { |proxy| instance_eval(condition) }
                     end
  rescue StandardError, SyntaxError
    # SyntaxError is a ScriptError, not a StandardError, so a bare rescue
    # let malformed expressions escape.
    raise QueryError, "query parameter error!"
  end

  return selected_proxy unless block_given?

  selected_proxy.each do |proxy|
    yield proxy
  end
end

#query_key_filter(args) ⇒ Object



54
55
56
57
# File 'lib/http_proxy_pool/proxy_pool.rb', line 54

# Keeps only the entries whose key names a method a Proxy responds to.
#
# @param args [Hash] attribute name => expression fragment
# @return [Hash] the entries of args whose key a fresh Proxy responds to
def query_key_filter(args)
  template = Proxy.new
  args.select { |name| template.respond_to?(name) }
end

#read_taskfile(file) ⇒ Object



124
125
126
127
128
129
130
131
132
133
# File 'lib/http_proxy_pool/proxy_pool.rb', line 124

# Reads a task script file and returns its whole content.
#
# @param file [String] path of the file to read
# @return [String] the complete text of the file ('' for an empty file)
def read_taskfile(file)
  File.read(file)
end

#save_proxy ⇒ Object



114
115
116
117
118
# File 'lib/http_proxy_pool/proxy_pool.rb', line 114

# Writes the current proxy list to the data file as YAML, replacing any
# previous content.
#
# @return [nil]
def save_proxy
  # The block form closes the file even when YAML.dump raises; the previous
  # open/dump/close sequence leaked the handle in that case.
  File.open(@data_path, 'w') { |file| YAML.dump(@proxys, file) }
  nil
end

#status ⇒ Object



19
20
21
# File 'lib/http_proxy_pool/proxy_pool.rb', line 19

# Prints the number of proxies currently in the pool to standard output.
#
# @return [nil]
def status
  total = @proxys.size
  $stdout.puts "proxy count : #{total}"
end