Class: HttpCrawler::Client

Inherits:
Object show all
Defined in:
lib/http_crawler/client.rb

Direct Known Subclasses

Proxy::Client, Web::Client

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(parameter = {}) ⇒ Client

初始化 Client;未传入 :url/:uri 参数时调用 init_uri(init_uri 如果未初始化@uri,则会报错)



34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# File 'lib/http_crawler/client.rb', line 34

def initialize(parameter = {})
  # Normalize option keys to symbols so string and symbol keys both work.
  parameter = parameter.symbolize_keys

  # Accept either :url or :uri as the target address (:url wins when both given).
  parameter[:uri_or_path] = parameter[:url] || parameter[:uri]

  if parameter[:uri_or_path]
    # A custom URI was supplied; refuse to clobber one that is already set.
    raise "Client uri为重复初始化" if uri
    update_uri(parameter[:uri_or_path])
  else
    # No URI supplied: fall back to the subclass-provided default (#init_uri).
    init_uri
  end

  # Default per-phase timeouts (connect/write/read).
  init_timeout

  # Prepare the SSL context (only meaningful once a URI is known).
  init_ssl unless uri.blank?

  # Subclass hook for extra client setup.
  init_client

  # Parameters sent to the proxy API; the key is derived from the class name
  # (e.g. "HttpCrawler__Web__Foo__Client").
  @proxy_params = {key: "#{self.class.to_s.gsub(":", "_")}"}
end

Instance Attribute Details

#all_timeoutObject

Returns the value of attribute all_timeout.



94
95
96
# File 'lib/http_crawler/client.rb', line 94

# Overall request deadline in seconds; nil (the #init_timeout default)
# means the per-phase connect/write/read timeouts are used instead (see #http).
def all_timeout
  @all_timeout
end

#connect_timeObject

Returns the value of attribute connect_time.



94
95
96
# File 'lib/http_crawler/client.rb', line 94

# Connection timeout in seconds (defaults to 5 via #init_timeout; used by #http).
def connect_time
  @connect_time
end

#cookies(parameter = {}) ⇒ Object

cookies相关方法



138
139
140
# File 'lib/http_crawler/client.rb', line 138

# Cookie jar attached to each request (reset to an empty Hash by #init_cookies;
# populated by #str_to_cookies).
def cookies
  @cookies
end

#error_urlsObject

Returns the value of attribute error_urls.



230
231
232
# File 'lib/http_crawler/client.rb', line 230

# Patterns identifying URLs known to be bad (appended to by #add_error_url).
# NOTE(review): no initializer for @error_urls is visible in this view —
# confirm it is set before #add_error_url is first called.
def error_urls
  @error_urls
end

#header(parameter = {}) ⇒ Object

头文件相关方法



114
115
116
# File 'lib/http_crawler/client.rb', line 114

# Request headers sent with each request (see #init_header for the defaults;
# attached in #http).
def header
  @header
end

#max_error_numObject

最大错误重试次数



63
64
65
# File 'lib/http_crawler/client.rb', line 63

# Maximum number of retries after request errors.
# NOTE(review): no writer or default value is visible in this view.
def max_error_num
  @max_error_num
end

#read_timeObject

Returns the value of attribute read_time.



94
95
96
# File 'lib/http_crawler/client.rb', line 94

# Read timeout in seconds (defaults to 5 via #init_timeout; used by #http).
def read_time
  @read_time
end

#responseObject

请求的响应



300
301
302
# File 'lib/http_crawler/client.rb', line 300

# The most recent HTTP response — presumably stored by the #request helper;
# the writer is not visible in this view.
def response
  @response
end

#uriObject (readonly)

Returns the value of attribute uri.



67
68
69
# File 'lib/http_crawler/client.rb', line 67

# The client's base URI (read-only; set via #update_uri or a subclass #init_uri).
def uri
  @uri
end

#write_timeObject

Returns the value of attribute write_time.



94
95
96
# File 'lib/http_crawler/client.rb', line 94

# Write timeout in seconds (defaults to 5 via #init_timeout; used by #http).
def write_time
  @write_time
end

Class Method Details

.for(web_name) ⇒ Object

接收格式web_name = “biquge_duquanben” 返回 HttpCrawler::Web::BiqugeDuquanben::Client 实例



12
13
14
# File 'lib/http_crawler/client.rb', line 12

# Resolves a snake_case site name to its client class and instantiates it,
# e.g. "biquge_duquanben" -> HttpCrawler::Web::BiqugeDuquanben::Client.new
def for(web_name)
  client_class = "HttpCrawler::Web::#{web_name.camelize}::Client".constantize
  client_class.new
end

.for_module(module_name, *args) ⇒ Object

接收格式module_name = “HttpCrawler::Web::BiqugeDuquanben” 返回 HttpCrawler::Web::BiqugeDuquanben::Client 实例



21
22
23
# File 'lib/http_crawler/client.rb', line 21

# Builds the Client nested under the given module namespace,
# e.g. "HttpCrawler::Web::BiqugeDuquanben" ->
# HttpCrawler::Web::BiqugeDuquanben::Client instance.
#
# Fix: *args was accepted but silently dropped (`.new()`); it is now
# forwarded to the constructor. Existing zero-argument calls behave
# exactly as before.
def for_module(module_name, *args)
  "#{module_name}::Client".constantize.new(*args)
end

.for_uri(path) ⇒ Object



25
26
27
# File 'lib/http_crawler/client.rb', line 25

# Convenience constructor: returns a client bound to the given URI or path.
def for_uri(path)
  new(uri: path)
end

Instance Method Details

#add_error_url(url_string) ⇒ Object

添加错误的url地址,表示这里面的url都是异常地址,存的是正则



237
238
239
# File 'lib/http_crawler/client.rb', line 237

# Registers a pattern (stored as a regexp source, per the class docs)
# marking a URL as a known-bad address.
def add_error_url(url_string)
  error_urls << url_string
end

#auto_proxy=(value) ⇒ Object

代理设置



162
163
164
165
166
# File 'lib/http_crawler/client.rb', line 162

# Enables/disables automatic proxy rotation. Turning it on (value == true)
# immediately fetches a proxy when none is configured yet.
def auto_proxy=(value)
  Rails.logger.debug "自动更新代理"
  @auto_proxy = value
  return unless value == true
  update_proxy if @proxy.blank?
end

#get(path, params = {}, limit = 3) ⇒ Object

发送 get 请求



275
276
277
278
279
280
281
282
283
284
285
# File 'lib/http_crawler/client.rb', line 275

# Sends a GET request to uri + path.
#
# path   - path/fragment appended to the client's base uri
# params - query parameters hash
# limit  - remaining number of <meta http-equiv='Refresh'> redirects to follow
#
# Raises when the client has no uri. When the response contains a meta-refresh
# tag, the target URL is extracted from its HTML and followed recursively with
# limit - 1; once limit is below zero the response is returned as-is (note the
# request for the current level is still performed before the limit check).
def get(path, params = {}, limit = 3)
  raise "Client uri为空" unless self.uri
  request do
    r = http.get((self.uri + path).to_s, :params => params, :ssl_context => @ctx)
    return r if limit < 0
    # jagger_blank: project-specific helper — presumably yields only when the
    # xpath lookup found a node (TODO confirm its semantics; it must tolerate nil).
    r.html.at_xpath("//meta[@http-equiv='Refresh']").jagger_blank do |objc|
      r = self.get(objc.to_html[/(?:URL|url)="?(.*)[^";>]/, 1], params, limit - 1)
    end
    r
  end
end

#get_proxyObject

获取proxy 通过调用 api 获取代理或者通过自定义设置代理



202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
# File 'lib/http_crawler/client.rb', line 202

# Fetches a fresh proxy from the proxy API (selected by #proxy_api, called
# with #proxy_params). Loops until the API returns a non-blank result,
# sleeping 5 seconds between attempts, then recurses once more if the
# returned record lacks :p_addr/:p_port. Returns the proxy hash; does not
# assign @proxy itself (see #update_proxy).
def get_proxy
  proxy_ip = nil
  begin
    Rails.logger.debug("开始获取代理IP")
    proxy_client = HttpCrawler::Proxy.for(proxy_api)
    proxy_r = proxy_client.get_proxy(proxy_params.symbolize_keys)
    proxy_ip = proxy_r.results unless proxy_r.results.blank?
    # Disabled local-proxy test hook. NOTE(review): `=!` looks like a typo
    # for `!=` — harmless while commented out.
    # proxy_ip = {p_addr: "127.0.0.1", p_port: 8888} if "production" =! Rails.env
    if proxy_ip.blank?
      Rails.logger.warn "无最新代理等待5秒后重新获取:proxy 为空"
    else
      # A usable proxy was returned — leave the retry loop.
      break
    end
    sleep(5)
  end while true
  proxy_ip = proxy_ip.symbolize_keys

  # Defensive re-fetch when the API returned a malformed record.
  # NOTE(review): this recursion is unbounded if the API keeps returning
  # records without :p_addr/:p_port.
  unless proxy_ip[:p_addr] && proxy_ip[:p_port]
    Rails.logger.warn "无最新代理等待5秒后重新获取:p_addr 或 p_port 为空"
    sleep(5)
    proxy_ip = get_proxy
  end

  Rails.logger.info("当前IP => #{@proxy},切换至代理 => #{proxy_ip}")
  proxy_ip
end

#get_uriObject

直接发送uri的get请求



288
289
290
291
# File 'lib/http_crawler/client.rb', line 288

# Performs a GET against the client's base uri itself (no extra path).
# Raises when no uri is configured.
def get_uri
  raise "Client uri为空" unless uri
  request { http.get(uri.to_s, ssl_context: @ctx) }
end

#httpObject

初始化http请求前置条件



248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
# File 'lib/http_crawler/client.rb', line 248

# Assembles the HTTP client used for each request, layering redirect
# handling, proxy, headers, cookies and timeouts onto http.rb's chainable API.
def http
  # Follow redirects automatically, at most 5 hops.
  client = HTTP.follow(max_hops: 5)

  # Route through the configured proxy, when one is set.
  unless @proxy.blank?
    client = client.via(@proxy[:p_addr], @proxy[:p_port].to_i, @proxy[:p_user], @proxy[:p_pass])
  end

  # Attach request headers and cookies when present.
  client = client.headers(header) if header
  client = client.cookies(cookies) if cookies

  # A single overall deadline takes precedence over per-phase timeouts.
  client = if @all_timeout
             client.timeout(@all_timeout)
           else
             client.timeout(connect: @connect_time, write: @write_time, read: @read_time)
           end

  client
end

#init_clientObject

初始化init_client参数



243
244
245
# File 'lib/http_crawler/client.rb', line 243

# Subclass hook for extra client setup, invoked at the end of #initialize;
# the base implementation does nothing.
def init_client
  nil
end

#init_cookies(parameter = {}) ⇒ Object



143
144
145
146
# File 'lib/http_crawler/client.rb', line 143

# Resets the cookie jar to an empty Hash. The parameter is accepted for
# subclass overrides; the base implementation only normalizes it.
def init_cookies(parameter = {})
  parameter.symbolize_keys
  @cookies = {}
end

#init_header(parameter = {}) ⇒ Object



119
120
121
122
123
124
125
126
127
128
129
# File 'lib/http_crawler/client.rb', line 119

# Builds the default request headers, mimicking a desktop Chrome browser.
# The parameter is accepted for subclass overrides; unused here.
def init_header(parameter = {})
  parameter = parameter.symbolize_keys
  @header = {
    :"Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    :"Accept-Encoding" => "gzip, br",
    :"Accept-Language" => "zh-CN,zh;q=0.9",
    :"Connection" => "keep-alive",
    :"Upgrade-Insecure-Requests" => "1",
    :"User-Agent" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36",
  }
end

#init_sslObject

初始化 ssl 协议



104
105
106
107
108
109
110
# File 'lib/http_crawler/client.rb', line 104

# Prepares a permissive SSL context for https URIs: certificate
# verification is disabled (VERIFY_NONE), so any server cert is accepted.
# No-op for non-https schemes.
def init_ssl
  return unless @uri.scheme == "https"
  @ctx = OpenSSL::SSL::SSLContext.new
  @ctx.verify_mode = OpenSSL::SSL::VERIFY_NONE
end

#init_timeoutObject

初始化超时时间



96
97
98
99
100
101
# File 'lib/http_crawler/client.rb', line 96

# Sets the default timeouts: 5 seconds each for connect/write/read, and no
# overall deadline (@all_timeout nil selects per-phase timeouts in #http).
def init_timeout
  @connect_time = @write_time = @read_time = 5
  @all_timeout = nil
end

#init_uriObject

init_uri 如果未初始化@uri,则会报错



71
72
73
# File 'lib/http_crawler/client.rb', line 71

# Default base-URI initializer: the base class knows no URI, so @uri stays
# nil. Subclasses override this to supply their site's URI; with @uri left
# nil, dependent calls raise ("Client uri为空" in #get / #post / #get_uri).
def init_uri
  @uri = nil
end

#post(path, params = {}, format = :form) ⇒ Object

发送 post 请求



294
295
296
297
# File 'lib/http_crawler/client.rb', line 294

# Sends a POST to uri + path.
#
# path   - path/fragment appended to the client's base uri
# params - request body payload
# format - body encoding key passed to http.rb (:form, :json, ...)
#
# Raises when the client has no uri.
def post(path, params = {}, format = :form)
  raise "Client uri为空" unless uri
  target = (uri + path).to_s
  request { http.post(target, format => params, ssl_context: @ctx) }
end

#proxy_apiObject

代理使用的api方法名



169
170
171
# File 'lib/http_crawler/client.rb', line 169

# Name of the proxy API backend (see HttpCrawler::Proxy.for); lazily
# defaults to "my" on first access.
def proxy_api
  @proxy_api = "my" unless @proxy_api
  @proxy_api
end

#proxy_paramsObject

调用代理 api使用的参数



174
175
176
# File 'lib/http_crawler/client.rb', line 174

# Parameters sent to the proxy API; lazily defaults to {key: "default"}
# (overwritten in #initialize with a class-name-derived key).
def proxy_params
  @proxy_params = {key: "default"} unless @proxy_params
  @proxy_params
end

#str_to_cookies(str) ⇒ Object

字符串转换成cookies,键为符号:"abc=123; cd=412" => { abc: "123", cd: "412" }



155
156
157
158
159
# File 'lib/http_crawler/client.rb', line 155

# Parses a cookie header string into the cookie jar,
# e.g. "abc=123; cd=412" stores { abc: "123", cd: "412" }
# (keys become symbols). Returns str (String#scan with a block).
def str_to_cookies(str)
  str.scan(/([^=]*)=([^;]*);? ?/) do |name, value|
    cookies[name.to_sym] = value
  end
end

#update_cookies(parameter = {}) ⇒ Object



148
149
150
151
# File 'lib/http_crawler/client.rb', line 148

# Hook for subclasses to refresh cookies; the base implementation only
# normalizes the argument and returns nil.
def update_cookies(parameter = {})
  parameter.symbolize_keys
  nil
end

#update_header(parameter = {}) ⇒ Object



131
132
133
134
# File 'lib/http_crawler/client.rb', line 131

# Resets @header via #init_header.
# NOTE(review): parameter is symbolized but then discarded — it is NOT
# forwarded to init_header; confirm whether that is intentional.
def update_header(parameter = {})
  parameter = parameter.symbolize_keys
  @header = init_header
end

#update_proxy(proxy = {}) ⇒ Object



178
179
180
181
182
183
184
185
186
# File 'lib/http_crawler/client.rb', line 178

# Replaces the current proxy. With an explicit proxy hash it is used as-is
# (keys symbolized); with no/blank argument a fresh proxy is fetched from
# the proxy API via #get_proxy.
def update_proxy(proxy = {})
  proxy = proxy.symbolize_keys
  @proxy = proxy.blank? ? get_proxy : proxy
end

#update_proxy?Boolean

如果自动更新代理 则更新代理返回 true,否则返回false



190
191
192
193
194
195
196
197
# File 'lib/http_crawler/client.rb', line 190

# Rotates the proxy when auto-proxy mode is enabled.
# Returns true when a rotation was triggered, false otherwise.
def update_proxy?
  return false unless @auto_proxy
  update_proxy
  true
end

#update_uri(uri_or_path) ⇒ Object

更新uri



76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# File 'lib/http_crawler/client.rb', line 76

# Points the client at a new location.
#
# uri_or_path - a URI object (used directly), an absolute http(s) URL
#               string (parsed with URI()), or a relative path string
#               (merged onto the current @uri with URI#+).
#               Anything else raises ArgumentError.
#
# Re-runs #init_ssl so the SSL context matches the new scheme, then
# returns the updated uri.
def update_uri(uri_or_path)
  @uri =
    case uri_or_path
    when URI
      uri_or_path
    when String
      uri_or_path =~ /^http/ ? URI(uri_or_path) : @uri + uri_or_path
    else
      raise ArgumentError, uri_or_path
    end
  init_ssl
  uri
end

#validation_to_proxy?(r = response) ⇒ Boolean

出现如果验证码,切换代理



304
305
306
307
308
309
310
311
312
313
314
# File 'lib/http_crawler/client.rb', line 304

# Switches proxy when the response looks like a captcha/validation page.
# Returns true when a captcha was detected (and a rotation attempted via
# #update_proxy?), false otherwise.
def validation_to_proxy?(r = response)
  return false unless r.validation_page?
  update_proxy?
  true
end