Class: HttpCrawler::Client

Inherits:
Object show all
Defined in:
lib/http_crawler/client.rb

Direct Known Subclasses

Proxy::Client, Web::Client

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(parameter = {}) ⇒ Client

init_uri 如果未初始化@uri,则会报错

继承类需要重定义 init_uri


242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
# File 'lib/http_crawler/client.rb', line 242

# Builds a client: establishes the base URI, then configures timeouts, SSL,
# subclass-specific settings and the parameters sent to the proxy API.
#
# @param parameter [Hash] optional settings; only +:uri+ is read here
# @raise [RuntimeError] if +:uri+ is supplied although +init_uri+ already
#   produced a URI
def initialize(parameter = {})
  # Let the class set up its default URI first.
  init_uri

  # A caller-supplied URI is only accepted when no default URI exists.
  custom_uri = parameter[:uri]
  raise "Client uri为重复初始化" if custom_uri && uri
  update_uri(custom_uri) if custom_uri

  # Timeouts, SSL context and client-specific settings, in this order.
  init_timeout
  init_ssl
  init_client

  # Parameters for the proxy API: the key is this client's class name.
  @proxy_params = { key: self.class.to_s }
end

Instance Attribute Details

#cookies(parameter = {}) ⇒ Object

cookies相关方法



104
105
106
# File 'lib/http_crawler/client.rb', line 104

# Reader for the cookies held by this client.
#
# @return [Object] the current value of +@cookies+ (nil until something
#   assigns it, e.g. +init_cookies+)
def cookies
  @cookies
end

#error_urls ⇒ Object

Returns the value of attribute error_urls.



195
196
197
# File 'lib/http_crawler/client.rb', line 195

# Reader for the collection of error URL patterns.
#
# @return [Object] the current value of +@error_urls+ (nil until assigned)
def error_urls
  @error_urls
end

#header(parameter = {}) ⇒ Object

头文件相关方法



83
84
85
# File 'lib/http_crawler/client.rb', line 83

# Reader for the request headers held by this client.
#
# @return [Object] the current value of +@header+ (nil until something
#   assigns it, e.g. +init_header+)
def header
  @header
end

#max_error_num ⇒ Object

最大错误重试次数



33
34
35
# File 'lib/http_crawler/client.rb', line 33

# Reader for the maximum number of retries after an error.
#
# @return [Object] the current value of +@max_error_num+ (nil until assigned)
def max_error_num
  @max_error_num
end

#response ⇒ Object

请求的响应



284
285
286
# File 'lib/http_crawler/client.rb', line 284

# Reader for the response of a request.
#
# @return [Object] the current value of +@response+ (nil until assigned)
def response
  @response
end

#uri ⇒ Object (readonly)

Returns the value of attribute uri.



37
38
39
# File 'lib/http_crawler/client.rb', line 37

# Reader for the client's base URI.
#
# @return [Object] the current value of +@uri+ (nil until assigned)
def uri
  @uri
end

Class Method Details

.for(web_name) ⇒ Object

接收格式web_name = “biquge_duquanben” 返回 HttpCrawler::Web::BiqugeDuquanben::Client 实例



12
13
14
# File 'lib/http_crawler/client.rb', line 12

# Instantiates the client for a web name given in snake_case, e.g.
# "biquge_duquanben" resolves to HttpCrawler::Web::BiqugeDuquanben::Client.
#
# @param web_name [String] snake_case name of the web module
# @return [Object] a new instance of the resolved client class
def for(web_name)
  client_class_name = "HttpCrawler::Web::#{web_name.camelize}::Client"
  client_class_name.constantize.new
end

.for_module(module_name, *args) ⇒ Object

接收格式module_name = “HttpCrawler::Web::BiqugeDuquanben” 返回 HttpCrawler::Web::BiqugeDuquanben::Client 实例



21
22
23
# File 'lib/http_crawler/client.rb', line 21

# Instantiates the Client class nested inside the named module, e.g.
# "HttpCrawler::Web::BiqugeDuquanben" resolves to
# HttpCrawler::Web::BiqugeDuquanben::Client.
#
# @param module_name [String] fully qualified module name
# @param args [Array] accepted but not used by this implementation
# @return [Object] a new instance of the resolved client class
def for_module(module_name, *args)
  client_class = "#{module_name}::Client".constantize
  client_class.new
end

.for_uri(path) ⇒ Object



25
26
27
# File 'lib/http_crawler/client.rb', line 25

# Instantiates the receiver class with +path+ passed as the +:uri+ option.
#
# @param path [Object] value forwarded to the constructor as +uri:+
# @return [Object] the new instance
def for_uri(path)
  self.new(uri: path)
end

Instance Method Details

#add_error_url(url_string) ⇒ Object

添加错误的url地址,表示这里面的url都是异常地址,存的是正则



202
203
204
# File 'lib/http_crawler/client.rb', line 202

# Registers a URL pattern that marks a URL as erroneous (per the accompanying
# documentation the stored values are regular expressions).
#
# The list is created on first use, so calling this before anything has
# assigned +@error_urls+ no longer fails on nil.
#
# @param url_string [Object] the pattern to append
# @return [Object] the collection the pattern was appended to
def add_error_url(url_string)
  @error_urls ||= []
  error_urls << url_string
end

#auto_proxy=(value) ⇒ Object

代理设置



125
126
127
128
129
# File 'lib/http_crawler/client.rb', line 125

# Stores the automatic-proxy flag and, when it is switched to exactly +true+
# while no proxy is set yet, fetches one right away.
#
# @param value [Object] the new flag value
def auto_proxy=(value)
  Rails.logger.debug "自动更新代理"
  @auto_proxy = value
  # Only the literal +true+ triggers an immediate fetch.
  return unless value == true

  update_proxy if @proxy.blank?
end

#get(path, params = {}) ⇒ Object

发送 get 请求



266
267
268
269
# File 'lib/http_crawler/client.rb', line 266

# Sends a GET request to +path+ resolved against the client's URI.
#
# @param path [String] path merged onto the base URI
# @param params [Hash] query parameters passed as +:params+
# @return [Object] whatever +request+ returns for the block
# @raise [RuntimeError] when the client has no URI
def get(path, params = {})
  raise "Client uri为空" unless uri

  # The URL is built inside the block so it is evaluated by +request+.
  request do
    http.get((uri + path).to_s, params: params, ssl_context: @ctx)
  end
end

#get_proxy ⇒ Object

获取proxy 通过调用 api 获取代理或者通过自定义设置代理



164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
# File 'lib/http_crawler/client.rb', line 164

# Polls the proxy API until it yields a proxy that is usable and differs from
# the one currently in use, waiting 5 seconds between attempts.
#
# A candidate is rejected when the API result is blank, when it lacks
# "p_addr" or "p_port", or when both values equal those of +@proxy+.
# Retries run in a single loop rather than through recursion, so a long
# outage of the proxy API cannot grow the call stack.
#
# @return [Object] the accepted API result (indexed with "p_addr"/"p_port")
def get_proxy
  loop do
    Rails.logger.debug("开始获取代理IP")
    proxy_client = HttpCrawler::Proxy.for(proxy_api)
    candidate = proxy_client.get_proxy(proxy_params).results

    # Nothing came back: wait and ask again.
    if candidate.blank?
      Rails.logger.warn "无最新代理等待5秒后重新获取"
      sleep(5)
      next
    end

    Rails.logger.debug("当前IP => #{@proxy},获取最新代理 => #{candidate}")

    complete = candidate["p_addr"] && candidate["p_port"]
    same_as_current = complete && @proxy &&
                      @proxy["p_addr"] == candidate["p_addr"] &&
                      @proxy["p_port"] == candidate["p_port"]
    return candidate if complete && !same_as_current

    # Incomplete, or identical to the proxy already in use: wait and retry.
    Rails.logger.warn "无最新代理等待5秒后重新获取"
    sleep(5)
  end
end

#get_uri ⇒ Object

直接发送uri的get请求



272
273
274
275
# File 'lib/http_crawler/client.rb', line 272

# Sends a GET request directly to the client's own URI.
#
# @return [Object] whatever +request+ returns for the block
# @raise [RuntimeError] when the client has no URI
def get_uri
  raise "Client uri为空" unless uri

  request do
    http.get(uri.to_s, ssl_context: @ctx)
  end
end

#http ⇒ Object

初始化http请求前置条件



213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
# File 'lib/http_crawler/client.rb', line 213

# Builds the HTTP client used for a request from the current settings:
# redirect following, optional proxy, headers, cookies and timeouts.
#
# @return [Object] the configured client chain
def http
  # Follow redirects automatically, at most 5 hops.
  client = HTTP.follow(max_hops: 5)

  # Route through the proxy when one is configured.
  unless @proxy.blank?
    client = client.via(@proxy["p_addr"], @proxy["p_port"].to_i, @proxy["p_user"], @proxy["p_pass"])
  end

  # Attach headers and cookies when present.
  client = client.headers(header) if header
  client = client.cookies(cookies) if cookies

  # One overall timeout takes precedence over the per-operation timeouts.
  if @all_timeout
    client.timeout(@all_timeout)
  else
    client.timeout(connect: @connect_time, write: @write_time, read: @read_time)
  end
end

#init_client ⇒ Object

初始化init_client参数



208
209
210
# File 'lib/http_crawler/client.rb', line 208

# Hook for client-specific settings; called from the constructor. This
# implementation does nothing.
#
# @return [nil]
def init_client
  nil
end

#init_cookies(parameter = {}) ⇒ Object



108
109
110
# File 'lib/http_crawler/client.rb', line 108

# Resets the cookies to an empty hash.
#
# @param parameter [Hash] accepted but not used by this implementation
# @return [Hash] the new, empty cookie hash
def init_cookies(parameter = {})
  @cookies = {}
end

#init_header(parameter = {}) ⇒ Object



87
88
89
90
91
92
93
94
95
96
# File 'lib/http_crawler/client.rb', line 87

# Resets the request headers to a fixed, browser-like default set.
#
# @param parameter [Hash] accepted but not used by this implementation
# @return [Hash{Symbol => String}] the new header hash (also stored in +@header+)
def init_header(parameter = {})
  defaults = {}
  defaults[:"Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
  defaults[:"Accept-Encoding"] = "gzip, br"
  defaults[:"Accept-Language"] = "zh-CN,zh;q=0.9"
  defaults[:"Connection"] = "keep-alive"
  defaults[:"Upgrade-Insecure-Requests"] = "1"
  defaults[:"User-Agent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36"
  @header = defaults
end

#init_ssl ⇒ Object

初始化 ssl 协议



73
74
75
76
77
78
79
# File 'lib/http_crawler/client.rb', line 73

# Prepares the SSL context when the client's URI uses the https scheme.
# Nothing is changed for any other scheme.
#
# Note: the context is created with certificate verification disabled
# (OpenSSL::SSL::VERIFY_NONE).
def init_ssl
  return unless @uri.scheme == "https"

  @ctx = OpenSSL::SSL::SSLContext.new
  @ctx.verify_mode = OpenSSL::SSL::VERIFY_NONE
end

#init_timeout ⇒ Object

初始化超时时间



65
66
67
68
69
70
# File 'lib/http_crawler/client.rb', line 65

# Resets the timeouts: 5 for each of connect, write and read, and no
# overall timeout.
#
# @return [nil]
def init_timeout
  @connect_time = @write_time = @read_time = 5
  @all_timeout = nil
end

#init_uri ⇒ Object

init_uri 如果未初始化@uri,则会报错

继承类需要实现 @uri = URI("http://host")


41
42
43
# File 'lib/http_crawler/client.rb', line 41

# Sets the default URI. This implementation leaves it unset (nil); per the
# accompanying documentation, inheriting classes are expected to assign
# something like +@uri = URI("http://host")+ here.
#
# @return [nil]
def init_uri
  @uri = nil
end

#post(path, params = {}) ⇒ Object

发送 post 请求



278
279
280
281
# File 'lib/http_crawler/client.rb', line 278

# Sends a POST request to +path+ resolved against the client's URI.
#
# @param path [String] path merged onto the base URI
# @param params [Hash] form fields passed as +:form+
# @return [Object] whatever +request+ returns for the block
# @raise [RuntimeError] when the client has no URI
def post(path, params = {})
  raise "Client uri为空" unless uri

  # The URL is built inside the block so it is evaluated by +request+.
  request do
    http.post((uri + path).to_s, form: params, ssl_context: @ctx)
  end
end

#proxy_api ⇒ Object

代理使用的api方法名



132
133
134
# File 'lib/http_crawler/client.rb', line 132

# Name of the proxy API to use. Falls back to "my" (and remembers it) when
# +@proxy_api+ has not been set.
#
# @return [Object] the value of +@proxy_api+ after applying the default
def proxy_api
  @proxy_api ||= "my"
end

#proxy_params ⇒ Object

调用代理 api使用的参数



137
138
139
# File 'lib/http_crawler/client.rb', line 137

# Parameters sent to the proxy API. Falls back to a hash with the symbol key
# +:key+ mapped to "default" (and remembers it) when +@proxy_params+ has not
# been set.
#
# @return [Object] the value of +@proxy_params+ after applying the default
def proxy_params
  @proxy_params ||= {"key": "default"}
end

#str_to_cookies(str) ⇒ Object

字符串转换成cookies “abc=123; cd=412” => { “abc”: “123”, “cd”: “412”}



118
119
120
121
122
# File 'lib/http_crawler/client.rb', line 118

# Parses a cookie string and merges the pairs into +cookies+ under symbol
# keys, e.g. "abc=123; cd=412" stores { abc: "123", cd: "412" }.
#
# The cookie store is initialised through +init_cookies+ when it is still
# nil. Pairs are separated by ";" and split at the first "=", so values may
# themselves contain "=". Surrounding whitespace is removed from names and
# values; segments without "=" or with an empty name are skipped.
#
# @param str [String, nil] cookie string; nil is treated as empty
# @return [String, nil] +str+ itself, unchanged
def str_to_cookies(str)
  init_cookies if cookies.nil?

  str.to_s.split(";").each do |pair|
    name, value = pair.split("=", 2)
    next if value.nil?

    name = name.strip
    next if name.empty?

    cookies[name.to_sym] = value.strip
  end
  str
end

#update_cookies(parameter = {}) ⇒ Object



112
113
114
# File 'lib/http_crawler/client.rb', line 112

# Hook for updating cookies. This implementation does nothing.
#
# @param parameter [Hash] accepted but not used by this implementation
# @return [nil]
def update_cookies(parameter = {})
  nil
end

#update_header(parameter = {}) ⇒ Object



98
99
100
# File 'lib/http_crawler/client.rb', line 98

# Replaces the headers with whatever +init_header+ returns.
#
# @param parameter [Hash] accepted but not used by this implementation
# @return [Object] the new value of +@header+
def update_header(parameter = {})
  @header = init_header
end

#update_proxy(proxy = {}) ⇒ Object



141
142
143
144
145
146
147
148
# File 'lib/http_crawler/client.rb', line 141

# Sets the proxy in use: the given one when it is not blank, otherwise a
# fresh one obtained from +get_proxy+.
#
# @param proxy [Hash] proxy to use; blank means "fetch one"
# @return [Object] the proxy now stored in +@proxy+
def update_proxy(proxy = {})
  @proxy = proxy.blank? ? get_proxy : proxy
end

#update_proxy?(proxy_ip = {}) ⇒ Boolean

如果自动更新代理 则更新代理返回 true,否则返回false

Returns:

  • (Boolean)


152
153
154
155
156
157
158
159
# File 'lib/http_crawler/client.rb', line 152

# Switches the proxy when automatic proxy updating is enabled.
#
# @param proxy_ip [Hash] forwarded to +update_proxy+
# @return [Boolean] true when the proxy was updated, false when automatic
#   updating is off
def update_proxy?(proxy_ip = {})
  return false unless @auto_proxy

  update_proxy(proxy_ip)
  true
end

#update_uri(uri_or_path) ⇒ Object

更新uri



46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/http_crawler/client.rb', line 46

# Replaces or extends the client's URI and refreshes the SSL setup.
#
# * a URI object replaces the current URI;
# * a String starting with "http://" or "https://" is parsed as the new URI;
# * any other String is treated as a path merged onto the current URI.
#
# Only a real scheme prefix counts as an absolute URL, so relative paths that
# merely begin with the letters "http" (e.g. "httpbin/get") are merged.
#
# @param uri_or_path [URI, String] new URI, absolute URL or relative path
# @return [Object] the client's URI after the update
# @raise [RuntimeError] when a relative path is given but no URI is set yet
# @raise [ArgumentError] when the argument is neither a URI nor a String
def update_uri(uri_or_path)
  case uri_or_path
  when URI
    @uri = uri_or_path
  when String
    if uri_or_path.match?(%r{\Ahttps?://}i)
      @uri = URI(uri_or_path)
    else
      # A relative path needs a base to be merged onto.
      raise "Client uri为空" unless @uri

      @uri += uri_or_path
    end
  else
    raise ArgumentError, "uri_or_path must be a URI or String, got #{uri_or_path.inspect}"
  end
  # The scheme may have changed, so refresh the SSL setup.
  init_ssl
  uri
end

#validation_to_proxy?(r = response) ⇒ Boolean

如果出现验证码,则切换代理

Returns:

  • (Boolean)


288
289
290
291
292
293
294
295
296
297
298
# File 'lib/http_crawler/client.rb', line 288

# Switches the proxy when the response turns out to be a validation
# (captcha) page.
#
# @param r [Object] response to inspect; defaults to +response+
# @return [Boolean] true when a validation page was detected (a proxy switch
#   was then attempted via +update_proxy?+), false otherwise
def validation_to_proxy?(r = response)
  # No captcha: nothing to do.
  return false unless r.validation_page?

  # Captcha hit: try to switch the proxy, and report it as handled.
  update_proxy?
  true
end