Class: HttpCrawler::Client

Inherits:
Object show all
Defined in:
lib/http_crawler/client.rb

Direct Known Subclasses

Proxy::Client, Web::Client

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(parameter = {}) ⇒ Client

init_uri 如果未初始化@uri,则会报错



243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
# File 'lib/http_crawler/client.rb', line 243

# Builds a client: resolves the base URI, timeouts, SSL context and any
# subclass-specific state.
#
# @param parameter [Hash] options; :uri overrides the subclass base URI
# @raise [RuntimeError] when :uri is given but the subclass already set one
def initialize(parameter = {})
  # Let the subclass establish its base URI (base implementation leaves @uri nil).
  init_uri

  # A caller-supplied URI is only allowed when the subclass did not set one.
  if parameter[:uri]
    raise "Client uri为重复初始化" if uri
    update_uri(parameter[:uri])
  end

  # Default per-phase timeouts (connect/write/read = 5, no overall cap).
  init_timeout

  # SSL context — only built when the URI scheme is https.
  init_ssl

  # Hook for subclass-specific setup (no-op in the base class).
  init_client

  # Parameters sent to the proxy API, keyed by the concrete class name.
  @proxy_params = {key: "#{self.class}"}
end

Instance Attribute Details

#all_timeoutObject

Returns the value of attribute all_timeout.



64
65
66
# File 'lib/http_crawler/client.rb', line 64

# Overall request timeout; nil means per-phase connect/write/read
# timeouts are used instead (see #init_timeout and #http).
def all_timeout
  @all_timeout
end

#connect_timeObject

Returns the value of attribute connect_time.



64
65
66
# File 'lib/http_crawler/client.rb', line 64

# Connect-phase timeout (defaults to 5 via #init_timeout).
def connect_time
  @connect_time
end

#cookies(parameter = {}) ⇒ Object

cookies相关方法



105
106
107
# File 'lib/http_crawler/client.rb', line 105

# Cookie jar hash attached to requests (see #init_cookies, #str_to_cookies).
def cookies
  @cookies
end

#error_urlsObject

Returns the value of attribute error_urls.



196
197
198
# File 'lib/http_crawler/client.rb', line 196

# Collection of known-bad URL patterns (regex sources appended via #add_error_url).
def error_urls
  @error_urls
end

#header(parameter = {}) ⇒ Object

头文件相关方法



84
85
86
# File 'lib/http_crawler/client.rb', line 84

# Default request headers hash attached to requests (see #init_header).
def header
  @header
end

#max_error_numObject

最大错误重试次数



33
34
35
# File 'lib/http_crawler/client.rb', line 33

# Maximum number of retries after a request error.
def max_error_num
  @max_error_num
end

#read_timeObject

Returns the value of attribute read_time.



64
65
66
# File 'lib/http_crawler/client.rb', line 64

# Read-phase timeout (defaults to 5 via #init_timeout).
def read_time
  @read_time
end

#responseObject

请求的响应



285
286
287
# File 'lib/http_crawler/client.rb', line 285

# The response of the request (assigned outside this reader).
def response
  @response
end

#uriObject (readonly)

Returns the value of attribute uri.



37
38
39
# File 'lib/http_crawler/client.rb', line 37

# The client's base URI (read-only; changed via #update_uri).
def uri
  @uri
end

#write_timeObject

Returns the value of attribute write_time.



64
65
66
# File 'lib/http_crawler/client.rb', line 64

# Write-phase timeout (defaults to 5 via #init_timeout).
def write_time
  @write_time
end

Class Method Details

.for(web_name) ⇒ Object

接收格式 web_name = "biquge_duquanben",返回 HttpCrawler::Web::BiqugeDuquanben::Client 实例



12
13
14
# File 'lib/http_crawler/client.rb', line 12

# Builds a client for a named web module: "biquge_duquanben" resolves to
# HttpCrawler::Web::BiqugeDuquanben::Client.
#
# @param web_name [String] snake_case web module name
# @return [HttpCrawler::Client] new client instance
def for(web_name)
  "HttpCrawler::Web::#{web_name.camelize}::Client".constantize.new()
end

.for_module(module_name, *args) ⇒ Object

接收格式 module_name = "HttpCrawler::Web::BiqugeDuquanben",返回 HttpCrawler::Web::BiqugeDuquanben::Client 实例



21
22
23
# File 'lib/http_crawler/client.rb', line 21

# Builds a client from a fully-qualified module name, e.g.
# "HttpCrawler::Web::BiqugeDuquanben" resolves to
# HttpCrawler::Web::BiqugeDuquanben::Client.
# NOTE(review): *args is accepted but never forwarded to .new — confirm intent.
def for_module(module_name, *args)
  "#{module_name}::Client".constantize.new()
end

.for_uri(path) ⇒ Object



25
26
27
# File 'lib/http_crawler/client.rb', line 25

# Builds a client bound to an explicit URI (URI object or string).
def for_uri(path)
  self.new(uri: path)
end

Instance Method Details

#add_error_url(url_string) ⇒ Object

添加错误的url地址,表示这里面的url都是异常地址,存的是正则



203
204
205
# File 'lib/http_crawler/client.rb', line 203

# Registers a URL pattern (regex source) whose matches are treated as
# error addresses.
# NOTE(review): assumes #error_urls already holds an Array — a nil value
# would raise NoMethodError; confirm where it is initialized.
def add_error_url(url_string)
  self.error_urls << url_string
end

#auto_proxy=(value) ⇒ Object

代理设置



126
127
128
129
130
# File 'lib/http_crawler/client.rb', line 126

# Enables or disables automatic proxy rotation. Enabling it while no
# proxy is currently set fetches one immediately.
def auto_proxy=(flag)
  Rails.logger.debug "自动更新代理"
  @auto_proxy = flag
  # Only the literal +true+ (not mere truthiness) triggers the eager fetch.
  update_proxy if flag == true && @proxy.blank?
end

#get(path, params = {}) ⇒ Object

发送 get 请求



267
268
269
270
# File 'lib/http_crawler/client.rb', line 267

# Sends a GET request to the base URI merged with +path+.
#
# @param path [String] path merged onto self.uri
# @param params [Hash] query-string parameters
# @raise [RuntimeError] "Client uri为空" when the URI was never initialized
def get(path, params = {})
  raise "Client uri为空" unless self.uri
  request {http.get((self.uri + path).to_s, :params => params, :ssl_context => @ctx)}
end

#get_proxyObject

获取proxy 通过调用 api 获取代理或者通过自定义设置代理



165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
# File 'lib/http_crawler/client.rb', line 165

# Fetches a fresh proxy record from the configured proxy API, blocking
# (sleeping 5s between attempts) until a usable one is returned.
#
# A result is rejected — triggering another wait-and-retry — when it is
# blank, lacks "p_addr"/"p_port", or is the same address/port pair the
# client is already using.
#
# @return [Hash] proxy record containing at least "p_addr" and "p_port"
def get_proxy
  proxy_ip = nil
  # Poll the proxy API until it yields a non-blank result.
  # (loop/break replaces the original `begin ... end while true` do-while;
  # proxy_r.results is also read once instead of twice.)
  loop do
    Rails.logger.debug("开始获取代理IP")
    proxy_client = HttpCrawler::Proxy.for(proxy_api)
    proxy_r = proxy_client.get_proxy(proxy_params)
    results = proxy_r.results
    proxy_ip = results unless results.blank?
    break unless proxy_ip.blank?
    Rails.logger.warn "无最新代理等待5秒后重新获取"
    sleep(5)
  end

  Rails.logger.debug("当前IP => #{@proxy},获取最新代理 => #{proxy_ip}")

  # Reject records missing the address/port needed to build a connection.
  unless proxy_ip && proxy_ip["p_addr"] && proxy_ip["p_port"]
    Rails.logger.warn "无最新代理等待5秒后重新获取"
    sleep(5)
    proxy_ip = get_proxy
  end

  # Reject the proxy already in use; wait until a different one arrives.
  if (@proxy && proxy_ip && @proxy["p_addr"] == proxy_ip["p_addr"] && @proxy["p_port"] == proxy_ip["p_port"])
    Rails.logger.warn "无最新代理等待5秒后重新获取"
    sleep(5)
    proxy_ip = get_proxy
  end
  proxy_ip
end

#get_uriObject

直接发送uri的get请求



273
274
275
276
# File 'lib/http_crawler/client.rb', line 273

# Sends a GET request to the bare base URI (no extra path or params).
#
# @raise [RuntimeError] "Client uri为空" when the URI was never initialized
def get_uri
  raise "Client uri为空" unless self.uri
  request {http.get(self.uri.to_s, :ssl_context => @ctx)}
end

#httpObject

初始化http请求前置条件



214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
# File 'lib/http_crawler/client.rb', line 214

# Assembles the HTTP client chain used for a single request:
# redirect-following, optional proxy, headers, cookies and timeouts.
def http
  # Follow redirects automatically, at most 5 hops deep.
  h = HTTP.follow(max_hops: 5)

  # Route through the current proxy, when one is set.
  h = h.via(@proxy["p_addr"], @proxy["p_port"].to_i, @proxy["p_user"], @proxy["p_pass"]) unless (@proxy.blank?)

  # Attach the default request headers.
  h = h.headers(header) if header

  # Attach the cookie jar.
  h = h.cookies(cookies) if cookies

  # Timeouts: either one overall budget, or per-phase limits.
  if (@all_timeout)
    # Single cap covering the whole request.
    h = h.timeout(@all_timeout)
  else
    # Separate connect/write/read limits.
    h = h.timeout(connect: @connect_time, write: @write_time, read: @read_time)
  end

  h
end

#init_clientObject

初始化init_client参数



209
210
211
# File 'lib/http_crawler/client.rb', line 209

# Hook for subclass-specific initialization; the base implementation
# does nothing.
def init_client
  nil
end

#init_cookies(parameter = {}) ⇒ Object



109
110
111
# File 'lib/http_crawler/client.rb', line 109

# Resets the cookie jar to an empty hash.
def init_cookies(parameter = {})
  @cookies = {}
end

#init_header(parameter = {}) ⇒ Object



88
89
90
91
92
93
94
95
96
97
# File 'lib/http_crawler/client.rb', line 88

# Installs the default browser-like request headers (Chrome 72 on macOS
# user agent).
#
# @return [Hash] the header hash assigned to @header
def init_header(parameter = {})
  @header = {
      "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
      "Accept-Encoding": "gzip, br",
      "Accept-Language": "zh-CN,zh;q=0.9",
      "Connection": "keep-alive",
      "Upgrade-Insecure-Requests": "1",
      "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36",
  }
end

#init_sslObject

初始化 ssl 协议



74
75
76
77
78
79
80
# File 'lib/http_crawler/client.rb', line 74

# Builds the OpenSSL context used for https requests; no-op for other schemes.
# NOTE: certificate verification is disabled (VERIFY_NONE) — the client
# accepts any server certificate.
def init_ssl
  return unless @uri.scheme == "https"
  @ctx = OpenSSL::SSL::SSLContext.new
  @ctx.verify_mode = OpenSSL::SSL::VERIFY_NONE
end

#init_timeoutObject

初始化超时时间



66
67
68
69
70
71
# File 'lib/http_crawler/client.rb', line 66

# Installs the default timeouts: 5 for each of connect/write/read, and
# no overall (single-budget) cap.
def init_timeout
  @connect_time = @write_time = @read_time = 5
  @all_timeout = nil
end

#init_uriObject

init_uri 如果未初始化@uri,则会报错



41
42
43
# File 'lib/http_crawler/client.rb', line 41

# Base implementation: leaves @uri unset. Subclasses are expected to
# override this and assign their own base URI; per the class docs, a
# client whose @uri is never initialized raises at request time.
def init_uri
  @uri = nil
end

#post(path, params = {}) ⇒ Object

发送 post 请求



279
280
281
282
# File 'lib/http_crawler/client.rb', line 279

# Sends a POST request with form-encoded +params+ to the base URI
# merged with +path+.
#
# @raise [RuntimeError] "Client uri为空" when the URI was never initialized
def post(path, params = {})
  raise "Client uri为空" unless self.uri
  request {http.post((self.uri + path).to_s, :form => params, :ssl_context => @ctx)}
end

#proxy_apiObject

代理使用的api方法名



133
134
135
# File 'lib/http_crawler/client.rb', line 133

# Name of the proxy API implementation to use; defaults to "my" unless
# set elsewhere.
def proxy_api
  @proxy_api ||= "my"
end

#proxy_paramsObject

调用代理 api使用的参数



138
139
140
# File 'lib/http_crawler/client.rb', line 138

# Parameters passed when calling the proxy API. Falls back to
# {key: "default"}; #initialize overwrites it with the concrete class name.
def proxy_params
  @proxy_params ||= {"key": "default"}
end

#str_to_cookies(str) ⇒ Object

字符串转换成cookies "abc=123; cd=412" => { "abc": "123", "cd": "412" }



119
120
121
122
123
# File 'lib/http_crawler/client.rb', line 119

# Parses a raw cookie string such as "abc=123; cd=412" and stores each
# pair into #cookies under a symbol key.
#
# @param str [String] "name=value; name=value" formatted cookie string
def str_to_cookies(str)
  pairs = str.scan(/([^=]*)=([^;]*);? ?/)
  pairs.each do |name, value|
    self.cookies[name.to_sym] = value
  end
end

#update_cookies(parameter = {}) ⇒ Object



113
114
115
# File 'lib/http_crawler/client.rb', line 113

# Hook for refreshing cookies; the base implementation does nothing.
def update_cookies(parameter = {})
  nil
end

#update_header(parameter = {}) ⇒ Object



99
100
101
# File 'lib/http_crawler/client.rb', line 99

# Resets the headers back to the defaults from #init_header.
# NOTE(review): +parameter+ is accepted but ignored — confirm intent.
def update_header(parameter = {})
  @header = init_header
end

#update_proxy(proxy = {}) ⇒ Object



142
143
144
145
146
147
148
149
# File 'lib/http_crawler/client.rb', line 142

# Sets the current proxy: uses the supplied record, or pulls a fresh one
# from the proxy API when none (blank) is given.
#
# @param proxy [Hash] proxy record; blank means "fetch one via #get_proxy"
def update_proxy(proxy = {})
  @proxy = proxy.blank? ? get_proxy : proxy
  # @http.update_proxy(proxy)
end

#update_proxy?(proxy_ip = {}) ⇒ Boolean

如果自动更新代理 则更新代理返回 true,否则返回false

Returns:

  • (Boolean)


153
154
155
156
157
158
159
160
# File 'lib/http_crawler/client.rb', line 153

# Rotates the proxy when auto-rotation is enabled.
#
# @param proxy_ip [Hash] record forwarded to #update_proxy
# @return [Boolean] true when the proxy was updated, false otherwise
def update_proxy?(proxy_ip = {})
  return false unless @auto_proxy
  update_proxy(proxy_ip)
  true
end

#update_uri(uri_or_path) ⇒ Object

更新uri



46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/http_crawler/client.rb', line 46

# Replaces or extends the client's URI.
#
# @param uri_or_path [URI, String] a URI object, an absolute
#   "http(s)://..." string, or a relative path merged onto the current @uri
# @return [URI] the updated URI
# @raise [ArgumentError] for any other argument type
def update_uri(uri_or_path)
  case uri_or_path
  when URI
    @uri = uri_or_path
  when String
    # \A anchors the whole string; the original /^http/ would also match
    # after an embedded newline.
    if uri_or_path =~ /\Ahttp/
      @uri = URI(uri_or_path)
    else
      # Relative path: merge onto the existing base (raises if @uri is nil,
      # matching the documented "uninitialized @uri" failure).
      @uri = @uri + uri_or_path
    end
  else
    raise ArgumentError, uri_or_path
  end
  # The scheme may have changed, so rebuild the SSL context.
  self.init_ssl
  self.uri
end

#validation_to_proxy?(r = response) ⇒ Boolean

出现如果验证码,切换代理

Returns:

  • (Boolean)


289
290
291
292
293
294
295
296
297
298
299
# File 'lib/http_crawler/client.rb', line 289

# Switches the proxy when the response is a captcha/validation page.
#
# @param r [#validation_page?] response to inspect (defaults to #response)
# @return [Boolean] true when a validation page was detected (and a proxy
#   rotation was attempted), false otherwise
def validation_to_proxy?(r = response)
  # No captcha page — nothing to do.
  return false unless r.validation_page?
  # Captcha detected: rotate the proxy (honors @auto_proxy inside).
  update_proxy?
  true
end