Class: HttpCrawler::Client

Inherits:
Object
Defined in:
lib/http_crawler/client.rb

Direct Known Subclasses

Proxy::Client, Web::Client

Instance Attribute Summary

Class Method Summary

Instance Method Summary

Constructor Details

#initialize(parameter = {}) ⇒ Client

Calls init_uri; if @uri has not been initialized, an error will be raised.



# File 'lib/http_crawler/client.rb', line 245

def initialize(parameter = {})
  parameter = parameter.symbolize_keys
  # Initialize uri
  init_uri

  # If a custom uri was passed in
  if parameter[:uri]
    raise "Client uri has already been initialized" if uri
    update_uri(parameter[:uri])
  end

  # Initialize timeouts
  init_timeout

  # Initialize the SSL context
  init_ssl

  # Initialize custom client parameters
  init_client

  # Initialize proxy parameters
  @proxy_params = {key: "#{self.class.to_s.gsub(":","_")}"}
end
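
For illustration, a minimal construction sketch; the URL below is hypothetical:

client = HttpCrawler::Client.new(uri: "http://example.com/")
client.uri # => #<URI::HTTP http://example.com/>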

Instance Attribute Details

#all_timeout ⇒ Object

Returns the value of attribute all_timeout.



# File 'lib/http_crawler/client.rb', line 64

def all_timeout
  @all_timeout
end

#connect_time ⇒ Object

Returns the value of attribute connect_time.



# File 'lib/http_crawler/client.rb', line 64

def connect_time
  @connect_time
end

#cookies(parameter = {}) ⇒ Object

Cookie-related methods.



# File 'lib/http_crawler/client.rb', line 108

def cookies
  @cookies
end

#error_urls ⇒ Object

Returns the value of attribute error_urls.



# File 'lib/http_crawler/client.rb', line 198

def error_urls
  @error_urls
end

#header(parameter = {}) ⇒ Object

Header-related methods.



# File 'lib/http_crawler/client.rb', line 84

def header
  @header
end

#max_error_num ⇒ Object

Maximum number of retries on error.



# File 'lib/http_crawler/client.rb', line 33

def max_error_num
  @max_error_num
end

#read_time ⇒ Object

Returns the value of attribute read_time.



# File 'lib/http_crawler/client.rb', line 64

def read_time
  @read_time
end

#response ⇒ Object

The response of the last request.



# File 'lib/http_crawler/client.rb', line 288

def response
  @response
end

#uri ⇒ Object (readonly)

Returns the value of attribute uri.



# File 'lib/http_crawler/client.rb', line 37

def uri
  @uri
end

#write_time ⇒ Object

Returns the value of attribute write_time.



# File 'lib/http_crawler/client.rb', line 64

def write_time
  @write_time
end

Class Method Details

.for(web_name) ⇒ Object

Accepts web_name in the format web_name = "biquge_duquanben" and returns an instance of HttpCrawler::Web::BiqugeDuquanben::Client.



# File 'lib/http_crawler/client.rb', line 12

def for(web_name)
  "HttpCrawler::Web::#{web_name.camelize}::Client".constantize.new()
end
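
Usage, following the web_name format shown above:

client = HttpCrawler::Client.for("biquge_duquanben")
# => an HttpCrawler::Web::BiqugeDuquanben::Client instance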

.for_module(module_name, *args) ⇒ Object

Accepts module_name in the format module_name = "HttpCrawler::Web::BiqugeDuquanben" and returns an instance of HttpCrawler::Web::BiqugeDuquanben::Client.



# File 'lib/http_crawler/client.rb', line 21

def for_module(module_name, *args)
  "#{module_name}::Client".constantize.new()
end
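
Usage, following the module_name format shown above:

client = HttpCrawler::Client.for_module("HttpCrawler::Web::BiqugeDuquanben")
# => an HttpCrawler::Web::BiqugeDuquanben::Client instance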

.for_uri(path) ⇒ Object



# File 'lib/http_crawler/client.rb', line 25

def for_uri(path)
  self.new(uri: path)
end
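
Equivalent to calling new with the :uri parameter; the URL below is hypothetical:

client = HttpCrawler::Client.for_uri("http://example.com/")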

Instance Method Details

#add_error_url(url_string) ⇒ Object

Adds an error URL; the URLs recorded here are treated as abnormal addresses and are stored as regular expressions.



# File 'lib/http_crawler/client.rb', line 205

def add_error_url(url_string)
  self.error_urls << url_string
end

#auto_proxy=(value) ⇒ Object

Proxy setting.



# File 'lib/http_crawler/client.rb', line 132

def auto_proxy=(value)
  Rails.logger.debug "Auto-updating proxy"
  @auto_proxy = value
  update_proxy if (value == true && @proxy.blank?)
end
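
A sketch of enabling automatic proxy switching; it assumes a proxy API (the default is "my") is available:

client.auto_proxy = true # fetches a proxy immediately if none is set yet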

#get(path, params = {}) ⇒ Object

Sends a GET request.



# File 'lib/http_crawler/client.rb', line 270

def get(path, params = {})
  raise "Client uri为空" unless self.uri
  request {http.get((self.uri + path).to_s, :params => params, :ssl_context => @ctx)}
end
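
A hypothetical GET call; the path and query parameter are illustrative only:

response = client.get("/search", wd: "ruby")
# requests uri + "/search" with ?wd=ruby, using the configured headers, cookies, proxy and timeouts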

#get_proxy ⇒ Object

Gets a proxy, either by calling the proxy API or through a custom proxy setting.



# File 'lib/http_crawler/client.rb', line 172

def get_proxy
  proxy_ip = nil
  begin
    Rails.logger.debug("开始获取代理IP")
    proxy_client = HttpCrawler::Proxy.for(proxy_api)
    proxy_r = proxy_client.get_proxy(proxy_params.symbolize_keys)
    proxy_ip = proxy_r.results unless proxy_r.results.blank?
    if proxy_ip.blank?
      Rails.logger.warn "No new proxy available, waiting 5 seconds before retrying: proxy is empty"
    else
      break
    end
    sleep(5)
  end while true
  proxy_ip = proxy_ip.symbolize_keys
  Rails.logger.debug("当前IP => #{@proxy},获取最新代理 => #{proxy_ip}")

  unless proxy_ip[:p_addr] && proxy_ip[:p_port]
    Rails.logger.warn "No new proxy available, waiting 5 seconds before retrying: p_addr or p_port is empty"
    sleep(5)
    proxy_ip = get_proxy
  end

  proxy_ip
end

#get_uri ⇒ Object

Sends a GET request directly to the uri.



# File 'lib/http_crawler/client.rb', line 276

def get_uri
  raise "Client uri为空" unless self.uri
  request {http.get(self.uri.to_s, :ssl_context => @ctx)}
end

#http ⇒ Object

Builds the preconditions for an HTTP request (redirects, proxy, headers, cookies, timeouts).



# File 'lib/http_crawler/client.rb', line 216

def http
  # Follow redirects automatically; maximum number of redirects max_hops: 5
  h = HTTP.follow(max_hops: 5)

  # Add the proxy
  h = h.via(@proxy[:p_addr], @proxy[:p_port].to_i, @proxy[:p_user], @proxy[:p_pass]) unless (@proxy.blank?)

  # Add headers
  h = h.headers(header) if header

  # Add cookies
  h = h.cookies(cookies) if cookies

  # Add timeouts
  if (@all_timeout)
    # A single overall timeout
    h = h.timeout(@all_timeout)
  else
    # Separate timeouts for connect, write and read
    h = h.timeout(connect: @connect_time, write: @write_time, read: @read_time)
  end

  h
end

#init_client ⇒ Object

Initializes custom client parameters.



# File 'lib/http_crawler/client.rb', line 211

def init_client
  nil
end

#init_cookies(parameter = {}) ⇒ Object



# File 'lib/http_crawler/client.rb', line 113

def init_cookies(parameter = {})
  parameter = parameter.symbolize_keys
  @cookies = {}
end

#init_header(parameter = {}) ⇒ Object



# File 'lib/http_crawler/client.rb', line 89

def init_header(parameter = {})
  parameter = parameter.symbolize_keys
  @header = {
      "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
      "Accept-Encoding": "gzip, br",
      "Accept-Language": "zh-CN,zh;q=0.9",
      "Connection": "keep-alive",
      "Upgrade-Insecure-Requests": "1",
      "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36",
  }
end

#init_ssl ⇒ Object

Initializes the SSL context.



# File 'lib/http_crawler/client.rb', line 74

def init_ssl
  if (@uri.scheme == "https")
    # SSL context
    @ctx = OpenSSL::SSL::SSLContext.new
    @ctx.verify_mode = OpenSSL::SSL::VERIFY_NONE
  end
end

#init_timeout ⇒ Object

Initializes timeouts.



# File 'lib/http_crawler/client.rb', line 66

def init_timeout
  @connect_time = 5
  @write_time = 5
  @read_time = 5
  @all_timeout = nil
end

#init_uri ⇒ Object

init_uri: if @uri is not initialized, an error will be raised.



# File 'lib/http_crawler/client.rb', line 41

def init_uri
  @uri = nil
end
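
The base implementation leaves @uri nil; a subclass is expected to override this method and set its own @uri. A minimal sketch, with a hypothetical class name and URL:

class MyClient < HttpCrawler::Client
  def init_uri
    @uri = URI("http://example.com/")
  end
end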

#post(path, params = {}) ⇒ Object

Sends a POST request.



# File 'lib/http_crawler/client.rb', line 282

def post(path, params = {})
  raise "Client uri为空" unless self.uri
  request {http.post((self.uri + path).to_s, :form => params, :ssl_context => @ctx)}
end
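
A hypothetical POST call; the path and form fields are illustrative only:

response = client.post("/login", user: "name", password: "secret")
# the params hash is sent as the HTTP form body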

#proxy_api ⇒ Object

The API method name used to obtain proxies.



# File 'lib/http_crawler/client.rb', line 139

def proxy_api
  @proxy_api ||= "my"
end

#proxy_params ⇒ Object

Parameters used when calling the proxy API.



# File 'lib/http_crawler/client.rb', line 144

def proxy_params
  @proxy_params ||= {key: "default"}
end

#str_to_cookies(str) ⇒ Object

Converts a string into cookies: "abc=123; cd=412" => { "abc": "123", "cd": "412" }



# File 'lib/http_crawler/client.rb', line 125

def str_to_cookies(str)
  str.scan(/([^=]*)=([^;]*);? ?/) do |m|
    self.cookies[:"#{m[0]}"] = m[1]
  end
end
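
Using the example string from the description; cookies must be initialized first, since the constructor does not call init_cookies:

client.init_cookies
client.str_to_cookies("abc=123; cd=412")
client.cookies # => { abc: "123", cd: "412" }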

#update_cookies(parameter = {}) ⇒ Object



# File 'lib/http_crawler/client.rb', line 118

def update_cookies(parameter = {})
  parameter = parameter.symbolize_keys
  nil
end

#update_header(parameter = {}) ⇒ Object



# File 'lib/http_crawler/client.rb', line 101

def update_header(parameter = {})
  parameter = parameter.symbolize_keys
  @header = init_header
end

#update_proxy(proxy = {}) ⇒ Object



# File 'lib/http_crawler/client.rb', line 148

def update_proxy(proxy = {})
  proxy = proxy.symbolize_keys
  if (proxy.blank?)
    @proxy = get_proxy
  else
    @proxy = proxy
  end
  # @http.update_proxy(proxy)
end
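
Setting a proxy by hand, using the keys the client reads in #http; the values below are hypothetical:

client.update_proxy(p_addr: "127.0.0.1", p_port: 8080, p_user: nil, p_pass: nil)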

#update_proxy? ⇒ Boolean

If automatic proxy updating is enabled, updates the proxy and returns true; otherwise returns false.

Returns:

  • (Boolean)


# File 'lib/http_crawler/client.rb', line 160

def update_proxy?
  if @auto_proxy
    self.update_proxy
    return true
  else
    return false
  end
end

#update_uri(uri_or_path) ⇒ Object

Updates the uri.



# File 'lib/http_crawler/client.rb', line 46

def update_uri(uri_or_path)
  case uri_or_path
  when URI
    @uri = uri_or_path
  when String
    if uri_or_path =~ /^http/
      @uri = URI(uri_or_path)
    else
      @uri = @uri + uri_or_path
    end
  else
    raise ArgumentError, uri_or_path
  end
  # Re-initialize the SSL context
  self.init_ssl
  self.uri
end
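
Two hypothetical calls, one with an absolute URL and one with a relative path:

client.update_uri("http://example.com/") # a URI or http(s) string replaces the uri
client.update_uri("books/1.html")        # any other string is joined onto the current uri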

#validation_to_proxy?(r = response) ⇒ Boolean

If a captcha page is encountered, switches the proxy.

Returns:

  • (Boolean)


# File 'lib/http_crawler/client.rb', line 292

def validation_to_proxy?(r = response)
  # Check whether a captcha page was returned
  if r.validation_page?
    # A captcha was detected; switch the proxy
    self.update_proxy?
    # Handled successfully
    return true
  else
    return false
  end
end
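
A hedged usage sketch; it assumes auto_proxy is enabled so that update_proxy? actually refreshes the proxy:

client.get_uri
if client.validation_to_proxy?
  # a captcha page was detected and the proxy was switched; the request can be retried
end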