Module: HttpCrawler::Client

Included in:
Proxy, Web::Baidu::Client
Defined in:
lib/http_crawler/client.rb

Instance Attribute Summary

Class Method Summary

Instance Method Summary

Instance Attribute Details

#max_error_num ⇒ Object

Maximum number of retries after an error.



# File 'lib/http_crawler/client.rb', line 29

def max_error_num
  @max_error_num
end

#response ⇒ Object

The response of the most recent request.



# File 'lib/http_crawler/client.rb', line 210

def response
  @response
end

#uri ⇒ Object (readonly)

Returns the value of attribute uri.



# File 'lib/http_crawler/client.rb', line 33

def uri
  @uri
end

Class Method Details

.for(web_name, *args) ⇒ Object

Accepts web_name in the form "biquge_duquanben" and returns an instance of HttpCrawler::Web::BiqugeDuquanben::Client.



# File 'lib/http_crawler/client.rb', line 12

def for(web_name, *args)
  "HttpCrawler::Web::#{web_name.camelize}::Client".constantize.new(*args)
end
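
A minimal usage sketch (assuming the gem is loaded via require 'http_crawler'): "baidu".camelize resolves to "Baidu", so the call below instantiates HttpCrawler::Web::Baidu::Client, one of the includers listed above.

require 'http_crawler'

client = HttpCrawler::Client.for("baidu")
client.class # => HttpCrawler::Web::Baidu::Client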

.for_module(module_name, *args) ⇒ Object

Accepts module_name in the form "HttpCrawler::Web::BiqugeDuquanben" and returns an instance of HttpCrawler::Web::BiqugeDuquanben::Client.



# File 'lib/http_crawler/client.rb', line 21

def for_module(module_name, *args)
  "#{module_name}::Client".constantize.new(*args)
end
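
The same lookup, keyed by the fully qualified module name (same assumptions as the sketch above):

client = HttpCrawler::Client.for_module("HttpCrawler::Web::Baidu")
client.class # => HttpCrawler::Web::Baidu::Client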

Instance Method Details

#add_error_url(url_string) ⇒ Object

Adds an error URL pattern; entries in this list mark URLs as abnormal addresses and are stored as regular expressions.



# File 'lib/http_crawler/client.rb', line 148

def add_error_url(url_string)
  @http.error_urls << url_string
end
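
A hedged sketch (the pattern itself is hypothetical): a regular expression is appended to the client's error-URL list, so matching URLs are treated as abnormal.

client.add_error_url(/captcha/) # URLs matching this pattern count as error addresses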

#auto_proxy=(value) ⇒ Object

Proxy setting: enables automatic proxy updates.



# File 'lib/http_crawler/client.rb', line 77

def auto_proxy=(value)
  Rails.logger.debug "自动更新代理"
  @auto_proxy = value
  update_proxy if (value == true && @proxy.blank?)
end
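
Enabling it on an existing client (a sketch; per the code above, a proxy is fetched immediately when none is set yet):

client.auto_proxy = true # calls update_proxy if @proxy is blank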

#cookies ⇒ Object

cookies



# File 'lib/http_crawler/client.rb', line 71

def cookies
  @cookies ||= {}
end

#get(path, params = {}) ⇒ Object

Sends a GET request.



# File 'lib/http_crawler/client.rb', line 200

def get(path, params = {})
  request {http.get((@uri + path).to_s, :params => params, :ssl_context => @ctx)}
end
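
A usage sketch (path and query parameters are illustrative): the path is merged onto @uri and the params are sent as the query string.

r = client.get("/search", { q: "ruby" })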

#get_proxy ⇒ Object

Gets a proxy, either by calling the proxy API or from a custom setting.



# File 'lib/http_crawler/client.rb', line 116

def get_proxy
  proxy_ip = nil
  begin
    Rails.logger.debug("开始获取代理IP")
    proxy_client = HttpCrawler::Proxy.for(proxy_api)
    proxy_r = proxy_client.get_proxy(proxy_params)
    proxy_ip = proxy_r.results unless proxy_r.results.blank?
    if proxy_ip.blank?
      Rails.logger.warn "无最新代理等待5秒后重新获取"
    else
      break
    end
    sleep(5)
  end while true

  Rails.logger.debug("当前IP => #{@proxy},获取最新代理 => #{proxy_ip}")

  unless proxy_ip && proxy_ip["p_addr"] && proxy_ip["p_port"]
    Rails.logger.warn "无最新代理等待5秒后重新获取"
    sleep(5)
    proxy_ip = get_proxy
  end

  if (@proxy && proxy_ip && @proxy["p_addr"] == proxy_ip["p_addr"] && @proxy["p_port"] == proxy_ip["p_port"])
    Rails.logger.warn "无最新代理等待5秒后重新获取"
    sleep(5)
    proxy_ip = get_proxy
  end
  proxy_ip
end
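
Judging from the lookups in #get_proxy and #http, the proxy value is a Hash with string keys; a hypothetical example that could be passed directly to #update_proxy:

proxy = {
  "p_addr" => "127.0.0.1", # proxy host
  "p_port" => 8080,        # proxy port (converted with to_i before use)
  "p_user" => nil,         # optional username
  "p_pass" => nil          # optional password
}
client.update_proxy(proxy)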

#header ⇒ Object

Header-related method; returns the request headers.



# File 'lib/http_crawler/client.rb', line 58

def header
  @header ||= init_header
end

#http ⇒ Object

Sets up the prerequisites of an HTTP request (redirects, proxy, headers, cookies, timeouts).



# File 'lib/http_crawler/client.rb', line 159

def http
  # Follow redirects automatically, with at most 5 hops (max_hops: 5)
  h = HTTP.follow(max_hops: 5)

  # Add the proxy
  h = h.via(@proxy["p_addr"], @proxy["p_port"].to_i, @proxy["p_user"], @proxy["p_pass"]) unless (@proxy.blank?)

  # Add headers
  h = h.headers(header) if header

  # Add cookies
  h = h.cookies(cookies) if cookies

  # Add timeouts
  h = h.timeout(connect: @connect_time, write: @write_time, read: @read_time)

  h
end
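
The chain above relies on the http (HTTP.rb) gem; a standalone sketch with concrete, illustrative values:

require 'http'

h = HTTP.follow(max_hops: 5)                    # follow up to 5 redirects
        .via("127.0.0.1", 8080)                 # route through a proxy
        .headers("User-Agent" => "Mozilla/5.0") # custom headers
        .cookies(session: "abc")                # cookies
        .timeout(connect: 3, write: 3, read: 3) # per-phase timeouts
r = h.get("https://example.com")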

#init_client ⇒ Object

Initializes HTTP parameters; an empty hook for including classes to override.



# File 'lib/http_crawler/client.rb', line 154

def init_client

end

#init_header ⇒ Object



# File 'lib/http_crawler/client.rb', line 62

def init_header
  nil
end

#init_ssl ⇒ Object

Initializes the SSL settings.



# File 'lib/http_crawler/client.rb', line 49

def init_ssl
  if (@uri.scheme == "https")
    # SSL context (certificate verification disabled)
    @ctx = OpenSSL::SSL::SSLContext.new
    @ctx.verify_mode = OpenSSL::SSL::VERIFY_NONE
  end
end

#init_timeout ⇒ Object

Initializes the timeouts.



# File 'lib/http_crawler/client.rb', line 42

def init_timeout
  @connect_time = 3
  @write_time = 3
  @read_time = 3
end
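
Including classes can override this hook; a hypothetical override for slower upstream sites:

def init_timeout
  @connect_time = 10 # seconds to establish the connection
  @write_time = 10   # seconds to send the request
  @read_time = 30    # seconds to read the response
end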

#init_uri ⇒ Object

Raises an error if @uri has not been initialized by init_uri.

Including classes must implement it and set @uri = URI("http://host").


# File 'lib/http_crawler/client.rb', line 37

def init_uri
  @uri = nil
end
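
A minimal including class (a sketch; the module name and host are hypothetical, and init_header is optional):

require 'http_crawler'

module HttpCrawler
  module Web
    module Example
      class Client
        include HttpCrawler::Client

        # Required: #initialize raises unless @uri is set here.
        def init_uri
          @uri = URI("http://www.example.com/")
        end

        # Optional: default request headers.
        def init_header
          { "User-Agent" => "Mozilla/5.0" }
        end
      end
    end
  end
end

client = HttpCrawler::Client.for("example")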

#initialize ⇒ Object

Raises an error if @uri has not been initialized by init_uri.

Including classes must redefine init_uri.


# File 'lib/http_crawler/client.rb', line 182

def initialize
  # Initialize the uri
  raise "Client uri为空" unless init_uri

  # Initialize the timeouts
  init_timeout

  # Initialize the SSL settings
  init_ssl

  # Initialize client-specific parameters
  init_client

  # Initialize the proxy parameters
  @proxy_params = {key: "#{self.class}"}
end

#post(path, params = {}) ⇒ Object

Sends a POST request.



# File 'lib/http_crawler/client.rb', line 205

def post(path, params = {})
  request {http.post((@uri + path).to_s, :form => params, :ssl_context => @ctx)}
end
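
A usage sketch (path and form fields are illustrative): the params are sent as form data.

r = client.post("/login", { username: "user", password: "secret" })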

#proxy_api ⇒ Object

Name of the proxy API method to use.



# File 'lib/http_crawler/client.rb', line 84

def proxy_api
  @proxy_api ||= "my"
end

#proxy_params ⇒ Object

Parameters passed when calling the proxy API.



# File 'lib/http_crawler/client.rb', line 89

def proxy_params
  @proxy_params ||= {"key": "default"}
end

#update_header(parameter = {}) ⇒ Object



# File 'lib/http_crawler/client.rb', line 66

def update_header(parameter = {})
  nil
end

#update_proxy(proxy = {}) ⇒ Object



# File 'lib/http_crawler/client.rb', line 93

def update_proxy(proxy = {})
  if (proxy.blank?)
    @proxy = get_proxy
  else
    @proxy = proxy
  end
  # @http.update_proxy(proxy)
end

#update_proxy?(proxy_ip = {}) ⇒ Boolean

If automatic proxy updating is enabled, updates the proxy and returns true; otherwise returns false.

Returns:

  • (Boolean)


# File 'lib/http_crawler/client.rb', line 104

def update_proxy?(proxy_ip = {})
  if @auto_proxy
    update_proxy(proxy_ip)
    return true
  else
    return false
  end
end

#validation_to_proxy?(r = response) ⇒ Boolean

Switches the proxy if a captcha page appears.

Returns:

  • (Boolean)


# File 'lib/http_crawler/client.rb', line 214

def validation_to_proxy?(r = response)
  # Check whether a captcha page was returned
  if r.validation_page?
    # A captcha triggers a proxy switch
    self.update_proxy?
    # Handled successfully
    return true
  else
    return false
  end
end
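
A hedged usage sketch (assumes auto_proxy is enabled and that the response object implements validation_page?, as the method above expects): re-issue the request through a fresh proxy when a captcha page is detected.

r = client.get("/")
r = client.get("/") if client.validation_to_proxy?(r)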