Module: HttpCrawler::Client

Included in:
Proxy::TestProxyApi::Client, Web::Baidu::Client
Defined in:
lib/http_crawler/client.rb

Instance Attribute Summary

Class Method Summary

Instance Method Summary

Instance Attribute Details

#uri ⇒ Object (readonly)

Returns the value of attribute uri.



# File 'lib/http_crawler/client.rb', line 26

def uri
  @uri
end

Class Method Details

.for(web_name, *args) ⇒ Object

Accepts a web name in the form web_name = "biquge_duquanben" and returns an HttpCrawler::Web::BiqugeDuquanben::Client instance.



# File 'lib/http_crawler/client.rb', line 12

def for(web_name, *args)
  "HttpCrawler::Web::#{web_name.camelize}::Client".constantize.new(*args)
end
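
A minimal usage sketch based on the description above; the "biquge_duquanben" site and its client class come from that example and are assumed to be defined by the gem:

# The underscored name is camelized and resolved to the matching Client class.
client = HttpCrawler::Client.for("biquge_duquanben")
client.class # => HttpCrawler::Web::BiqugeDuquanben::Client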

.for_module(module_name, *args) ⇒ Object

Accepts a module name in the form module_name = "HttpCrawler::Web::BiqugeDuquanben" and returns an HttpCrawler::Web::BiqugeDuquanben::Client instance.



# File 'lib/http_crawler/client.rb', line 21

def for_module(module_name, *args)
  "#{module_name}::Client".constantize.new(*args)
end
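
A matching sketch for the module form, under the same assumption that the BiqugeDuquanben client exists:

# The module name is used as-is; only "::Client" is appended before constantize.
client = HttpCrawler::Client.for_module("HttpCrawler::Web::BiqugeDuquanben")
client.class # => HttpCrawler::Web::BiqugeDuquanben::Client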

Instance Method Details

#add_error_url(url_string) ⇒ Object

Adds an error URL pattern; every entry stored here is a regular expression (kept as a string) matching URLs that are treated as abnormal addresses.



# File 'lib/http_crawler/client.rb', line 142

def add_error_url(url_string)
  @http.error_urls << url_string
end
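
A hedged usage sketch; the pattern string is an assumption and only illustrates that each entry is a regular expression matching abnormal URLs:

# Treat any URL under /error/ on this hypothetical host as an abnormal address.
client.add_error_url('https://www\.example\.com/error/.*')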

#auto_proxy=(value) ⇒ Object

Proxy setting: enables automatic proxy updating.



# File 'lib/http_crawler/client.rb', line 71

def auto_proxy=(value)
  Rails.logger.debug "自动更新代理"
  @auto_proxy = value
  update_proxy if (value == true && @proxy.blank?)
end
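
A brief usage sketch, assuming no proxy has been set yet, so enabling the flag triggers an immediate update_proxy call:

client.auto_proxy = true  # fetches a proxy right away because @proxy is blank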

#cookies ⇒ Object

Returns the cookies hash (lazily initialized).



# File 'lib/http_crawler/client.rb', line 65

def cookies
  @cookies ||= {}
end

#get(path, params = {}) ⇒ Object

Sends a GET request.



# File 'lib/http_crawler/client.rb', line 173

def get(path, params = {})
  http.get((@uri + path).to_s, :params => params, :ssl_context => @ctx)
end
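
A usage sketch; the path and query parameter are assumptions, and the return value is the response object of the underlying http.rb request:

response = client.get("/search", wd: "ruby")
response.code  # => HTTP status code, e.g. 200
response.to_s  # => response body as a String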

#get_proxy ⇒ Object

Gets a proxy, either by calling the proxy API or via a custom proxy setting.



# File 'lib/http_crawler/client.rb', line 110

def get_proxy
  proxy_ip = nil
  begin
    Rails.logger.debug("开始获取代理IP")
    proxy_client = HttpCrawler::Proxy.for(proxy_api)
    proxy_r = proxy_client.get_proxy(proxy_params)
    proxy_ip = proxy_r.results unless proxy_r.results.blank?
    if proxy_ip.blank?
      Rails.logger.warn "无最新代理等待5秒后重新获取"
    else
      break
    end
    sleep(5)
  end while true

  Rails.logger.debug("当前IP => #{@proxy},获取最新代理 => #{proxy_ip}")

  unless proxy_ip && proxy_ip["p_addr"] && proxy_ip["p_port"]
    Rails.logger.warn "无最新代理等待5秒后重新获取"
    sleep(5)
    proxy_ip = get_proxy
  end

  if (@proxy && proxy_ip && @proxy["p_addr"] == proxy_ip["p_addr"] && @proxy["p_port"] == proxy_ip["p_port"])
    Rails.logger.warn "无最新代理等待5秒后重新获取"
    sleep(5)
    proxy_ip = get_proxy
  end
  proxy_ip
end

#header ⇒ Object

Returns the request headers (lazily built via init_header).



# File 'lib/http_crawler/client.rb', line 52

def header
  @header ||= init_header
end

#http ⇒ Object

Initializes the HTTP request preconditions (redirects, proxy, headers, cookies, timeouts).



# File 'lib/http_crawler/client.rb', line 153

def http
  # Follow redirects automatically, up to max_hops: 5
  h = HTTP.follow(max_hops: 5)

  # Apply the proxy, if one is set
  h = h.via(@proxy["p_addr"], @proxy["p_port"].to_i, @proxy["p_user"], @proxy["p_pass"]) unless (@proxy.blank?)

  # Apply headers
  h = h.headers(header) if header

  # Apply cookies
  h = h.cookies(cookies) if cookies

  # Apply timeouts
  h = h.timeout(connect: @connect_time, write: @write_time, read: @read_time)

  h
end
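
A short sketch, assuming the returned object is the chainable client of the underlying http.rb gem, so the configured proxy, headers, cookies, and timeouts apply to any request made through it:

# One-off request that reuses the client's configured options.
client.http.get("https://www.example.com/robots.txt")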

#init_client ⇒ Object

Initializes client-specific HTTP parameters (empty hook for subclasses).



# File 'lib/http_crawler/client.rb', line 148

def init_client

end

#init_header ⇒ Object



# File 'lib/http_crawler/client.rb', line 56

def init_header
  nil
end
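
Because init_header returns nil by default, no headers are sent unless a subclass overrides it. A hypothetical override sketch, placed inside a concrete subclass like the one sketched under #init_uri below (the header values are assumptions):

# Hypothetical subclass override supplying default request headers.
def init_header
  {
    "User-Agent" => "Mozilla/5.0 (X11; Linux x86_64)",
    "Accept"     => "text/html"
  }
end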

#init_ssl ⇒ Object

Initializes the SSL context.



# File 'lib/http_crawler/client.rb', line 43

def init_ssl
  if (@uri.scheme == "https")
    # SSL context (certificate verification disabled)
    @ctx = OpenSSL::SSL::SSLContext.new
    @ctx.verify_mode = OpenSSL::SSL::VERIFY_NONE
  end
end

#init_timeout ⇒ Object

Initializes the timeouts.



# File 'lib/http_crawler/client.rb', line 36

def init_timeout
  @connect_time = 5
  @write_time = 2
  @read_time = 5
end
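
These defaults (5 s connect, 2 s write, 5 s read) feed the timeout call in #http. A hypothetical override for slower sites, again placed inside a concrete subclass like the one sketched under #init_uri below:

# Hypothetical override with more generous timeouts.
def init_timeout
  @connect_time = 10
  @write_time   = 5
  @read_time    = 10
end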

#init_uri ⇒ Object

init_uri: if @uri is not initialized, an error is raised during initialize (subclasses are expected to override this).



# File 'lib/http_crawler/client.rb', line 31

def init_uri
  @uri = nil
end
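
Because the default implementation leaves @uri nil, every concrete client must override init_uri. A hypothetical subclass sketch (the ExampleSite module and URL are assumptions):

require "uri"

module HttpCrawler
  module Web
    module ExampleSite
      class Client
        include HttpCrawler::Client

        # Point the client at the target site so initialize does not raise.
        def init_uri
          @uri = URI("https://www.example.com/")
        end
      end
    end
  end
end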

#initialize ⇒ Object

Raises an error if init_uri has not set @uri.



# File 'lib/http_crawler/client.rb', line 186

def initialize
  # Initialize the URI
  raise "Client uri is empty" unless init_uri

  # Initialize timeouts
  init_timeout

  # Initialize the SSL context
  init_ssl

  # Initialize client-specific parameters
  init_client

  # Initialize proxy parameters
  @proxy_params = {key: "#{self.class}"}
end

#post(path, params = {}) ⇒ Object

Sends a POST request.



# File 'lib/http_crawler/client.rb', line 178

def post(path, params = {})
  http.post((@uri + path).to_s, :form => params, :ssl_context => @ctx)
end
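
A usage sketch; the path and form field names are assumptions:

response = client.post("/login", username: "user", password: "secret")
response.code  # => HTTP status code of the POST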

#proxy_api ⇒ Object

Name of the proxy API used to fetch proxies.



# File 'lib/http_crawler/client.rb', line 78

def proxy_api
  @proxy_api ||= "my"
end

#proxy_params ⇒ Object

Parameters used when calling the proxy API.



# File 'lib/http_crawler/client.rb', line 83

def proxy_params
  @proxy_params ||= {"key": "default"}
end

#update_header(parameter = {}) ⇒ Object



# File 'lib/http_crawler/client.rb', line 60

def update_header(parameter = {})
  nil
end

#update_proxy(proxy = {}) ⇒ Object



# File 'lib/http_crawler/client.rb', line 87

def update_proxy(proxy = {})
  if (proxy.blank?)
    @proxy = get_proxy
  else
    @proxy = proxy
  end
  # @http.update_proxy(proxy)
end

#update_proxy?(proxy_ip = {}) ⇒ Boolean

If automatic proxy updating is enabled, updates the proxy and returns true; otherwise returns false.

Returns:

  • (Boolean)


# File 'lib/http_crawler/client.rb', line 98

def update_proxy?(proxy_ip = {})
  if @auto_proxy
    update_proxy(proxy_ip)
    return true
  else
    return false
  end
end
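
A hedged sketch of the intended retry pattern; HTTP::Error is the base error class of the underlying http.rb gem, and rescuing it here is an assumption about how callers use this helper:

begin
  response = client.get("/")
rescue HTTP::Error => e
  # Swap in a fresh proxy and retry only when auto_proxy is enabled.
  retry if client.update_proxy?
  raise
end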