Class: HttpCrawler::HTTP

Inherits:
Net::HTTP show all
Defined in:
lib/http_crawler/http.rb

Constant Summary collapse

@@proxy_list =
[]

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(address, port = nil) ⇒ HTTP

Returns a new instance of HTTP.



20
21
22
23
24
25
26
# File 'lib/http_crawler/http.rb', line 20

# Build a crawler-flavoured HTTP client on top of Net::HTTP, seeding the
# retry and proxy bookkeeping state.
def initialize(address, port = nil)
  super(address, port)
  @error_num = 0          # failures recorded for the request in flight
  @max_error_num = 2      # retry budget before the proxy is rotated
  @proxy_key = "default"  # key handed to the proxy pool when fetching an IP
  @error_urls = []        # regex sources marking forbidden redirect targets
end

Instance Attribute Details

#auto_proxyObject

自动获取代理,true 表示自动获取代理 、false 表示不自动获取



5
6
7
# File 'lib/http_crawler/http.rb', line 5

# Whether to acquire proxies automatically: true enables automatic proxy
# acquisition, false disables it.
def auto_proxy
  @auto_proxy
end

#error_urlsObject

错误的url地址,存的是正则



18
19
20
# File 'lib/http_crawler/http.rb', line 18

# Regex sources of known-bad URLs; a redirect whose location matches any
# entry is treated as an error (see #get_fetch / #post_fetch).
def error_urls
  @error_urls
end

#max_error_numObject

请求错误后的重复最大请求次数



16
17
18
# File 'lib/http_crawler/http.rb', line 16

# Maximum number of repeated attempts after a request error.
def max_error_num
  @max_error_num
end

#proxy_apiObject

代理API的别名 主要关联 HttpCrawler::Proxy中维护的代理API



7
8
9
# File 'lib/http_crawler/http.rb', line 7

# Alias of the proxy API; keys into the proxy APIs maintained by
# HttpCrawler::Proxy.
def proxy_api
  @proxy_api
end

#proxy_keyObject

调用自己的代理池所需要的主键 key



14
15
16
# File 'lib/http_crawler/http.rb', line 14

# Primary key used when querying our own proxy pool.
def proxy_key
  @proxy_key
end

Instance Method Details

#get_fetch(uri_or_path, initheader = nil, dest = nil, limit = 10, &block) ⇒ Object

重定向请求

Raises:

  • (ArgumentError)


126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
# File 'lib/http_crawler/http.rb', line 126

# GET with redirect handling.
#
# Follows redirects, retries 5XX responses, and on a 407 rotates the proxy
# (when auto_proxy is enabled) before retrying. Each hop/retry consumes one
# unit of +limit+. Redirect targets matching any regex in @error_urls raise.
#
# @param uri_or_path [URI, String] request target; non-ASCII strings are URI-encoded
# @param initheader [Hash, nil] extra request headers
# @param dest [Object, nil] passed through to Net::HTTP#get
# @param limit [Integer] remaining redirect/retry budget
# @return [Net::HTTPResponse] the final successful response
# @raise [ArgumentError] when the budget is exhausted
def get_fetch(uri_or_path, initheader = nil, dest = nil, limit = 10, &block)
  # Guard with <= so a caller-supplied negative budget also stops here
  # instead of recursing forever (previously checked == 0 only).
  raise ArgumentError, 'too many HTTP repeated' if limit <= 0
  # NOTE(review): URI.encode was deprecated and removed in Ruby 3.0 —
  # confirm the supported Ruby version or migrate to URI::DEFAULT_PARSER.escape.
  uri_or_path = URI.encode(uri_or_path) if String === uri_or_path && CharDet.detect(uri_or_path)["encoding"] != "ascii"

  response = get(uri_or_path, initheader, dest, &block)
  case response
  when Net::HTTPSuccess then
    response
  when Net::HTTPRedirection then
    location = response['location']
    Rails.logger.warn "redirected to #{location}"
    # @error_urls holds regex sources; a matching redirect target is an
    # error page, not a real destination.
    @error_urls.each do |url_string|
      if location =~ /#{url_string}/
        raise "跳转到异常url => #{location}"
      end
    end
    # Follow the redirect, consuming one unit of budget.
    get_fetch(location, initheader, dest, limit - 1, &block)
  when Net::HTTPServerError then
    Rails.logger.warn "Net::HTTPServerError  5XX to #{address}"
    server_error_sleep
    # Retry the same request.
    get_fetch(uri_or_path, initheader, dest, limit - 1, &block)
  when Net::HTTPProxyAuthenticationRequired then
    Rails.logger.warn "Net::HTTPProxyAuthenticationRequired 407 to proxy:[#{@proxy_address}:#{@proxy_port}]  =>#{address}"
    if update_proxy?
      server_error_sleep
      # Retry through the freshly rotated proxy.
      get_fetch(uri_or_path, initheader, dest, limit - 1, &block)
    else
      response.error!
    end
  else
    server_error_sleep
    response.error!
  end
end

#get_proxyObject

通过调用 api 获取代理或者通过自定义设置代理



60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# File 'lib/http_crawler/http.rb', line 60

# Obtain a proxy either by calling the proxy API (HttpCrawler::Proxy) or
# via a custom setting. Polls until the API yields a proxy, then re-fetches
# (recursively) when the result is malformed or identical to the proxy
# already in use.
#
# @return [Hash] proxy spec with "p_addr"/"p_port" (and optional credentials)
def get_proxy

  # Historical pool-based implementation, kept for reference:
  # while @@proxy_list.blank?
  #   Rails.logger.debug("@@proxy_list 为空进行更新")
  #   proxy_client = HttpCrawler::Proxy.for(proxy_api)
  #   proxy_r = proxy_client.get_proxy(key: proxy_key)
  #   @@proxy_list << proxy_r.parsing unless proxy_r.parsing.blank?
  #   Rails.logger.debug("@@proxy_list => #{@@proxy_list}")
  #   sleep(1)
  # end
  # p = @@proxy_list.delete_at(0)

  proxy_ip = nil
  # do-while: poll the proxy API every 5s until a non-blank proxy arrives.
  # NOTE(review): begin/end-while-true with break is a known Ruby trap;
  # a plain `loop do ... end` would be clearer.
  begin
    Rails.logger.debug("开始获取代理IP")
    proxy_client = HttpCrawler::Proxy.for(proxy_api)
    proxy_r = proxy_client.get_proxy(key: proxy_key)
    proxy_ip = proxy_r.parsing unless proxy_r.parsing.blank?
    if proxy_ip.blank?
      Rails.logger.warn "无最新代理等待5秒后重新获取"
    else
      break
    end
    sleep(5)
  end while true

  Rails.logger.debug("当前IP => #{@proxy_address}:#{@proxy_port},获取最新代理 => #{proxy_ip}")

  # Malformed result (missing address/port): wait and recurse for another.
  # NOTE(review): unbounded recursion if the API keeps misbehaving — confirm
  # this is acceptable for the crawler's runtime.
  unless proxy_ip && proxy_ip["p_addr"] && proxy_ip["p_port"]
    Rails.logger.warn "无最新代理等待5秒后重新获取"
    sleep(5)
    proxy_ip = get_proxy
  end

  # Same proxy as the one currently set: wait and recurse for a fresh one.
  if (@proxy_address == proxy_ip["p_addr"] && @proxy_port == proxy_ip["p_port"])
    Rails.logger.warn "无最新代理等待5秒后重新获取"
    sleep(5)
    proxy_ip = get_proxy
  end
  proxy_ip
end

#http_error_sleepObject



28
29
30
# File 'lib/http_crawler/http.rb', line 28

# Pause before retrying after a transport-level error.
#
# @param duration [Numeric] seconds to sleep; defaults to 0.5 (the
#   original hard-coded delay), so existing callers are unaffected
# @return [Integer] seconds slept (per Kernel#sleep)
def http_error_sleep(duration = 0.5)
  sleep(duration)
end

#post_fetch(uri_or_path, data, initheader = nil, dest = nil, &block) ⇒ Object

重定向请求



167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
# File 'lib/http_crawler/http.rb', line 167

# POST with redirect handling.
#
# Mirrors #get_fetch: follows redirects (re-issued as GETs), retries 5XX
# responses, and on a 407 rotates the proxy (when auto_proxy is enabled)
# before retrying. Redirect targets matching any regex in @error_urls raise.
#
# @param uri_or_path [URI, String] request target; non-ASCII strings are URI-encoded
# @param data [String] request body
# @param initheader [Hash, nil] extra request headers
# @param dest [Object, nil] passed through to Net::HTTP#post
# @return [Net::HTTPResponse] the final successful response
def post_fetch(uri_or_path, data, initheader = nil, dest = nil, &block)
  # Encode only when the target is a String that is not plain ASCII.
  # NOTE(review): URI.encode was removed in Ruby 3.0 — confirm target Ruby version.
  uri_or_path = URI.encode(uri_or_path) if String === uri_or_path && CharDet.detect(uri_or_path)["encoding"] != "ascii"
  # Rails.logger.debug "post_fetch => #{uri_or_path}"
  response = post(uri_or_path, data, initheader, dest, &block)
  case response
  when Net::HTTPSuccess then
    response
  when Net::HTTPRedirection then
    location = response['location']
    Rails.logger.warn "redirected to #{location}"
    @error_urls.each do |url_string|
      if location =~ /#{url_string}/
        raise "跳转到异常url => #{location}"
      end
    end
    # Follow the redirect as a GET.
    get_fetch(location, initheader, dest, 9, &block)
  when Net::HTTPServerError then
    Rails.logger.warn "Net::HTTPServerError 5XX to #{address}"
    server_error_sleep
    # BUG FIX: the retry previously omitted +data+, shifting initheader into
    # the body position. Re-send the original payload.
    post_fetch(uri_or_path, data, initheader, dest, &block)
  when Net::HTTPProxyAuthenticationRequired then
    Rails.logger.warn "Net::HTTPProxyAuthenticationRequired 407 to proxy:[#{@proxy_address}:#{@proxy_port}]  =>#{address}"
    if update_proxy?
      server_error_sleep
      # BUG FIX: re-send with the original payload (data was dropped before).
      post_fetch(uri_or_path, data, initheader, dest, &block)
    else
      response.error!
    end
  else
    server_error_sleep
    response.error!
  end
end

#proxy(p = {}) ⇒ Object

为 @http 重设代理



39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# File 'lib/http_crawler/http.rb', line 39

# Re-point this connection at a new forward proxy.
#
# @param p [Hash] proxy spec: "p_addr" and "p_port" are required;
#   "p_user"/"p_pass" are optional credentials
# @raise [RuntimeError] when the address or port is missing
# @return [void]
def proxy(p = {})

  raise '代理设置 p_addr 不能为空' unless p["p_addr"]
  raise '代理设置 p_port 不能为空' unless p["p_port"]

  Rails.logger.info("切换代理至 => #{p}")
  # Must be false, otherwise Net::HTTP keeps using the environment proxy
  # and ignores the explicit fields below.
  @proxy_from_env = false

  # Initialize the proxy fields. Absent credential keys read as nil on
  # their own; the old `p["p_user"] ||= nil` mutated the caller's hash to
  # no effect, so it was dropped.
  @proxy_address = p["p_addr"]
  @proxy_port = p["p_port"]
  @proxy_user = p["p_user"]
  @proxy_pass = p["p_pass"]

end

#request(req, body = nil, &block) ⇒ Object

重写 发送请求的方法



210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
# File 'lib/http_crawler/http.rb', line 210

# Override of Net::HTTP#request adding debug logging and bounded retry.
#
# A StandardError raised while the session is not yet started is retried up
# to @max_error_num times (with a short sleep); once the budget is spent,
# recognised transient errors trigger a proxy rotation (when auto_proxy is
# on) and one more round of retries. Errors raised after the session
# started are logged and re-raised.
#
# @param req [Net::HTTPRequest]
# @param body [String, nil]
# @return [Net::HTTPResponse]
def request(req, body = nil, &block)
  begin
    Rails.logger.debug("#{req.class} => #{use_ssl? ? "https://" : "http://" }#{address}:#{port}#{req.path}") if started?
    Rails.logger.debug("body => #{body}") if started? && body
    super(req, body, &block)
  rescue => error
    Rails.logger.error "出错了! 错误类型 => #{error.class}"
    if started?
      # started? tells whether the HTTP session is already underway; without
      # this guard the exception would be handled twice.
      Rails.logger.error("#{req.class} => #{use_ssl? ? "https://" : "http://" }#{address}:#{port}#{req.path}")
      Rails.logger.error("body => #{body}") if body
      raise error
    else
      http_error_sleep
      # Bounded retry: spend the error budget first.
      if @error_num < @max_error_num
        @error_num += 1
        retry # transfers control back to the top of the begin block
      else

        # Budget exhausted: classify the error; unknown kinds propagate.
        # NOTE(review): Net::HTTPServerException is the pre-2.6 name of
        # Net::HTTPClientException — confirm the supported Ruby version.
        case error
        when EOFError
          Rails.logger.warn "EOFError!"
        when Timeout::Error
          Rails.logger.warn "请求超时!"
        when Net::HTTPServerException
          Rails.logger.warn "代理失效:[#{proxy_address}:#{proxy_port}]"
        when Errno::ECONNREFUSED
          Rails.logger.warn "Errno::ECONNREFUSED"
        else
          raise error
        end

        # Rotate the proxy (when auto_proxy allows) and start a fresh budget.
        if update_proxy?
          @error_num = 0
          retry # transfers control back to the top of the begin block
        else
          raise error
        end
      end
    end
  end # begin
end

#server_error_sleepObject



32
33
34
# File 'lib/http_crawler/http.rb', line 32

# Pause before retrying after a server-side (5XX/407) response.
#
# @param duration [Numeric] seconds to sleep; defaults to 3 (the original
#   hard-coded delay), so existing callers are unaffected
# @return [Integer] seconds slept (per Kernel#sleep)
def server_error_sleep(duration = 3)
  sleep(duration)
end

#update_proxy(proxy_ip = {}) ⇒ Object



102
103
104
105
106
107
108
# File 'lib/http_crawler/http.rb', line 102

# Switch to the supplied proxy, or pull a fresh one from the pool when no
# spec (or a blank one) is given.
#
# @param proxy_ip [Hash] optional proxy spec ("p_addr"/"p_port"/credentials)
def update_proxy(proxy_ip = {})
  proxy_ip = get_proxy if proxy_ip.blank?
  proxy(proxy_ip)
end

#update_proxy?(proxy_ip = {}) ⇒ Boolean

如果自动更新代理 则更新代理返回 true,否则返回false

Returns:

  • (Boolean)


111
112
113
114
115
116
117
118
119
120
121
122
# File 'lib/http_crawler/http.rb', line 111

# When auto_proxy is enabled, refresh the proxy and report success;
# otherwise leave the proxy untouched.
#
# @param proxy_ip [Hash] optional explicit proxy spec
# @return [Boolean] true when the proxy was refreshed, false otherwise
def update_proxy?(proxy_ip = {})
  return false unless auto_proxy
  # Delegate instead of duplicating update_proxy's blank?-dispatch logic.
  update_proxy(proxy_ip)
  true
end