Class: HttpCrawler::HTTP

Inherits:
Net::HTTP show all
Defined in:
lib/http_crawler/http.rb

Constant Summary collapse

# Class-level list of proxy entries. get_proxy refills it while it is blank
# and removes the entry at index 0 each time a proxy is requested.
@@proxy_list =
[]

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(address, port = nil) ⇒ HTTP

Returns a new instance of HTTP.



16
17
18
19
20
21
# File 'lib/http_crawler/http.rb', line 16

# Builds the crawler connection and seeds its retry/proxy bookkeeping.
#
# @param address [String] host handed on to the parent constructor
# @param port [Integer, nil] port handed on to the parent constructor
def initialize(address, port = nil)
  # Bare `super` forwards address and port unchanged.
  super
  @proxy_key = "default"
  # Failures counted so far by #request, and the ceiling it compares against.
  @error_num = 0
  @max_error_num = 2
end

Instance Attribute Details

#auto_proxy ⇒ Object

自动获取代理,true 表示自动获取代理 、false 表示不自动获取



8
9
10
# File 'lib/http_crawler/http.rb', line 8

# Flag controlling automatic proxy acquisition: true means proxies are
# fetched automatically, false means they are not.
#
# @return [Object] the stored @auto_proxy value (nil when never assigned)
def auto_proxy; @auto_proxy; end

#max_error_num ⇒ Object

请求错误后的重复最大请求次数



14
15
16
# File 'lib/http_crawler/http.rb', line 14

# Maximum number of times a request is repeated after it fails.
#
# @return [Object] the stored @max_error_num value (nil when never assigned)
def max_error_num; @max_error_num; end

#proxy_api ⇒ Object

代理API的别名 主要关联 HttpCrawler::Proxy中维护的代理API



10
11
12
# File 'lib/http_crawler/http.rb', line 10

# Alias of the proxy API to use; it refers to the proxy APIs maintained by
# HttpCrawler::Proxy.
#
# @return [Object] the stored @proxy_api value (nil when never assigned)
def proxy_api; @proxy_api; end

#proxy_key ⇒ Object

调用自己的代理池所需要的主键 key



12
13
14
# File 'lib/http_crawler/http.rb', line 12

# Primary key required when calling one's own proxy pool.
#
# @return [Object] the stored @proxy_key value (nil when never assigned)
def proxy_key; @proxy_key; end

Instance Method Details

#get_fetch(uri_or_path, initheader = nil, dest = nil, limit = 10, &block) ⇒ Object

重定向请求

Raises:

  • (ArgumentError)


111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# File 'lib/http_crawler/http.rb', line 111

# Issues a GET and reacts to the response class:
#   * success     -> the response is returned
#   * redirection -> the Location header is followed
#   * 5xx         -> waits (server_error_sleep) and repeats the request
#   * 407         -> switches proxy via update_proxy? and repeats the request;
#                    when no switch happens the response's error is raised
#   * otherwise   -> waits, then raises the response's error
#
# `limit` is one shared budget for redirects AND retries. Every follow-up
# call spends one unit, so a server that keeps answering 5xx/407 can no
# longer recurse without bound.
#
# @param uri_or_path [String, Object] request target; non-ASCII strings are encoded first
# @param initheader [Hash, nil] request headers, passed through to #get
# @param dest [Object, nil] passed through to #get
# @param limit [Integer] remaining number of attempts (redirects + retries)
# @return [Net::HTTPResponse] the successful response
# @raise [ArgumentError] when the attempt budget is used up
def get_fetch(uri_or_path, initheader = nil, dest = nil, limit = 10, &block)
  # You should choose a better exception.
  raise ArgumentError, 'too many HTTP repeated' if limit <= 0
  # Percent-encode the target when it is a String that is not plain ASCII.
  # NOTE(review): URI.encode was removed in Ruby 3.0 - confirm the supported Ruby versions.
  uri_or_path = URI.encode(uri_or_path) if String === uri_or_path && CharDet.detect(uri_or_path)["encoding"] != "ascii"

  response = get(uri_or_path, initheader, dest, &block)
  case response
  when Net::HTTPSuccess
    response
  when Net::HTTPRedirection
    location = response['location']
    Rails.logger.warn "redirected to #{location}"
    # Follow the redirect target.
    get_fetch(location, initheader, dest, limit - 1, &block)
  when Net::HTTPServerError
    Rails.logger.warn "Net::HTTPServerError  5XX to #{address}"
    server_error_sleep
    # Repeat the same request, spending one attempt from the budget.
    get_fetch(uri_or_path, initheader, dest, limit - 1, &block)
  when Net::HTTPProxyAuthenticationRequired
    Rails.logger.warn "Net::HTTPProxyAuthenticationRequired 407 to proxy:[#{@proxy_address}:#{@proxy_port}]  =>#{address}"
    if update_proxy?
      server_error_sleep
      # Repeat the same request through the new proxy, spending one attempt.
      get_fetch(uri_or_path, initheader, dest, limit - 1, &block)
    else
      response.error!
    end
  else
    server_error_sleep
    response.error!
  end
end

#get_proxy ⇒ Object

通过调用 api 获取代理或者通过自定义设置代理



58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# File 'lib/http_crawler/http.rb', line 58

# Returns the next proxy entry that is complete and differs from the proxy
# currently configured on this connection.
#
# Entries are taken from the front of @@proxy_list. While the list is blank it
# is refilled through HttpCrawler::Proxy.for(proxy_api). An entry that lacks
# "p_addr"/"p_port", or that equals the current proxy, is discarded and the
# search starts over after a 5 second pause.
#
# @return [Hash] entry with at least "p_addr" and "p_port"
def get_proxy
  loop do
    # Refill the shared list until it holds something.
    while @@proxy_list.blank?
      Rails.logger.debug("@@proxy_list 为空进行更新")
      fetched = HttpCrawler::Proxy.for(proxy_api).get_proxy(key: proxy_key)
      @@proxy_list << fetched.parsing
      Rails.logger.debug("@@proxy_list => #{@@proxy_list}")
      sleep(1)
    end

    candidate = @@proxy_list.shift

    Rails.logger.debug("当前IP => #{@proxy_address}:#{@proxy_port},获取最新代理 => #{candidate}")

    if candidate && candidate["p_addr"] && candidate["p_port"]
      same_as_current = @proxy_address == candidate["p_addr"] && @proxy_port == candidate["p_port"]
      return candidate unless same_as_current
    end

    # Either the entry was incomplete or it is the proxy already in use.
    Rails.logger.warn "无最新代理等待5秒后重新获取"
    sleep(5)
  end
end

#http_error_sleep ⇒ Object



23
24
25
# File 'lib/http_crawler/http.rb', line 23

# Pauses before a failed request is retried (used by #request).
#
# @param seconds [Numeric] how long to pause; defaults to the original
#   fixed delay of 0.5 seconds
# @return [Object] whatever `sleep` returns
def http_error_sleep(seconds = 0.5)
  sleep(seconds)
end

#post_fetch(uri_or_path, data, initheader = nil, dest = nil, &block) ⇒ Object

重定向请求



147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
# File 'lib/http_crawler/http.rb', line 147

# Issues a POST and reacts to the response class:
#   * success     -> the response is returned
#   * redirection -> the Location header is followed with a GET (get_fetch)
#   * 5xx         -> waits (server_error_sleep) and repeats the POST
#   * 407         -> switches proxy via update_proxy? and repeats the POST;
#                    when no switch happens the response's error is raised
#   * otherwise   -> waits, then raises the response's error
#
# Retries re-send the original `data`, and `limit` bounds the total number of
# attempts (retries plus the redirect hand-off) so a persistently failing
# server cannot cause unbounded recursion.
#
# @param uri_or_path [String, Object] request target; non-ASCII strings are encoded first
# @param data [Object] request body, passed through to #post
# @param initheader [Hash, nil] request headers, passed through to #post
# @param dest [Object, nil] passed through to #post
# @param limit [Integer] remaining number of attempts
# @return [Net::HTTPResponse] the successful response
# @raise [ArgumentError] when the attempt budget is used up
def post_fetch(uri_or_path, data, initheader = nil, dest = nil, limit = 10, &block)
  raise ArgumentError, 'too many HTTP repeated' if limit <= 0
  # Percent-encode the target when it is a String that is not plain ASCII.
  # NOTE(review): URI.encode was removed in Ruby 3.0 - confirm the supported Ruby versions.
  uri_or_path = URI.encode(uri_or_path) if String === uri_or_path && CharDet.detect(uri_or_path)["encoding"] != "ascii"
  # Rails.logger.debug "post_fetch => #{uri_or_path}"
  response = post(uri_or_path, data, initheader, dest, &block)
  case response
  when Net::HTTPSuccess
    response
  when Net::HTTPRedirection
    location = response['location']
    Rails.logger.warn "redirected to #{location}"
    # Follow the redirect target with a GET.
    get_fetch(location, initheader, dest, limit - 1, &block)
  when Net::HTTPServerError
    Rails.logger.warn "Net::HTTPServerError 5XX to #{address}"
    server_error_sleep
    # Repeat the POST with the same payload, spending one attempt.
    post_fetch(uri_or_path, data, initheader, dest, limit - 1, &block)
  when Net::HTTPProxyAuthenticationRequired
    Rails.logger.warn "Net::HTTPProxyAuthenticationRequired 407 to proxy:[#{@proxy_address}:#{@proxy_port}]  =>#{address}"
    if update_proxy?
      server_error_sleep
      # Repeat the POST through the new proxy with the same payload.
      post_fetch(uri_or_path, data, initheader, dest, limit - 1, &block)
    else
      response.error!
    end
  else
    server_error_sleep
    response.error!
  end
end

#proxy(p = {}) ⇒ Object

为 @http 重设代理



37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# File 'lib/http_crawler/http.rb', line 37

# Points this connection at the proxy described by `p`.
#
# `p` is keyed by strings: "p_addr" and "p_port" are mandatory, "p_user" and
# "p_pass" are optional and are written back into `p` as nil when absent.
#
# @param p [Hash] proxy description
# @return [Object] the value stored as the proxy password
# @raise [RuntimeError] when "p_addr" or "p_port" is missing
def proxy(p = {})
  proxy_host = p["p_addr"]
  raise '代理设置 p_addr 不能为空' unless proxy_host
  proxy_port_value = p["p_port"]
  raise '代理设置 p_port 不能为空' unless proxy_port_value

  # Make sure both credential keys exist in the hash (nil when not supplied).
  %w[p_user p_pass].each { |field| p[field] ||= nil }

  Rails.logger.info("切换代理至 => #{p}")

  # Must be false, otherwise the proxy set below is not used.
  @proxy_from_env = false
  @proxy_address = proxy_host
  @proxy_port = proxy_port_value
  @proxy_user = p["p_user"]
  @proxy_pass = p["p_pass"]
end

#request(req, body = nil, &block) ⇒ Object

重写 发送请求的方法



185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
# File 'lib/http_crawler/http.rb', line 185

# Overrides the request-sending method to add logging, bounded retries and
# proxy rotation on failure.
#
# Failures are only handled when the connection is not started; while it is
# started the error is re-raised untouched so it is not processed twice.
# Up to @max_error_num failures are retried after a short pause. Beyond that:
#   * Net::HTTPFatalError -> re-raised
#   * EOFError            -> retried after update_proxy? switched proxy, otherwise re-raised
#   * Timeout::Error      -> retried with a reset counter after update_proxy?
#                            switched proxy, otherwise re-raised
#   * anything else       -> re-raised
#
# @param req [Net::HTTPRequest] request to send
# @param body [String, nil] optional request body
# @return [Net::HTTPResponse] result of the parent implementation
def request(req, body = nil, &block)
  begin
    Rails.logger.debug("#{req.class} => #{use_ssl? ? "https://" : "http://" }#{address}:#{port}#{req.path}") if started?
    Rails.logger.debug("body => #{body}") if started? && body
    super(req, body, &block)
  rescue => error
    if started?
      # started? tells whether the HTTP session is still open; without this
      # check the same exception would be handled twice.
      raise error
    else
      # Plain retries until the failure ceiling is reached.
      if @error_num < @max_error_num
        @error_num += 1
        http_error_sleep
        retry # jumps back to the start of the begin block
      else
        # Ceiling exceeded: decide by error type.
        case error
        when Net::HTTPFatalError
          raise error
        when EOFError
          Rails.logger.warn "EOFError!"
          # update_proxy? already installs a fresh proxy when it returns true,
          # so no second proxy(get_proxy) call is made here (it would throw
          # away one proxy on every EOF).
          if update_proxy?
            http_error_sleep
            retry # jumps back to the start of the begin block
          else
            raise error
          end
        when Timeout::Error
          Rails.logger.warn "请求超时!"
          if update_proxy?
            @error_num = 0
            http_error_sleep
            retry # jumps back to the start of the begin block
          else
            raise error
          end
        else
          raise error
        end
      end
    end
  end # begin
end

#server_error_sleep ⇒ Object



27
28
29
# File 'lib/http_crawler/http.rb', line 27

# Pauses after an unsuccessful response before the caller retries or raises
# (used by #get_fetch and #post_fetch).
#
# @param seconds [Numeric] how long to pause; defaults to the original
#   fixed delay of 3 seconds
# @return [Object] whatever `sleep` returns
def server_error_sleep(seconds = 3)
  sleep(seconds)
end

#update_proxy(p = {}) ⇒ Object



87
88
89
90
91
92
93
# File 'lib/http_crawler/http.rb', line 87

# Switches the connection to another proxy.
#
# @param p [Hash] proxy description to apply; when blank, one is obtained
#   from get_proxy instead
# @return [Object] whatever #proxy returns
def update_proxy(p = {})
  chosen = p.blank? ? get_proxy : p
  proxy(chosen)
end

#update_proxy?(p = {}) ⇒ Boolean

如果自动更新代理 则更新代理返回 true,否则返回false

Returns:

  • (Boolean)


96
97
98
99
100
101
102
103
104
105
106
107
# File 'lib/http_crawler/http.rb', line 96

# Switches proxy only when automatic proxying is enabled.
#
# @param p [Hash] proxy description to apply; when blank, one is obtained
#   from get_proxy instead
# @return [Boolean] true when auto_proxy is set and the proxy was switched,
#   false when auto_proxy is off (nothing is changed in that case)
def update_proxy?(p = {})
  return false unless auto_proxy

  proxy(p.blank? ? get_proxy : p)
  true
end