Class: HttpCrawler::HTTP

Inherits:
Net::HTTP show all
Defined in:
lib/http_crawler/http.rb

Constant Summary collapse

@@proxy_list =
[]

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(address, port = nil) ⇒ HTTP

Returns a new instance of HTTP.



14
15
16
17
18
19
# File 'lib/http_crawler/http.rb', line 14

# Creates the client and seeds the retry / proxy bookkeeping used by #request
# and #get_proxy.
#
# @param address [String] forwarded unchanged to the parent constructor
# @param port [Integer, nil] forwarded unchanged to the parent constructor
def initialize(address, port = nil)
  # Bare `super` forwards address and port exactly as received.
  super
  # Number of failed attempts seen so far by #request.
  @error_num = 0
  # How many times #request retries before it considers switching proxy.
  @max_error_num = 2
  # Key handed to the proxy API when asking for a proxy.
  @proxy_key = "default"
end

Instance Attribute Details

#auto_proxy ⇒ Object

自动获取代理:true 表示自动获取代理,false 表示不自动获取



6
7
8
# File 'lib/http_crawler/http.rb', line 6

# Reader for the automatic-proxy flag: true means a proxy is fetched
# automatically, false means it is not.
#
# @return [Boolean, nil] the stored flag, or nil when it was never assigned
def auto_proxy
  instance_variable_get(:@auto_proxy)
end

#max_error_num ⇒ Object

请求错误后的重复最大请求次数



12
13
14
# File 'lib/http_crawler/http.rb', line 12

# Reader for the maximum number of times a failed request is repeated.
#
# @return [Integer, nil] the stored limit, or nil when it was never assigned
def max_error_num
  instance_variable_get(:@max_error_num)
end

#proxy_api ⇒ Object

代理API的别名 主要关联 HttpCrawler::Proxy中维护的代理API



8
9
10
# File 'lib/http_crawler/http.rb', line 8

# Reader for the proxy API alias; #get_proxy passes it to
# HttpCrawler::Proxy.for to pick the proxy client.
#
# @return [Object, nil] the stored alias, or nil when it was never assigned
def proxy_api
  instance_variable_get(:@proxy_api)
end

#proxy_key ⇒ Object

调用自己的代理池所需要的主键 key



10
11
12
# File 'lib/http_crawler/http.rb', line 10

# Reader for the primary key used when requesting a proxy from the pool;
# #get_proxy passes it as `key:` to the proxy client.
#
# @return [Object, nil] the stored key, or nil when it was never assigned
def proxy_key
  instance_variable_get(:@proxy_key)
end

Instance Method Details

#get_fetch(uri_or_path, initheader = nil, dest = nil, limit = 10, &block) ⇒ Object

重定向请求

Raises:

  • (ArgumentError)


109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# File 'lib/http_crawler/http.rb', line 109

# Issues a GET, following redirects and retrying on server / proxy-auth
# failures, with every follow-up request counted against one shared budget.
#
# @param uri_or_path [String, Object] request target; a String containing
#   non-ASCII characters (as reported by CharDet) is percent-encoded first
# @param initheader [Hash, nil] request headers, forwarded to #get
# @param dest [Object, nil] forwarded to #get
# @param limit [Integer] how many requests (initial + redirects + retries)
#   may still be made
# @return [Net::HTTPResponse] the first 2xx response obtained
# @raise [ArgumentError] when the request budget is exhausted
# @raise [Exception] whatever `response.error!` raises for other responses
def get_fetch(uri_or_path, initheader = nil, dest = nil, limit = 10, &block)
  # You should choose a better exception.
  # `<=` so that a negative limit also stops instead of recursing forever.
  raise ArgumentError, 'too many HTTP repeated' if limit <= 0

  # Percent-encode string targets that are not plain ASCII.
  # URI::DEFAULT_PARSER.escape is the routine URI.encode delegated to;
  # URI.encode itself no longer exists on Ruby 3.0+.
  if String === uri_or_path && CharDet.detect(uri_or_path)["encoding"] != "ascii"
    uri_or_path = URI::DEFAULT_PARSER.escape(uri_or_path)
  end

  response = get(uri_or_path, initheader, dest, &block)
  case response
  when Net::HTTPSuccess
    response
  when Net::HTTPRedirection
    location = response['location']
    Rails.logger.warn "redirected to #{location}"
    # Follow the redirect using the Location header.
    get_fetch(location, initheader, dest, limit - 1, &block)
  when Net::HTTPServerError
    Rails.logger.warn "Net::HTTPServerError  5XX to #{address}"
    server_error_sleep
    # Retry the same request; pass the reduced budget so a permanently
    # failing server cannot cause unbounded recursion.
    get_fetch(uri_or_path, initheader, dest, limit - 1, &block)
  when Net::HTTPProxyAuthenticationRequired
    Rails.logger.warn "Net::HTTPProxyAuthenticationRequired 407 to proxy:[#{@proxy_address}:#{@proxy_port}]  =>#{address}"
    if update_proxy?
      server_error_sleep
      # Retry through the newly selected proxy, still within the budget.
      get_fetch(uri_or_path, initheader, dest, limit - 1, &block)
    else
      response.error!
    end
  else
    server_error_sleep
    response.error!
  end
end

#get_proxy ⇒ Object

通过调用 api 获取代理或者通过自定义设置代理



56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# File 'lib/http_crawler/http.rb', line 56

# Obtains the next proxy to use, either from the shared @@proxy_list or, when
# that list is empty, by asking the proxy API client for more.
#
# Keeps trying (with a 5 second pause between attempts) until it gets an entry
# that has both "p_addr" and "p_port" and that differs from the proxy
# currently stored in @proxy_address / @proxy_port.
#
# @return [Object] the selected proxy entry (indexed by "p_addr" / "p_port")
def get_proxy
  loop do
    # Refill the shared list through the proxy API while it is empty.
    while @@proxy_list.blank?
      Rails.logger.debug("@@proxy_list 为空进行更新")
      api_result = HttpCrawler::Proxy.for(proxy_api).get_proxy(key: proxy_key)
      @@proxy_list << api_result.parsing
      Rails.logger.debug("@@proxy_list => #{@@proxy_list}")
      sleep(1)
    end

    # Take the oldest entry off the front of the list.
    candidate = @@proxy_list.shift

    Rails.logger.debug("当前IP => #{@proxy_address}:#{@proxy_port},获取最新代理 => #{candidate}")

    complete = candidate && candidate["p_addr"] && candidate["p_port"]
    same_as_current = complete &&
                      @proxy_address == candidate["p_addr"] &&
                      @proxy_port == candidate["p_port"]
    return candidate if complete && !same_as_current

    # Either the entry was unusable or it is the proxy already in use:
    # wait, then go around again.
    Rails.logger.warn "无最新代理等待5秒后重新获取"
    sleep(5)
  end
end

#http_error_sleep ⇒ Object



21
22
23
# File 'lib/http_crawler/http.rb', line 21

# Pauses for half a second. #request calls this after a failed attempt,
# before deciding whether to retry.
#
# @return [Object] whatever `sleep` returns
def http_error_sleep
  pause_seconds = 0.5
  sleep pause_seconds
end

#post_fetch(uri_or_path, data, initheader = nil, dest = nil, &block) ⇒ Object

重定向请求



145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
# File 'lib/http_crawler/http.rb', line 145

# Issues a POST, following a redirect with a GET and retrying on server /
# proxy-auth failures, with every follow-up request counted against one
# shared budget.
#
# @param uri_or_path [String, Object] request target; a String containing
#   non-ASCII characters (as reported by CharDet) is percent-encoded first
# @param data [String] request body, forwarded to #post
# @param initheader [Hash, nil] request headers, forwarded to #post
# @param dest [Object, nil] forwarded to #post
# @param limit [Integer] how many requests (initial + redirects + retries)
#   may still be made
# @return [Net::HTTPResponse] the first 2xx response obtained
# @raise [ArgumentError] when the request budget is exhausted
# @raise [Exception] whatever `response.error!` raises for other responses
def post_fetch(uri_or_path, data, initheader = nil, dest = nil, limit = 10, &block)
  # Bound the retries so a permanently failing server cannot recurse forever.
  raise ArgumentError, 'too many HTTP repeated' if limit <= 0

  # Percent-encode the target when it is a String that is not plain ASCII.
  # URI::DEFAULT_PARSER.escape is the routine URI.encode delegated to;
  # URI.encode itself no longer exists on Ruby 3.0+.
  if String === uri_or_path && CharDet.detect(uri_or_path)["encoding"] != "ascii"
    uri_or_path = URI::DEFAULT_PARSER.escape(uri_or_path)
  end
  # Rails.logger.debug "post_fetch => #{uri_or_path}"
  response = post(uri_or_path, data, initheader, dest, &block)
  case response
  when Net::HTTPSuccess
    response
  when Net::HTTPRedirection
    location = response['location']
    Rails.logger.warn "redirected to #{location}"
    # Follow the redirect with a GET on the Location header.
    get_fetch(location, initheader, dest, limit - 1, &block)
  when Net::HTTPServerError
    Rails.logger.warn "Net::HTTPServerError 5XX to #{address}"
    server_error_sleep
    # Retry the same POST. `data` must be passed along: leaving it out would
    # send `initheader` as the body and shift every later argument.
    post_fetch(uri_or_path, data, initheader, dest, limit - 1, &block)
  when Net::HTTPProxyAuthenticationRequired
    Rails.logger.warn "Net::HTTPProxyAuthenticationRequired 407 to proxy:[#{proxy_address}:#{proxy_port}]  =>#{address}"
    if update_proxy?
      server_error_sleep
      # Retry the same POST through the newly selected proxy.
      post_fetch(uri_or_path, data, initheader, dest, limit - 1, &block)
    else
      response.error!
    end
  else
    server_error_sleep
    response.error!
  end
end

#proxy(p = {}) ⇒ Object

为 @http 重设代理



35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# File 'lib/http_crawler/http.rb', line 35

# Points this client at a new proxy.
#
# The passed-in hash is updated in place so that it always carries the
# "p_user" and "p_pass" keys (nil when absent).
#
# @param p [Hash] proxy settings keyed by "p_addr", "p_port" and optionally
#   "p_user" / "p_pass"
# @return [Object, nil] the value stored as the proxy password
# @raise [RuntimeError] when "p_addr" or "p_port" is missing
def proxy(p = {})
  addr = p["p_addr"]
  raise '代理设置 p_addr 不能为空' unless addr

  port = p["p_port"]
  raise '代理设置 p_port 不能为空' unless port

  # Make sure the optional credential keys exist before logging / reading.
  %w[p_user p_pass].each { |field| p[field] ||= nil }

  Rails.logger.info("切换代理至 => #{p}")

  # Must be false, otherwise the proxy configured here is not used.
  @proxy_from_env = false

  # Store the proxy settings on the instance.
  @proxy_address = addr
  @proxy_port = port
  @proxy_user = p["p_user"]
  @proxy_pass = p["p_pass"]
end

#request(req, body = nil, &block) ⇒ Object

重写 发送请求的方法



183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
# File 'lib/http_crawler/http.rb', line 183

# Overrides the request-sending method to add logging, bounded retries and
# optional proxy switching around the parent implementation.
#
# Behaviour on an error raised by `super`:
# * while the connection is started: log the request and re-raise;
# * otherwise: pause via #http_error_sleep and retry until @error_num reaches
#   @max_error_num; after that, for EOFError, Timeout::Error,
#   Net::HTTPServerException and Errno::ECONNREFUSED, switch proxy through
#   #update_proxy? (resetting the counter and retrying) or re-raise when
#   proxies are not switched automatically. Any other error is re-raised.
#
# NOTE(review): @error_num is only reset after a proxy switch, not after a
# successful request — confirm that carrying the count across requests is
# intended.
#
# @param req [Object] the request object; its class and `path` are logged
# @param body [Object, nil] optional request body, forwarded to `super`
# @return [Object] whatever the parent implementation returns
def request(req, body = nil, &block)
  begin
    Rails.logger.debug("#{req.class} => #{use_ssl? ? "https://" : "http://" }#{address}:#{port}#{req.path}") if started?
    Rails.logger.debug("body => #{body}") if started? && body
    super(req, body, &block)
  rescue => error
    Rails.logger.error "出错了! 错误类型 => #{error.class}"
    if started?
      # started? is checked to tell whether the HTTP request has ended;
      # without this check the exception would be handled twice.
      Rails.logger.error("#{req.class} => #{use_ssl? ? "https://" : "http://" }#{address}:#{port}#{req.path}")
      Rails.logger.error("body => #{body}") if body
      raise error
    else
      http_error_sleep
      # Enforce the maximum number of error retries.
      if @error_num < @max_error_num
        @error_num += 1
        retry # jumps back to the top of the begin block
      else

        # Retry budget exhausted: decide by error type. Only the listed
        # types fall through to the proxy switch below.
        case error
        when EOFError
          Rails.logger.warn "EOFError!"
        when Timeout::Error
          Rails.logger.warn "请求超时!"
        when Net::HTTPServerException
          Rails.logger.warn "代理失效:[#{proxy_address}:#{proxy_port}]"
        when Errno::ECONNREFUSED
          Rails.logger.warn "Errno::ECONNREFUSED"
        else
          raise error
        end

        # With a fresh proxy the retry counter starts over.
        if update_proxy?
          @error_num = 0
          retry # jumps back to the top of the begin block
        else
          raise error
        end
      end
    end
  end # begin
end

#server_error_sleep ⇒ Object



25
26
27
# File 'lib/http_crawler/http.rb', line 25

# Pauses for three seconds. #get_fetch and #post_fetch call this after an
# unsuccessful response, before retrying or raising.
#
# @return [Object] whatever `sleep` returns
def server_error_sleep
  pause_seconds = 3
  sleep pause_seconds
end

#update_proxy(p = {}) ⇒ Object



85
86
87
88
89
90
91
# File 'lib/http_crawler/http.rb', line 85

# Switches the proxy: uses the given settings when present, otherwise obtains
# a proxy through #get_proxy.
#
# @param p [Hash] proxy settings; when blank a proxy is fetched instead
# @return [Object] whatever #proxy returns
def update_proxy(p = {})
  chosen = p.blank? ? get_proxy : p
  proxy(chosen)
end

#update_proxy?(p = {}) ⇒ Boolean

如果自动更新代理 则更新代理返回 true,否则返回false

Returns:

  • (Boolean)


94
95
96
97
98
99
100
101
102
103
104
105
# File 'lib/http_crawler/http.rb', line 94

# Switches the proxy only when automatic proxying is enabled.
#
# @param p [Hash] proxy settings; when blank a proxy is obtained through
#   #get_proxy instead
# @return [Boolean] true when the proxy was switched, false when
#   #auto_proxy is off and nothing was done
def update_proxy?(p = {})
  return false unless auto_proxy

  chosen = p.blank? ? get_proxy : p
  proxy(chosen)
  true
end