Class: HttpCrawler::HTTP

Inherits:

Net::HTTP

Object
Net::HTTP
HttpCrawler::HTTP

show all

Defined in: lib/http_crawler/http.rb

Constant Summary collapse

@@proxy_list =

[]

Instance Attribute Summary collapse

#auto_proxy ⇒ Object

自动获取代理，true 表示自动获取代理、false 表示不自动获取.
#max_error_num ⇒ Object

请求错误后的重复最大请求次数.
#proxy_api ⇒ Object

代理API的别名主要关联 HttpCrawler::Proxy中维护的代理API.
#proxy_key ⇒ Object

调用自己的代理池所需要的主键 key.

Instance Method Summary collapse

#get_fetch(uri_or_path, initheader = nil, dest = nil, limit = 10, &block) ⇒ Object

重定向请求.
#get_proxy ⇒ Object

通过调用 api 获取代理或者通过自定义设置代理.
#http_error_sleep ⇒ Object
#initialize(address, port = nil) ⇒ HTTP constructor

A new instance of HTTP.
#post_fetch(uri_or_path, data, initheader = nil, dest = nil, &block) ⇒ Object

重定向请求.
#proxy(p = {}) ⇒ Object

为 @http 重设代理.
#request(req, body = nil, &block) ⇒ Object

重写发送请求的方法.
#server_error_sleep ⇒ Object
#update_proxy(p = {}) ⇒ Object
#update_proxy?(p = {}) ⇒ Boolean

如果自动更新代理则更新代理返回 true，否则返回false.

Constructor Details

#initialize(address, port = nil) ⇒ `HTTP`

Returns a new instance of HTTP.

# File 'lib/http_crawler/http.rb', line 16

# Creates the client for the given host and seeds its retry/proxy state.
#
# Both arguments are handed to the superclass constructor unchanged.
#
# @param address [Object] forwarded to the superclass constructor
# @param port [Object, nil] forwarded to the superclass constructor
def initialize(address, port = nil)
  super(address, port)
  @proxy_key = "default" # key sent to the proxy API by #get_proxy
  @error_num = 0         # number of failed attempts counted by #request
  @max_error_num = 2     # #request retries while @error_num is below this
end

Instance Attribute Details

#auto_proxy ⇒ `Object`

自动获取代理，true 表示自动获取代理、false 表示不自动获取



8
9
10

# File 'lib/http_crawler/http.rb', line 8

# Whether a proxy is obtained automatically: true means proxies are fetched
# automatically, false means they are not.
#
# @return [Object] the current value of @auto_proxy
def auto_proxy
  @auto_proxy
end

#max_error_num ⇒ `Object`

请求错误后的重复最大请求次数



14
15
16

# File 'lib/http_crawler/http.rb', line 14

# Maximum number of times a request is repeated after a request error.
#
# @return [Object] the current value of @max_error_num
def max_error_num
  @max_error_num
end

#proxy_api ⇒ `Object`

代理API的别名主要关联 HttpCrawler::Proxy中维护的代理API



10
11
12

# File 'lib/http_crawler/http.rb', line 10

# Alias of the proxy API to use; it refers to one of the proxy APIs
# maintained in HttpCrawler::Proxy.
#
# @return [Object] the current value of @proxy_api
def proxy_api
  @proxy_api
end

#proxy_key ⇒ `Object`

调用自己的代理池所需要的主键 key



12
13
14

# File 'lib/http_crawler/http.rb', line 12

# Primary key needed when calling your own proxy pool.
#
# @return [Object] the current value of @proxy_key
def proxy_key
  @proxy_key
end

Instance Method Details

#get_fetch(uri_or_path, initheader = nil, dest = nil, limit = 10, &block) ⇒ `Object`

重定向请求

Raises:

(ArgumentError)

# File 'lib/http_crawler/http.rb', line 111

# Performs a GET request and keeps going until a successful response arrives.
#
# Redirects are followed through the response's "location" header, 5XX
# responses are retried after #server_error_sleep, and a 407 is retried only
# when #update_proxy? returns true. Every follow-up request (redirect or
# retry) uses up one unit of +limit+, so a server that keeps answering
# 5XX/407 ends in ArgumentError instead of being retried without bound.
# Any other response is raised through response.error!.
#
# @param uri_or_path [Object] request target; when it is a String that CharDet
#   does not report as "ascii" it is first passed through URI.encode
# @param initheader [Object, nil] forwarded unchanged to #get
# @param dest [Object, nil] forwarded unchanged to #get
# @param limit [Integer] how many more requests may still be issued
# @return [Net::HTTPSuccess] the successful response
# @raise [ArgumentError] when +limit+ has reached 0
def get_fetch(uri_or_path, initheader = nil, dest = nil, limit = 10, &block)
  # You should choose a better exception.
  raise ArgumentError, 'too many HTTP repeated' if limit == 0

  # Percent-encode the target when it is a non-ASCII String.
  # NOTE(review): URI.encode is obsolete and was removed in Ruby 3.0 - confirm
  # the Ruby version this file targets.
  uri_or_path = URI.encode(uri_or_path) if String === uri_or_path && CharDet.detect(uri_or_path)["encoding"] != "ascii"

  response = get(uri_or_path, initheader, dest, &block)
  case response
  when Net::HTTPSuccess
    response
  when Net::HTTPRedirection
    location = response['location']
    Rails.logger.warn "redirected to #{location}"
    # Follow the redirect target.
    get_fetch(location, initheader, dest, limit - 1, &block)
  when Net::HTTPServerError
    Rails.logger.warn "Net::HTTPServerError  5XX to #{address}"
    server_error_sleep
    # Retry the same request; the budget shrinks so the retry chain is bounded.
    get_fetch(uri_or_path, initheader, dest, limit - 1, &block)
  when Net::HTTPProxyAuthenticationRequired
    Rails.logger.warn "Net::HTTPProxyAuthenticationRequired 407 to proxy:[#{@proxy_address}:#{@proxy_port}]  =>#{address}"
    if update_proxy?
      server_error_sleep
      # Retry through the proxy that #update_proxy? just selected.
      get_fetch(uri_or_path, initheader, dest, limit - 1, &block)
    else
      response.error!
    end
  else
    server_error_sleep
    response.error!
  end
end

#get_proxy ⇒ `Object`

通过调用 api 获取代理或者通过自定义设置代理

# File 'lib/http_crawler/http.rb', line 58

# Returns the next proxy to use, either from the shared @@proxy_list or, when
# that list is blank, from the proxy API selected by #proxy_api.
#
# A candidate is accepted only when it carries both "p_addr" and "p_port" and
# differs from the proxy currently configured on this connection; otherwise
# the method waits 5 seconds and tries again.
#
# @return [Object] the accepted proxy entry (read with "p_addr" / "p_port" keys)
def get_proxy
  loop do
    # Refill the shared list from the proxy API until it is no longer blank.
    while @@proxy_list.blank?
      Rails.logger.debug("@@proxy_list 为空进行更新")
      fetched = HttpCrawler::Proxy.for(proxy_api).get_proxy(key: proxy_key)
      @@proxy_list << fetched.parsing
      Rails.logger.debug("@@proxy_list => #{@@proxy_list}")
      sleep(1)
    end

    candidate = @@proxy_list.shift
    Rails.logger.debug("当前IP => #{@proxy_address}:#{@proxy_port},获取最新代理 => #{candidate}")

    complete = candidate && candidate["p_addr"] && candidate["p_port"]
    # Same address and port as the proxy already in use counts as "no new proxy".
    unchanged = complete && @proxy_address == candidate["p_addr"] && @proxy_port == candidate["p_port"]
    return candidate if complete && !unchanged

    Rails.logger.warn "无最新代理等待5秒后重新获取"
    sleep(5)
  end
end

#http_error_sleep ⇒ `Object`



23
24
25

# File 'lib/http_crawler/http.rb', line 23

# Pauses for 0.5 seconds; #request calls this before retrying a failed request.
def http_error_sleep
  sleep(0.5)
end

#post_fetch(uri_or_path, data, initheader = nil, dest = nil, &block) ⇒ `Object`

重定向请求

# File 'lib/http_crawler/http.rb', line 147

# Performs a POST request and keeps going until a successful response arrives.
#
# A redirect is followed with #get_fetch on the "location" header, 5XX
# responses are retried after #server_error_sleep, and a 407 is retried only
# when #update_proxy? returns true. Retries re-send the same +data+ and use up
# one unit of +limit+, so a persistently failing server ends in ArgumentError
# instead of being retried without bound. Any other response is raised
# through response.error!.
#
# @param uri_or_path [Object] request target; when it is a String that CharDet
#   does not report as "ascii" it is first passed through URI.encode
# @param data [Object] request body, forwarded unchanged to #post
# @param initheader [Object, nil] forwarded unchanged to #post
# @param dest [Object, nil] forwarded unchanged to #post
# @param limit [Integer] how many more requests may still be issued
# @return [Net::HTTPSuccess] the successful response
# @raise [ArgumentError] when +limit+ has reached 0
def post_fetch(uri_or_path, data, initheader = nil, dest = nil, limit = 10, &block)
  raise ArgumentError, 'too many HTTP repeated' if limit == 0

  # Percent-encode the target when it is a non-ASCII String.
  # NOTE(review): URI.encode is obsolete and was removed in Ruby 3.0 - confirm
  # the Ruby version this file targets.
  uri_or_path = URI.encode(uri_or_path) if String === uri_or_path && CharDet.detect(uri_or_path)["encoding"] != "ascii"

  response = post(uri_or_path, data, initheader, dest, &block)
  case response
  when Net::HTTPSuccess
    response
  when Net::HTTPRedirection
    location = response['location']
    Rails.logger.warn "redirected to #{location}"
    # Follow the redirect target with a GET.
    get_fetch(location, initheader, dest, limit - 1, &block)
  when Net::HTTPServerError
    Rails.logger.warn "Net::HTTPServerError 5XX to #{address}"
    server_error_sleep
    # Retry with the original body; dropping +data+ here would shift every
    # following argument by one position.
    post_fetch(uri_or_path, data, initheader, dest, limit - 1, &block)
  when Net::HTTPProxyAuthenticationRequired
    Rails.logger.warn "Net::HTTPProxyAuthenticationRequired 407 to proxy:[#{@proxy_address}:#{@proxy_port}]  =>#{address}"
    if update_proxy?
      server_error_sleep
      # Retry through the proxy that #update_proxy? just selected.
      post_fetch(uri_or_path, data, initheader, dest, limit - 1, &block)
    else
      response.error!
    end
  else
    server_error_sleep
    response.error!
  end
end

#proxy(p = {}) ⇒ `Object`

为 @http 重设代理

# File 'lib/http_crawler/http.rb', line 37

# Points this connection at the proxy described by +p+.
#
# @param p [Hash] proxy settings read through the keys "p_addr", "p_port",
#   "p_user" and "p_pass"; the two credential keys are written back as nil
#   when they are missing or falsy
# @return [Object] the value stored in @proxy_pass
# @raise [RuntimeError] when "p_addr" or "p_port" is missing
def proxy(p = {})
  raise '代理设置 p_addr 不能为空' unless p["p_addr"]
  raise '代理设置 p_port 不能为空' unless p["p_port"]

  # Make sure both credential keys are present (as nil) before logging.
  %w[p_user p_pass].each { |key| p[key] ||= nil }
  Rails.logger.info("切换代理至 => #{p}")

  # Must be false, otherwise the proxy would not be enabled.
  @proxy_from_env = false

  # Store the proxy endpoint and credentials on the connection.
  @proxy_address, @proxy_port, @proxy_user = p["p_addr"], p["p_port"], p["p_user"]
  @proxy_pass = p["p_pass"]
end

#request(req, body = nil, &block) ⇒ `Object`

重写发送请求的方法

# File 'lib/http_crawler/http.rb', line 185

# Sends +req+ through the superclass implementation, retrying failed attempts
# and, when automatic proxying is enabled, switching proxy.
#
# Errors raised while the connection is started are re-raised untouched; the
# started? check exists so that one failure is not handled twice. Otherwise:
# * while @error_num is below @max_error_num the request is simply retried
#   after #http_error_sleep;
# * after that, EOFError and Timeout::Error are retried only when
#   #update_proxy? returns true (it performs the proxy switch itself);
#   every other error is re-raised.
#
# @param req [Object] request object forwarded to the superclass
# @param body [Object, nil] request body forwarded to the superclass
# @return [Object] whatever the superclass #request returns
# @raise [StandardError] the original error once retries are not applicable
def request(req, body = nil, &block)
  begin
    Rails.logger.debug("#{req.class} => #{use_ssl? ? "https://" : "http://" }#{address}:#{port}#{req.path}") if started?
    Rails.logger.debug("body => #{body}") if started? && body
    super(req, body, &block)
  rescue => error
    if started?
      # started? tells whether the HTTP session is still open; without this
      # check the same exception would be handled twice.
      raise error
    else
      # Plain retries until the error budget is used up.
      if @error_num < @max_error_num
        @error_num += 1
        http_error_sleep
        retry # jumps back to the top of the begin block
      else
        # Budget exhausted: decide by error type.
        case error
        when Net::HTTPFatalError
          raise error
        when EOFError
          Rails.logger.warn "EOFError!"
          # update_proxy? already switches to a fresh proxy when it returns
          # true, so no second proxy is fetched here.
          # NOTE(review): unlike the timeout branch, @error_num is not reset
          # here - confirm whether that difference is intended.
          if update_proxy?
            http_error_sleep
            retry # jumps back to the top of the begin block
          else
            raise error
          end
        when Timeout::Error
          Rails.logger.warn "请求超时!"
          if update_proxy?
            # New proxy selected: grant it a fresh retry budget.
            @error_num = 0
            http_error_sleep
            retry # jumps back to the top of the begin block
          else
            raise error
          end
        else
          raise error
        end
      end
    end
  end # begin
end

#server_error_sleep ⇒ `Object`



27
28
29

# File 'lib/http_crawler/http.rb', line 27

# Pauses for 3 seconds; #get_fetch and #post_fetch call this before retrying
# or raising on a non-successful response.
def server_error_sleep
  sleep(3)
end

#update_proxy(p = {}) ⇒ `Object`

# File 'lib/http_crawler/http.rb', line 87

# Switches the connection to another proxy.
#
# @param p [Object] proxy settings to apply; when blank, a proxy is obtained
#   from #get_proxy instead
# @return [Object] whatever #proxy returns
def update_proxy(p = {})
  chosen = p.blank? ? get_proxy : p
  proxy(chosen)
end

#update_proxy?(p = {}) ⇒ `Boolean`

如果自动更新代理则更新代理返回 true，否则返回false

Returns:

(Boolean)

# File 'lib/http_crawler/http.rb', line 96

# Switches proxy when automatic proxying is enabled.
#
# @param p [Object] proxy settings to apply; when blank, a proxy is obtained
#   from #get_proxy instead
# @return [Boolean] true when #auto_proxy is truthy and the proxy was
#   switched, false when nothing was done
def update_proxy?(p = {})
  return false unless auto_proxy

  replacement = p.blank? ? get_proxy : p
  proxy(replacement)
  true
end

Class: HttpCrawler::HTTP

Constant Summary collapse

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(address, port = nil) ⇒ HTTP

Instance Attribute Details

#auto_proxy ⇒ Object

#max_error_num ⇒ Object

#proxy_api ⇒ Object

#proxy_key ⇒ Object

Instance Method Details

#get_fetch(uri_or_path, initheader = nil, dest = nil, limit = 10, &block) ⇒ Object

#get_proxy ⇒ Object

#http_error_sleep ⇒ Object

#post_fetch(uri_or_path, data, initheader = nil, dest = nil, &block) ⇒ Object

#proxy(p = {}) ⇒ Object

#request(req, body = nil, &block) ⇒ Object

#server_error_sleep ⇒ Object

#update_proxy(p = {}) ⇒ Object

#update_proxy?(p = {}) ⇒ Boolean