Class: HttpCrawler::Client

Inherits:

Object

Object
HttpCrawler::Client

show all

Defined in: lib/http_crawler/client.rb

Direct Known Subclasses

Proxy::Client, Web::Client

Instance Attribute Summary collapse

#cookies(parameter = {}) ⇒ Object

cookies相关方法.
#error_urls ⇒ Object

Returns the value of attribute error_urls.
#header(parameter = {}) ⇒ Object

头文件相关方法.
#max_error_num ⇒ Object

最大错误重试次数.
#response ⇒ Object readonly

请求的响应.
#uri ⇒ Object readonly

Returns the value of attribute uri.

Class Method Summary collapse

.for(web_name) ⇒ Object

接收格式 web_name = “biquge_duquanben” 返回 HttpCrawler::Web::BiqugeDuquanben::Client 实例.
.for_module(module_name, *args) ⇒ Object

接收格式 module_name = “HttpCrawler::Web::BiqugeDuquanben” 返回 HttpCrawler::Web::BiqugeDuquanben::Client 实例.
.for_uri(path) ⇒ Object

Instance Method Summary collapse

#add_error_url(url_string) ⇒ Object

添加错误的url地址，表示这里面的url都是异常地址，存的是正则.
#auto_proxy=(value) ⇒ Object

代理设置.
#get(path, params = {}) ⇒ Object

发送 get 请求.
#get_proxy ⇒ Object

获取proxy 通过调用 api 获取代理或者通过自定义设置代理.
#get_uri ⇒ Object

直接发送uri的get请求.
#http ⇒ Object

初始化http请求前置条件.
#init_client ⇒ Object

初始化init_client参数.
#init_cookies(parameter = {}) ⇒ Object
#init_header(parameter = {}) ⇒ Object
#init_ssl ⇒ Object

初始化 ssl 协议.
#init_timeout ⇒ Object

初始化超时时间.
#init_uri ⇒ Object

init_uri 如果未初始化@uri,则会报错继承类需要实现 @uri = URI(“host”).
#initialize(parameter = {}) ⇒ Client constructor

init_uri 如果未初始化@uri,则会报错继承类需要重定义 init_uri.
#post(path, params = {}) ⇒ Object

发送 post 请求.
#proxy_api ⇒ Object

代理使用的api方法名.
#proxy_params ⇒ Object

调用代理 api使用的参数.
#str_to_cookies(str) ⇒ Object

字符串转换成cookies “abc=123; cd=412” => { “abc”: “123”, “cd”: “412”}.
#update_cookies(parameter = {}) ⇒ Object
#update_header(parameter = {}) ⇒ Object
#update_proxy(proxy = {}) ⇒ Object
#update_proxy?(proxy_ip = {}) ⇒ Boolean

如果自动更新代理则更新代理返回 true，否则返回false.
#update_uri(uri_or_path) ⇒ Object

更新uri.
#validation_to_proxy?(r = response) ⇒ Boolean

如果出现验证码,切换代理.

Constructor Details

#initialize(parameter = {}) ⇒ `Client`

init_uri 如果未初始化@uri,则会报错

继承类需要重定义 init_uri

# File 'lib/http_crawler/client.rb', line 242

# Builds a client: resolves the base URI first, then sets up timeouts, the
# SSL context and client-specific options, and finally the proxy parameters.
#
# @param parameter [Hash] options; only +:uri+ is read in this method
# @raise [RuntimeError] when +:uri+ is given although init_uri already set a URI
def initialize(parameter = {})
  # Let the class establish its own base URI (if it defines one).
  init_uri

  # A caller-supplied URI is only accepted when no URI has been set yet.
  custom_uri = parameter[:uri]
  if custom_uri
    raise "Client uri为重复初始化" if uri
    update_uri(custom_uri)
  end

  init_timeout # timeout defaults
  init_ssl     # SSL context
  init_client  # hook for client-specific settings

  # Parameters handed to the proxy API, keyed by the concrete class name.
  @proxy_params = {key: "#{self.class}"}
end

Instance Attribute Details

#cookies(parameter = {}) ⇒ `Object`

cookies相关方法



104
105
106

# File 'lib/http_crawler/client.rb', line 104

# Returns the current value of @cookies.
#
# NOTE(review): the summary lists this as cookies(parameter = {}); the body
# shown here takes no argument — confirm against the real source.
def cookies
  @cookies
end

#error_urls ⇒ `Object`

Returns the value of attribute error_urls.



195
196
197

# File 'lib/http_crawler/client.rb', line 195

# Returns the current value of @error_urls (the collection add_error_url
# appends to).
def error_urls
  @error_urls
end

#header(parameter = {}) ⇒ `Object`

头文件相关方法



83
84
85

# File 'lib/http_crawler/client.rb', line 83

# Returns the current value of @header (the request headers used by #http).
#
# NOTE(review): the summary lists this as header(parameter = {}); the body
# shown here takes no argument — confirm against the real source.
def header
  @header
end

#max_error_num ⇒ `Object`

最大错误重试次数



33
34
35

# File 'lib/http_crawler/client.rb', line 33

# Returns the current value of @max_error_num, documented as the maximum
# number of error retries.
def max_error_num
  @max_error_num
end

#response ⇒ `Object`

请求的响应



284
285
286

# File 'lib/http_crawler/client.rb', line 284

# Returns the current value of @response, documented as the response of the
# request.
def response
  @response
end

#uri ⇒ `Object` (readonly)

Returns the value of attribute uri.



37
38
39

# File 'lib/http_crawler/client.rb', line 37

# Returns the current value of @uri, the URI that get/post resolve paths against.
def uri
  @uri
end

Class Method Details

.for(web_name) ⇒ `Object`

接收格式web_name = “biquge_duquanben” 返回 HttpCrawler::Web::BiqugeDuquanben::Client 实例



12
13
14

# File 'lib/http_crawler/client.rb', line 12

# Instantiates the client of a site registered under HttpCrawler::Web.
#
# The site name is camelized, so "biquge_duquanben" resolves to
# HttpCrawler::Web::BiqugeDuquanben::Client.
#
# @param web_name [String] underscored site name
# @return [Object] a new instance of the resolved Client class
def for(web_name)
  client_class = "HttpCrawler::Web::#{web_name.camelize}::Client".constantize
  client_class.new
end

.for_module(module_name, *args) ⇒ `Object`

接收格式module_name = “HttpCrawler::Web::BiqugeDuquanben” 返回 HttpCrawler::Web::BiqugeDuquanben::Client 实例



21
22
23

# File 'lib/http_crawler/client.rb', line 21

# Instantiates the Client class nested inside the given module.
#
# "HttpCrawler::Web::BiqugeDuquanben" resolves to
# HttpCrawler::Web::BiqugeDuquanben::Client.
#
# @param module_name [String, Module] module whose nested Client is wanted
# @param args [Array] constructor arguments, forwarded to Client.new
#   (previously they were accepted but silently discarded)
# @return [Object] a new instance of the resolved Client class
def for_module(module_name, *args)
  "#{module_name}::Client".constantize.new(*args)
end

.for_uri(path) ⇒ `Object`



25
26
27

# File 'lib/http_crawler/client.rb', line 25

# Instantiates the receiving class with +path+ as its +:uri+ option.
#
# @param path [String, URI] value passed to the constructor as +uri:+
# @return [Object] the new instance
def for_uri(path)
  new(uri: path)
end

Instance Method Details

#add_error_url(url_string) ⇒ `Object`

添加错误的url地址，表示这里面的url都是异常地址，存的是正则



202
203
204

# File 'lib/http_crawler/client.rb', line 202

# Appends an entry to error_urls, the list of URLs regarded as erroneous
# (documented as holding regular expressions).
#
# @param url_string [Object] entry to append
# @return [Object] whatever +error_urls <<+ returns (the collection itself for an Array)
def add_error_url(url_string)
  error_urls << url_string
end

#auto_proxy=(value) ⇒ `Object`

代理设置

# File 'lib/http_crawler/client.rb', line 125

# Switches automatic proxy handling on or off.
#
# The flag is stored in @auto_proxy. A proxy is fetched right away only when
# +value+ is exactly +true+ and no proxy is currently set.
#
# @param value [Object] new flag value
def auto_proxy=(value)
  Rails.logger.debug("自动更新代理")
  @auto_proxy = value

  # Only the literal `true` triggers an immediate fetch.
  return unless value == true

  update_proxy if @proxy.blank?
end

#get(path, params = {}) ⇒ `Object`

发送 get 请求

# File 'lib/http_crawler/client.rb', line 266

# Sends a GET request to +path+ resolved against the client URI.
#
# @param path [String] path (or URL) merged onto #uri
# @param params [Hash] query parameters, passed as the +:params+ option
# @return [Object] whatever +request+ returns for the block
# @raise [RuntimeError] when the client has no URI
def get(path, params = {})
  raise "Client uri为空" unless uri

  request do
    client = http
    client.get((uri + path).to_s, params: params, ssl_context: @ctx)
  end
end

#get_proxy ⇒ `Object`

获取proxy 通过调用 api 获取代理或者通过自定义设置代理

# File 'lib/http_crawler/client.rb', line 164

# Fetches a fresh proxy from the proxy API (or a custom source behind it).
#
# Polls HttpCrawler::Proxy.for(proxy_api) until it yields a result that
# * is not blank,
# * carries both "p_addr" and "p_port", and
# * differs from the proxy currently stored in @proxy.
# Every rejected attempt logs a warning and waits 5 seconds. The method never
# gives up; it blocks until an acceptable proxy is returned.
#
# Implemented as a single loop: the earlier version called itself for every
# rejected proxy, so the call stack grew for as long as the API kept
# returning unusable results.
#
# @return [Object] the proxy description returned by the API (indexed with
#   "p_addr" / "p_port")
def get_proxy
  loop do
    Rails.logger.debug("开始获取代理IP")
    proxy_client = HttpCrawler::Proxy.for(proxy_api)
    candidate = proxy_client.get_proxy(proxy_params).results

    if candidate.blank?
      Rails.logger.warn "无最新代理等待5秒后重新获取"
      sleep(5)
      next
    end

    Rails.logger.debug("当前IP => #{@proxy},获取最新代理 => #{candidate}")

    # Accept only a complete proxy that is not the one already in use.
    complete = candidate["p_addr"] && candidate["p_port"]
    unchanged = @proxy &&
                @proxy["p_addr"] == candidate["p_addr"] &&
                @proxy["p_port"] == candidate["p_port"]
    return candidate if complete && !unchanged

    Rails.logger.warn "无最新代理等待5秒后重新获取"
    sleep(5)
  end
end

#get_uri ⇒ `Object`

直接发送uri的get请求

# File 'lib/http_crawler/client.rb', line 272

# Sends a GET request straight to the client URI, without appending a path.
#
# @return [Object] whatever +request+ returns for the block
# @raise [RuntimeError] when the client has no URI
def get_uri
  raise "Client uri为空" unless uri

  request do
    client = http
    client.get(uri.to_s, ssl_context: @ctx)
  end
end

#http ⇒ `Object`

初始化http请求前置条件

# File 'lib/http_crawler/client.rb', line 213

# Builds the HTTP request chain used for every request.
#
# Applied in order: redirect following (at most 5 hops), the proxy in @proxy
# (when not blank), the headers, the cookies, and finally the timeouts.
#
# @return [Object] the configured chain returned by the last +timeout+ call
def http
  # Follow redirects automatically, at most 5 hops.
  client = HTTP.follow(max_hops: 5)

  # Route through the proxy when one is set.
  unless @proxy.blank?
    client = client.via(@proxy["p_addr"], @proxy["p_port"].to_i, @proxy["p_user"], @proxy["p_pass"])
  end

  client = client.headers(header) if header
  client = client.cookies(cookies) if cookies

  # One overall timeout takes precedence over the per-phase timeouts.
  if @all_timeout
    client.timeout(@all_timeout)
  else
    client.timeout(connect: @connect_time, write: @write_time, read: @read_time)
  end
end

#init_client ⇒ `Object`

初始化init_client参数



208
209
210

# File 'lib/http_crawler/client.rb', line 208

# Hook for client-specific initialization, called from the constructor.
# This implementation does nothing and returns nil; presumably subclasses
# override it (confirm against the subclasses).
#
# @return [nil]
def init_client
  # Intentionally empty.
end

#init_cookies(parameter = {}) ⇒ `Object`



108
109
110

# File 'lib/http_crawler/client.rb', line 108

# Resets @cookies to an empty Hash and returns it.
#
# @param parameter [Hash] accepted but not used in this body
# @return [Hash] the new, empty cookie hash
def init_cookies(parameter = {})
  @cookies = {}
end

#init_header(parameter = {}) ⇒ `Object`

# File 'lib/http_crawler/client.rb', line 87

# Resets @header to the default request headers and returns them.
# All keys are Symbols (e.g. :"Accept-Encoding").
#
# @param parameter [Hash] accepted but not used in this body
# @return [Hash{Symbol => String}] the default header set
def init_header(parameter = {})
  defaults = {
    :Accept => "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    :"Accept-Encoding" => "gzip, br",
    :"Accept-Language" => "zh-CN,zh;q=0.9",
    :Connection => "keep-alive",
    :"Upgrade-Insecure-Requests" => "1",
    :"User-Agent" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36"
  }
  @header = defaults
end

#init_ssl ⇒ `Object`

初始化 ssl 协议

# File 'lib/http_crawler/client.rb', line 73

# Prepares the SSL context for https URIs.
#
# For an "https" @uri a new OpenSSL context is stored in @ctx with peer
# verification switched off (VERIFY_NONE). For any other scheme nothing is
# changed, so a previously created @ctx stays in place.
#
# @return [Integer, nil] the verify mode just set, or nil for non-https URIs
def init_ssl
  return unless @uri.scheme == "https"

  @ctx = OpenSSL::SSL::SSLContext.new
  @ctx.verify_mode = OpenSSL::SSL::VERIFY_NONE
end

#init_timeout ⇒ `Object`

初始化超时时间

# File 'lib/http_crawler/client.rb', line 65

# Resets the timeout settings: 5 for connect, write and read, and no overall
# timeout (@all_timeout = nil), so #http uses the per-phase values.
#
# @return [nil]
def init_timeout
  @connect_time = @write_time = @read_time = 5
  @all_timeout = nil
end

#init_uri ⇒ `Object`

init_uri 如果未初始化@uri,则会报错

继承类需要实现 @uri = URI("http://host")



41
42
43

# File 'lib/http_crawler/client.rb', line 41

# Sets the base URI of the client. This implementation leaves @uri as nil;
# per the accompanying documentation, inheriting classes are expected to
# redefine it with something like @uri = URI("http://host").
#
# @return [nil]
def init_uri
  @uri = nil
end

#post(path, params = {}) ⇒ `Object`

发送 post 请求

# File 'lib/http_crawler/client.rb', line 278

# Sends a POST request to +path+ resolved against the client URI.
#
# @param path [String] path (or URL) merged onto #uri
# @param params [Hash] form fields, passed as the +:form+ option
# @return [Object] whatever +request+ returns for the block
# @raise [RuntimeError] when the client has no URI
def post(path, params = {})
  raise "Client uri为空" unless uri

  request do
    client = http
    client.post((uri + path).to_s, form: params, ssl_context: @ctx)
  end
end

#proxy_api ⇒ `Object`

代理使用的api方法名



132
133
134

# File 'lib/http_crawler/client.rb', line 132

# Name of the proxy API to use; memoized in @proxy_api and defaulting to "my".
# get_proxy passes this value to HttpCrawler::Proxy.for.
#
# @return [String]
def proxy_api
  @proxy_api ||= "my"
end

#proxy_params ⇒ `Object`

调用代理 api使用的参数



137
138
139

# File 'lib/http_crawler/client.rb', line 137

# Parameters sent along when the proxy API is called.
# Falls back to (and memoizes) {key: "default"} when @proxy_params is unset.
#
# @return [Hash]
def proxy_params
  @proxy_params ||= {key: "default"}
end

#str_to_cookies(str) ⇒ `Object`

字符串转换成cookies “abc=123; cd=412” => { “abc”: “123”, “cd”: “412”}

# File 'lib/http_crawler/client.rb', line 118

# Parses a cookie string and merges the pairs into #cookies, using Symbol
# keys: "abc=123; cd=412" adds { abc: "123", cd: "412" }.
#
# @param str [String] cookie string with "name=value" pairs separated by ";"
# @return [String] +str+ itself (the return value of String#scan with a block)
def str_to_cookies(str)
  str.scan(/([^=]*)=([^;]*);? ?/) do |name, value|
    cookies[name.to_sym] = value
  end
end

#update_cookies(parameter = {}) ⇒ `Object`



112
113
114

# File 'lib/http_crawler/client.rb', line 112

# Placeholder for refreshing the cookies. This implementation does nothing
# and returns nil.
#
# @param parameter [Hash] accepted but not used in this body
# @return [nil]
def update_cookies(parameter = {})
  # Intentionally empty.
end

#update_header(parameter = {}) ⇒ `Object`



98
99
100

# File 'lib/http_crawler/client.rb', line 98

# Rebuilds the request headers: calls init_header and stores its return value
# in @header.
#
# NOTE(review): +parameter+ is accepted but not forwarded to init_header in
# this body — confirm whether that is intentional.
#
# @param parameter [Hash] currently unused
# @return [Object] the value returned by init_header
def update_header(parameter = {})
  @header = init_header
end

#update_proxy(proxy = {}) ⇒ `Object`

# File 'lib/http_crawler/client.rb', line 141

# Replaces the current proxy.
#
# A non-blank +proxy+ is stored as given; a blank one (the default) makes the
# client fetch a new proxy through get_proxy.
#
# @param proxy [Hash] proxy description, or blank to fetch one
# @return [Object] the proxy now stored in @proxy
def update_proxy(proxy = {})
  @proxy = proxy.blank? ? get_proxy : proxy
end

#update_proxy?(proxy_ip = {}) ⇒ `Boolean`

如果自动更新代理则更新代理返回 true，否则返回false

Returns:

(Boolean)

# File 'lib/http_crawler/client.rb', line 152

# Updates the proxy, but only when automatic proxy handling (@auto_proxy) is on.
#
# @param proxy_ip [Hash] proxy passed on to update_proxy
# @return [Boolean] true when update_proxy was called, false otherwise
def update_proxy?(proxy_ip = {})
  return false unless @auto_proxy

  update_proxy(proxy_ip)
  true
end

#update_uri(uri_or_path) ⇒ `Object`

更新uri

# File 'lib/http_crawler/client.rb', line 46

# Replaces or extends the client URI, then refreshes the SSL setup.
#
# * a URI object is stored as is;
# * a String starting with "http://" or "https://" is parsed as a new URI;
# * any other String is treated as a path relative to the current URI.
#
# Only a real scheme prefix counts as absolute, so relative paths that merely
# begin with "http" (e.g. "httpbin/get") are merged instead of being parsed
# as a standalone URI.
#
# @param uri_or_path [URI, String] new URI, absolute URL or relative path
# @return [Object] the value of #uri after the update
# @raise [RuntimeError] when a relative path is given but no base URI is set
# @raise [ArgumentError] when the argument is neither a URI nor a String
def update_uri(uri_or_path)
  case uri_or_path
  when URI
    @uri = uri_or_path
  when String
    if uri_or_path.match?(%r{\Ahttps?://}i)
      @uri = URI(uri_or_path)
    else
      # A relative path needs a base to be merged onto.
      raise "Client uri为空" unless @uri

      @uri += uri_or_path
    end
  else
    raise ArgumentError, "unsupported uri_or_path: #{uri_or_path.inspect}"
  end

  # The scheme may have changed, so set up SSL again.
  init_ssl
  uri
end

#validation_to_proxy?(r = response) ⇒ `Boolean`

如果出现验证码,切换代理

Returns:

(Boolean)

# File 'lib/http_crawler/client.rb', line 288

# Switches the proxy when the response is a validation (captcha) page.
#
# @param r [Object] response to inspect; defaults to the last response
# @return [Boolean] true when a validation page was detected (regardless of
#   whether the proxy was actually changed), false otherwise
def validation_to_proxy?(r = response)
  return false unless r.validation_page?

  # A validation page was hit: try to switch proxies.
  update_proxy?
  true
end

Class: HttpCrawler::Client

Direct Known Subclasses

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(parameter = {}) ⇒ Client

Instance Attribute Details

#cookies(parameter = {}) ⇒ Object

#error_urls ⇒ Object

#header(parameter = {}) ⇒ Object

#max_error_num ⇒ Object

#response ⇒ Object

#uri ⇒ Object (readonly)

Class Method Details

.for(web_name) ⇒ Object

.for_module(module_name, *args) ⇒ Object

.for_uri(path) ⇒ Object

Instance Method Details

#add_error_url(url_string) ⇒ Object

#auto_proxy=(value) ⇒ Object

#get(path, params = {}) ⇒ Object

#get_proxy ⇒ Object

#get_uri ⇒ Object

#http ⇒ Object

#init_client ⇒ Object

#init_cookies(parameter = {}) ⇒ Object

#init_header(parameter = {}) ⇒ Object

#init_ssl ⇒ Object

#init_timeout ⇒ Object

#init_uri ⇒ Object

#post(path, params = {}) ⇒ Object

#proxy_api ⇒ Object

#proxy_params ⇒ Object

#str_to_cookies(str) ⇒ Object

#update_cookies(parameter = {}) ⇒ Object

#update_header(parameter = {}) ⇒ Object

#update_proxy(proxy = {}) ⇒ Object

#update_proxy?(proxy_ip = {}) ⇒ Boolean

#update_uri(uri_or_path) ⇒ Object

#validation_to_proxy?(r = response) ⇒ Boolean