Class: HTTP::Response

Inherits:
Object show all
Defined in:
lib/http_crawler/http/response.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Instance Attribute Details

#validations ⇒ Object

验证码判断



109
110
111
# File 'lib/http_crawler/http/response.rb', line 109

# Reader for the validation (captcha) patterns held in @validations.
#
# @return [Object, nil] whatever was stored in @validations; validation_page?
#   iterates it and matches each element against the decoded body
def validations
  @validations
end

Instance Method Details

#content ⇒ Object



79
80
81
# File 'lib/http_crawler/http/response.rb', line 79

# Returns the text of the markup produced by readability.content.
#
# The markup is parsed with Nokogiri::HTML and the parsed document's #text
# is returned.
#
# @return [Object] the value of #text on the parsed document
def content
  extracted_markup = readability.content
  parsed = Nokogiri::HTML(extracted_markup)
  parsed.text
end

#decoding_body ⇒ Object Also known as: dec

解压并转码 body 数据



6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# File 'lib/http_crawler/http/response.rb', line 6

# Decompresses the response body and transcodes it to UTF-8.
#
# The result is memoised in @decoding_body, so the work happens once per
# response and every call (including the first) returns the same string.
#
# Decompression follows the Content-Encoding header: 'gzip' and 'br' are
# handled, any other value leaves the body untouched. The character set is
# read from the charset parameter of the Content-Type header; when the header
# gives none, CharDet is asked to detect it. When no encoding can be
# determined the body is returned without transcoding.
#
# @return [String, nil] the decoded body, or nil when the response has no body
# @raise [StandardError] the original transcoding error, when CharDet cannot
#   propose an encoding different from the one that just failed
def decoding_body
  return @decoding_body if @decoding_body
  return nil unless self.body

  raw = self.body.to_s

  # Decompress according to Content-Encoding.
  @decoding_body =
    case self.headers['Content-Encoding']
    when 'gzip'
      # The block form closes the reader once the payload has been read.
      Zlib::GzipReader.wrap(StringIO.new(raw)) { |gz| gz.read }
    when 'br'
      Brotli.inflate(raw)
    else
      # 'deflate' is not handled yet: a Zlib::Inflate.inflate(raw) attempt was
      # left disabled because it was suspected to be incorrect.
      raw
    end

  # Prefer the charset declared in the Content-Type header.
  content_type = self.headers['Content-Type']
  encoding = content_type[/charset=([^, ;"]*)/, 1] if content_type

  # The charset declared inside the HTML itself cannot be used here: the body
  # has not been transcoded yet, so matching against it may raise
  # "ArgumentError: invalid byte sequence in UTF-8".

  # Otherwise fall back to CharDet detection.
  encoding ||= CharDet.detect(@decoding_body)["encoding"]

  if encoding
    begin
      # Tag the bytes with their real encoding, then convert in place.
      @decoding_body.force_encoding(encoding).encode!('utf-8')
    rescue => e
      # Transcoding failed: ask CharDet again and retry with its answer.
      cd = CharDet.detect(@decoding_body)["encoding"]
      unless cd && cd != encoding
        # No alternative encoding available: log the context and give up.
        Rails.logger.debug "encoding => #{encoding}"
        Rails.logger.debug "cd => #{cd}"
        Rails.logger.debug "@decoding_body[0..200] => #{@decoding_body[0..200]}"
        raise e
      end
      @decoding_body.force_encoding(cd).encode!('utf-8')
    end
  end

  # Always hand back the memoised string so the first call agrees with later ones.
  @decoding_body
end

#get_date(str) ⇒ Object



92
93
94
95
96
97
98
99
100
101
102
103
104
105
# File 'lib/http_crawler/http/response.rb', line 92

# Converts a Chinese date/time label into a Time.
#
# Recognised forms (each must fill a whole line of +str+):
#   "N小时前"              - N hours before now
#   "M月D日"               - that day in the current year, at midnight
#   "YYYY年M月D日"         - that day, at midnight
#   "M月D日 HH:MM"         - that day and time in the current year
#                            (zero to three spaces before the time)
# Anything else yields the current time.
#
# @param str [Object] the label to interpret
# @return [Time] local time derived from the label, or Time.now when the
#   label is not recognised
def get_date(str)
  now = Time.now
  case str
  when /^(\d{1,2})小时前$/
    now - Regexp.last_match(1).to_i.hours
  when /^(\d{1,2})月(\d{1,2})日$/
    month, day = Regexp.last_match.captures.map(&:to_i)
    Time.local(now.year, month, day)
  when /^(\d{4})年(\d{1,2})月(\d{1,2})日$/
    year, month, day = Regexp.last_match.captures.map(&:to_i)
    Time.local(year, month, day)
  when /^(\d{1,2})月(\d{1,2})日[ ]{0,3}(\d{1,2}):(\d{1,2})$/ # e.g. 09月30日 12:04
    month, day, hour, minute = Regexp.last_match.captures.map(&:to_i)
    Time.local(now.year, month, day, hour, minute)
  else
    now
  end
end

#html ⇒ Object

def decoding_body



63
64
65
# File 'lib/http_crawler/http/response.rb', line 63

# Parses the decoded body with Nokogiri::HTML.
#
# The parsed document is cached in @html and reused on later calls.
#
# @return [Object] the result of Nokogiri::HTML(decoding_body)
def html
  return @html if @html

  @html = Nokogiri::HTML(decoding_body)
end

#json ⇒ Object



67
68
69
70
71
# File 'lib/http_crawler/http/response.rb', line 67

# Parses the decoded body as JSON and caches the result in @json.
#
# When the parsed value is itself a String (JSON wrapped in a JSON string),
# it is parsed a second time.
#
# @return [Object] the parsed JSON value
# @raise [JSON::ParserError] when either parse step fails
def json
  @json = JSON.parse(decoding_body) unless @json
  # A String result means the payload was JSON-encoded twice; unwrap it.
  @json = JSON.parse(@json) if @json.is_a?(String)
  @json
end

#parsing ⇒ Object

解析



83
84
85
# File 'lib/http_crawler/http/response.rb', line 83

# Parses the response. This implementation simply returns #json.
#
# @return [Object] the value returned by #json
def parsing
  json
end

#readability ⇒ Object

通过readability 解析数据

Readability::Document


75
76
77
# File 'lib/http_crawler/http/response.rb', line 75

# Builds a Readability::Document from the decoded body.
#
# The document is created with do_not_guess_encoding: true and cached in
# @readability for later calls.
#
# @return [Readability::Document]
def readability
  return @readability if @readability

  options = { do_not_guess_encoding: true }
  @readability = Readability::Document.new(decoding_body, options)
end

#results ⇒ Object

获取解析结果



88
89
90
# File 'lib/http_crawler/http/response.rb', line 88

# Returns the parsing result, computing it on first use.
#
# The value of #parsing is cached in @results; a falsy result is not cached
# and is recomputed on the next call.
#
# @return [Object] the value returned by #parsing
def results
  return @results if @results

  @results = parsing
end

#validation_page? ⇒ Boolean

是否验证码界面

Returns:

  • (Boolean)


116
117
118
119
120
121
122
123
124
125
126
127
# File 'lib/http_crawler/http/response.rb', line 116

# Tells whether the decoded body looks like a verification (captcha) page.
#
# Every pattern in validations is matched against decoding_body, in order.
# On the first match a warning is logged, together with up to 301 characters
# of the body starting at the match position, and true is returned.
#
# When validations is nil (no patterns configured) nothing can match, so the
# result is false instead of a NoMethodError.
#
# @return [Boolean] true when any validation pattern matches the body
def validation_page?
  Array(validations).each do |pattern|
    match_offset = decoding_body =~ pattern
    next unless match_offset

    Rails.logger.warn("触发验证信息")
    Rails.logger.warn(decoding_body[match_offset..(match_offset + 300)])
    return true
  end
  false
end