Class: HTTP::Response

Inherits:
Object show all
Defined in:
lib/http_crawler/http/response.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Instance Attribute Details

#validations ⇒ Object

验证码判断



146
147
148
# File 'lib/http_crawler/http/response.rb', line 146

# Reader for the @validations instance variable.
#
# validation_page? iterates this collection and matches every element
# against the decoded body with =~, so the elements are presumably
# regular expressions used to recognise captcha/verification pages.
# NOTE(review): where @validations is assigned is not visible here — confirm.
def validations
  @validations
end

Instance Method Details

#content ⇒ Object



114
115
116
# File 'lib/http_crawler/http/response.rb', line 114

# Plain-text rendering of the article extracted by readability: the
# extracted markup is parsed with Nokogiri and its text is returned.
def content
  article_markup = readability.content
  Nokogiri::HTML(article_markup).text
end

#decoding_body ⇒ Object Also known as: dec

解压并转码 body 数据



5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# File 'lib/http_crawler/http/response.rb', line 5

# Decompresses the response body according to the Content-Encoding header
# and transcodes it to UTF-8. The result is cached in @decoding_body, but
# only once decoding has fully succeeded, so a failed attempt raises again
# on the next call instead of returning a half-converted string.
#
# The source charset is taken from the Content-Type header when present
# (quoted or unquoted, matched case-insensitively); otherwise CharDet is
# asked to guess it.
#
# @return [String, nil] the decoded body, or nil when there is no body
# @raise [StandardError] the original transcoding error when neither the
#   declared charset nor the CharDet guess could be applied
def decoding_body
  return @decoding_body if @decoding_body
  return nil unless self.body

  raw = self.body.to_s

  # Decompress the payload.
  decoded =
    case self.headers['Content-Encoding']
    when 'gzip'
      # The block form closes the reader as soon as the data has been read.
      Zlib::GzipReader.wrap(StringIO.new(raw)) { |gz| gz.read }
    when 'br'
      Brotli.inflate(raw)
    else
      # 'deflate' is not handled yet (an earlier attempt with
      # Zlib::Inflate.inflate was unreliable); such bodies pass through as-is.
      raw
    end

  # Work out the character encoding of the decompressed data.

  # First choice: the charset declared in the Content-Type header. The value
  # may be wrapped in double quotes, and an empty value counts as "not given".
  content_type = self.headers['Content-Type']
  encoding = content_type[/charset="?([^, ;"]+)/i, 1] if content_type

  # The charset declared inside the HTML itself is deliberately not used:
  # the data has not been transcoded yet, so scanning it can raise
  # "ArgumentError: invalid byte sequence in UTF-8".

  # Fallback: let CharDet guess the encoding.
  encoding ||= CharDet.detect(decoded)["encoding"]

  # Transcode to UTF-8.
  begin
    decoded.force_encoding(encoding).encode!('utf-8') if encoding
  rescue => e
    # The declared/guessed encoding did not work; ask CharDet again and retry
    # once if it suggests something different.
    cd = CharDet.detect(decoded)["encoding"]
    if cd && cd != encoding
      decoded.force_encoding(cd).encode!('utf-8')
    else
      # Nothing else to try: log what we know and re-raise the original error.
      Rails.logger.debug "encoding => #{encoding}"
      Rails.logger.debug "cd => #{cd}"
      Rails.logger.debug "@decoding_body[0..200] => #{decoded[0..200]}"
      raise e
    end
  end

  # Cache only a fully decoded body, and always return it (also when no
  # transcoding was necessary).
  @decoding_body = decoded
end

#get_date(str) ⇒ Object



129
130
131
132
133
134
135
136
137
138
139
140
141
142
# File 'lib/http_crawler/http/response.rb', line 129

# Converts a Chinese date expression into a Time.
#
# Recognised forms: "N小时前" (N hours ago), "M月D日", "YYYY年M月D日" and
# "M月D日 HH:MM". Forms without a year use the current year. Anything that
# matches none of the patterns yields the current time.
#
# @param str [String] the date text to interpret
# @return [Time]
def get_date(str)
  now = Time.now
  case str
  when /^(\d{1,2})小时前$/
    now - Regexp.last_match(1).to_i.hours
  when /^(\d{1,2})月(\d{1,2})日$/
    Time.local(now.year, *Regexp.last_match.captures.map(&:to_i))
  when /^(\d{4})年(\d{1,2})月(\d{1,2})日$/
    Time.local(*Regexp.last_match.captures.map(&:to_i))
  when /^(\d{1,2})月(\d{1,2})日[ ]{0,3}(\d{1,2}):(\d{1,2})$/ # e.g. 09月30日 12:04
    Time.local(now.year, *Regexp.last_match.captures.map(&:to_i))
  else
    now
  end
end

#html ⇒ Nokogiri::HTML::Document

转换html格式

Returns:



63
64
65
66
# File 'lib/http_crawler/http/response.rb', line 63

# Parsed HTML document for the decoded body, built on first use and cached
# in @html.
#
# @return [Nokogiri::HTML::Document]
def html
  return @html if @html

  # An assignment expression evaluates to its right-hand side, not to the
  # setter's return value, so the cached document is read back explicitly;
  # otherwise the first call would hand back the raw decoded String.
  self.html = self.dec
  @html
end

#html=(data) ⇒ Nokogiri::HTML::Document

Returns:



69
70
71
72
73
74
75
76
# File 'lib/http_crawler/http/response.rb', line 69

# Stores the HTML document in @html. A Nokogiri::HTML::Document is kept
# as-is; anything else is parsed with Nokogiri::HTML first.
#
# @param data [Nokogiri::HTML::Document, Object] a document or raw markup
# @return [Nokogiri::HTML::Document] the stored document
def html=(data)
  @html =
    case data
    when Nokogiri::HTML::Document then data
    else Nokogiri::HTML(data)
    end
end

#json ⇒ Hash

转换json格式

Returns:



80
81
82
83
# File 'lib/http_crawler/http/response.rb', line 80

# Parsed JSON for the decoded body, built on first use and cached in @json.
#
# @return [Hash]
def json
  return @json if @json

  # An assignment expression evaluates to its right-hand side, not to the
  # setter's return value, so the parsed value is read back explicitly;
  # otherwise the first call would hand back the raw decoded String.
  self.json = self.dec
  @json
end

#json=(data) ⇒ Hash

Returns:



86
87
88
89
90
91
92
93
94
# File 'lib/http_crawler/http/response.rb', line 86

# Stores the JSON value in @json. A Hash is kept as-is; anything else is
# run through JSON.parse, and parsed a second time when the first pass
# yields a String (i.e. the payload was JSON-encoded twice).
#
# @param data [Hash, String] an already-parsed hash or JSON text
# @return [Object] the stored value
def json=(data)
  return @json = data if Hash === data

  @json = JSON.parse(data)
  @json = JSON.parse(@json) if String === @json
  @json
end

#parsing ⇒ Object

解析默认使用 json 的值



120
121
122
# File 'lib/http_crawler/http/response.rb', line 120

# Default parsing step: simply returns the value of #json.
def parsing
  json
end

#readability ⇒ Readability::Document

通过readability 解析数据

Returns:



98
99
100
101
# File 'lib/http_crawler/http/response.rb', line 98

# Readability document for the decoded body, built on first use and cached
# in @readability.
#
# @return [Readability::Document]
def readability
  return @readability if @readability

  # An assignment expression evaluates to its right-hand side, not to the
  # setter's return value, so the cached document is read back explicitly;
  # otherwise the first call would hand back the raw decoded String.
  self.readability = self.dec
  @readability
end

#readability=(data) ⇒ Readability::Document

Returns:



104
105
106
107
108
109
110
111
# File 'lib/http_crawler/http/response.rb', line 104

# Stores the readability document in @readability. A Readability::Document
# is kept as-is; anything else is wrapped in a new Readability::Document
# created with the do_not_guess_encoding option set to true.
#
# @param data [Readability::Document, Object] a document or raw markup
# @return [Readability::Document] the stored document
def readability=(data)
  return @readability = data if Readability::Document === data

  @readability = Readability::Document.new(data, {do_not_guess_encoding: true})
end

#results ⇒ Object

获取解析结果



125
126
127
# File 'lib/http_crawler/http/response.rb', line 125

# Parsing result, computed through #parsing on first use and cached in
# @results (a nil/false result is not cached and is recomputed).
def results
  return @results if @results

  @results = parsing
end

#validation_page? ⇒ Boolean

是否验证码界面

Returns:



153
154
155
156
157
158
159
160
161
162
163
164
# File 'lib/http_crawler/http/response.rb', line 153

# Tells whether the decoded body looks like a captcha/verification page.
#
# Each entry of #validations is matched against the decoded body in order.
# On the first hit a warning is logged, followed by the body text from the
# match position onwards (301 characters at most), and true is returned.
#
# @return [Boolean] true when any pattern matches, false otherwise
def validation_page?
  validations.each do |pattern|
    offset = decoding_body =~ pattern
    next unless offset

    Rails.logger.warn("触发验证信息")
    Rails.logger.warn(decoding_body[offset, 301])
    return true
  end
  false
end