Class: HTTP::Response
Instance Attribute Summary collapse
-
#validations ⇒ Object
用于判断验证码页面的正则表达式数组(由 #validation_page? 逐个匹配).
Instance Method Summary collapse
- #content ⇒ Object
-
#decoding_body ⇒ Object
(also: #dec)
解压并转码 body 数据.
- #get_date(str) ⇒ Object
-
#html ⇒ Nokogiri::HTML::Document
转换html格式.
- #html=(data) ⇒ Nokogiri::HTML::Document
-
#json ⇒ Hash
转换json格式.
- #json=(data) ⇒ Hash
-
#parsing ⇒ Object
解析响应内容, 默认返回 #json 的值.
-
#readability ⇒ Readability::Document
通过readability 解析数据.
- #readability=(data) ⇒ Readability::Document
-
#results ⇒ Object
获取解析结果.
-
#validation_page? ⇒ Boolean
是否验证码界面.
Instance Attribute Details
#validations ⇒ Object
用于判断验证码页面的正则表达式数组(由 #validation_page? 逐个匹配)
146 147 148 |
# File 'lib/http_crawler/http/response.rb', line 146
# Reader for the verification-page (CAPTCHA) patterns held in @validations.
#
# @return [Object] the current value of @validations (nil when it was never set)
def validations
  @validations
end
Instance Method Details
#content ⇒ Object
114 115 116 |
# File 'lib/http_crawler/http/response.rb', line 114
# Plain text of the content extracted by #readability.
#
# The HTML fragment returned by `readability.content` is parsed with Nokogiri
# and reduced to its text.
#
# @return [String] text of the extracted content
def content
  extracted_html = readability.content
  Nokogiri::HTML(extracted_html).text
end
#decoding_body ⇒ Object Also known as: dec
解压并转码 body 数据
5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 |
# File 'lib/http_crawler/http/response.rb', line 5
# Decompresses the response body and transcodes it to UTF-8.
#
# Steps:
# 1. Decompress according to the Content-Encoding header (gzip and br are
#    handled; any other value leaves the body untouched).
# 2. Pick the source charset from the Content-Type header, falling back to
#    CharDet detection when the header carries none.
# 3. Transcode to UTF-8; when that fails, retry once with the charset CharDet
#    detects, and re-raise the original error if that gives nothing new.
#
# The result is memoized in @decoding_body.
#
# @return [String, nil] the decoded body, or nil when the response has no body
# @raise [StandardError] the original transcoding error when the CharDet retry
#   cannot offer a different encoding
def decoding_body
  return @decoding_body if @decoding_body
  return nil unless body

  raw = body.to_s

  # Content-coding names are case-insensitive, so normalise before matching.
  @decoding_body =
    case headers['Content-Encoding'].to_s.strip.downcase
    when 'gzip'
      # The block form closes the reader once the data has been read.
      Zlib::GzipReader.wrap(StringIO.new(raw)) { |gz| gz.read }
    when 'br'
      Brotli.inflate(raw)
    else
      # 'deflate' is deliberately not handled: the previous attempt with
      # Zlib::Inflate.inflate was noted as possibly wrong and left unresolved.
      # Work on a copy so transcoding below does not mutate the string
      # returned by body.to_s.
      raw.dup
    end

  # Charset from the header; accepts quoted values (charset="gbk") and any
  # letter case, and treats an empty value as "not declared".
  # A charset declared inside the HTML itself is not consulted: the body is
  # not transcoded yet, so running a regexp over it may raise
  # "ArgumentError: invalid byte sequence in UTF-8".
  encoding = headers['Content-Type'].to_s[/charset\s*=\s*"?([^, ;"]+)/i, 1]
  encoding ||= CharDet.detect(@decoding_body)["encoding"]

  if encoding
    begin
      @decoding_body.force_encoding(encoding).encode!('utf-8')
    rescue => e
      # Transcoding failed: ask CharDet again and retry with its answer.
      cd = CharDet.detect(@decoding_body)["encoding"]
      unless cd && cd != encoding
        # Nothing new to try: log the context and re-raise the original error.
        Rails.logger.debug "encoding => #{encoding}"
        Rails.logger.debug "cd => #{cd}"
        Rails.logger.debug "@decoding_body[0..200] => #{@decoding_body[0..200]}"
        raise e
      end
      @decoding_body.force_encoding(cd).encode!('utf-8')
    end
  end

  # Always return the body itself, so the first call and the memoized calls
  # agree even when no encoding could be determined.
  @decoding_body
end
#get_date(str) ⇒ Object
129 130 131 132 133 134 135 136 137 138 139 140 141 142 |
# File 'lib/http_crawler/http/response.rb', line 129
# Converts a Chinese date/time label into a Time.
#
# Recognised forms:
#   "3小时前"          -> now minus that many hours
#   "9月30日"          -> that month/day of the current year, at 00:00
#   "2019年9月30日"    -> that date, at 00:00
#   "09月30日 12:04"   -> that month/day of the current year at the given time
#
# Any other input (including nil) yields the current time.
#
# @param str [String, nil] the label to interpret
# @return [Time] local time
# @raise [ArgumentError] from Time.local when the captured numbers do not form
#   a valid date/time
def get_date(str)
  now = Time.now

  case str
  when /^(\d{1,2})小时前$/
    # Plain seconds arithmetic: no dependency on ActiveSupport's Integer#hours.
    now - (Regexp.last_match(1).to_i * 3600)
  when /^(\d{1,2})月(\d{1,2})日$/
    Time.local(now.year, Regexp.last_match(1).to_i, Regexp.last_match(2).to_i)
  when /^(\d{4})年(\d{1,2})月(\d{1,2})日$/
    Time.local(Regexp.last_match(1).to_i, Regexp.last_match(2).to_i, Regexp.last_match(3).to_i)
  when /^(\d{1,2})月(\d{1,2})日[ ]{0,3}(\d{1,2}):(\d{1,2})$/
    # e.g. "09月30日 12:04"
    Time.local(
      now.year,
      Regexp.last_match(1).to_i,
      Regexp.last_match(2).to_i,
      Regexp.last_match(3).to_i,
      Regexp.last_match(4).to_i
    )
  else
    now
  end
end
#html ⇒ Nokogiri::HTML::Document
转换html格式
63 64 65 66 |
# File 'lib/http_crawler/http/response.rb', line 63
# The response body as a parsed HTML document (memoized in @html).
#
# On first use the decoded body (#dec) is handed to #html=, which stores the
# parsed document in @html.
#
# @return [Nokogiri::HTML::Document]
def html
  return @html if @html

  # An assignment expression evaluates to its right-hand side (the raw body),
  # not to what the setter stored, so the document is returned explicitly.
  self.html = dec
  @html
end
#html=(data) ⇒ Nokogiri::HTML::Document
69 70 71 72 73 74 75 76 |
# File 'lib/http_crawler/http/response.rb', line 69
# Stores the HTML document for this response.
#
# A Nokogiri::HTML::Document is kept as is; anything else is parsed with
# Nokogiri::HTML first.
#
# @param data [Nokogiri::HTML::Document, Object] a document, or markup to parse
# @return [Nokogiri::HTML::Document] the stored document
def html=(data)
  @html =
    case data
    when Nokogiri::HTML::Document then data
    else Nokogiri::HTML(data)
    end
end
#json ⇒ Hash
转换json格式
80 81 82 83 |
# File 'lib/http_crawler/http/response.rb', line 80
# The response body parsed as JSON (memoized in @json).
#
# On first use the decoded body (#dec) is handed to #json=, which stores the
# parsed value in @json.
#
# @return [Hash] the parsed JSON
def json
  return @json if @json

  # An assignment expression evaluates to its right-hand side (the raw body),
  # not to what the setter stored, so the parsed value is returned explicitly.
  self.json = dec
  @json
end
#json=(data) ⇒ Hash
86 87 88 89 90 91 92 93 94 |
# File 'lib/http_crawler/http/response.rb', line 86
# Stores the JSON value for this response.
#
# An already-parsed Hash or Array is kept as is. Anything else is passed to
# JSON.parse; when that yields a String (JSON text that was itself encoded as
# a JSON string) it is parsed a second time.
#
# @param data [Hash, Array, String] parsed JSON or JSON text
# @return [Hash, Array, Object] the stored value
# @raise [JSON::ParserError] when the text is not valid JSON
# @raise [TypeError] when data cannot be treated as JSON text (e.g. nil)
def json=(data)
  @json =
    case data
    when Hash, Array
      data
    else
      parsed = JSON.parse(data)
      # Double-encoded payload: the first pass only unwrapped a JSON string.
      parsed.is_a?(String) ? JSON.parse(parsed) : parsed
    end
end
#parsing ⇒ Object
解析响应内容, 默认返回 #json 的值
120 121 122 |
# File 'lib/http_crawler/http/response.rb', line 120
# Parses the response; this implementation simply returns the value of #json.
#
# @return [Object] whatever #json returns
def parsing
  json
end
#readability ⇒ Readability::Document
通过readability 解析数据
98 99 100 101 |
# File 'lib/http_crawler/http/response.rb', line 98
# The response body wrapped in a Readability document (memoized in
# @readability).
#
# On first use the decoded body (#dec) is handed to #readability=, which
# stores the document in @readability.
#
# @return [Readability::Document]
def readability
  return @readability if @readability

  # An assignment expression evaluates to its right-hand side (the raw body),
  # not to what the setter stored, so the document is returned explicitly.
  self.readability = dec
  @readability
end
#readability=(data) ⇒ Readability::Document
104 105 106 107 108 109 110 111 |
# File 'lib/http_crawler/http/response.rb', line 104
# Stores the Readability document for this response.
#
# A Readability::Document is kept as is; anything else is wrapped in a new
# Readability::Document created with the do_not_guess_encoding option.
#
# @param data [Readability::Document, Object] a document, or markup to wrap
# @return [Readability::Document] the stored document
def readability=(data)
  @readability =
    case data
    when Readability::Document then data
    else Readability::Document.new(data, {do_not_guess_encoding: true})
    end
end
#results ⇒ Object
获取解析结果
125 126 127 |
# File 'lib/http_crawler/http/response.rb', line 125
# The parse result, computed through #parsing on first use and cached in
# @results. A nil or false result is not cached and is recomputed next time.
#
# @return [Object] whatever #parsing returned
def results
  return @results if @results

  @results = parsing
end
#validation_page? ⇒ Boolean
是否验证码界面
153 154 155 156 157 158 159 160 161 162 163 164 |
# File 'lib/http_crawler/http/response.rb', line 153
# Tells whether the decoded body matches any of the #validations patterns,
# i.e. whether this response is a verification (CAPTCHA) page.
#
# On the first match a warning and an excerpt of the body starting at the
# match (up to 301 characters) are written to Rails.logger.
#
# @return [Boolean] true when a pattern matches, false otherwise (also when no
#   patterns are configured or the response has no body)
def validation_page?
  page = decoding_body

  # Array() lets an unset (nil) validations list behave like an empty one
  # instead of raising NoMethodError.
  Array(validations).each do |pattern|
    offset = page =~ pattern
    next unless offset

    Rails.logger.warn("触发验证信息")
    Rails.logger.warn(page[offset..(offset + 300)])
    return true
  end

  false
end