Class: HTTP::Response
Instance Attribute Summary collapse
-
#validations ⇒ Object
验证码判断.
Instance Method Summary collapse
- #content ⇒ Object
-
#decoding_body ⇒ Object
(also: #dec)
解压并转码 body 数据.
- #get_date(str) ⇒ Object
-
#html ⇒ Object
将解码后的 body 解析为 Nokogiri HTML 文档(结果会被缓存).
- #json ⇒ Object
-
#parsing ⇒ Object
解析.
-
#readability ⇒ Object
通过readability 解析数据 [Readability::Document].
-
#results ⇒ Object
获取解析结果.
-
#validation_page? ⇒ Boolean
是否验证码界面.
Instance Attribute Details
#validations ⇒ Object
验证码判断
109 110 111 |
# File 'lib/http_crawler/http/response.rb', line 109
#
# Reader for the verification (CAPTCHA) patterns stored in @validations.
#
# @return [Object, nil] whatever was assigned to @validations, or nil when
#   nothing has been assigned yet (presumably an Array of Regexp — confirm
#   against the code that populates it)
def validations
  @validations
end
Instance Method Details
#content ⇒ Object
79 80 81 |
# File 'lib/http_crawler/http/response.rb', line 79
#
# Returns the plain text of the content extracted by #readability.
#
# The markup returned by `readability.content` is parsed with Nokogiri and
# reduced to its text nodes.
#
# @return [String] the text of the extracted content
def content
  extracted_markup = readability.content
  Nokogiri::HTML(extracted_markup).text
end
#decoding_body ⇒ Object Also known as: dec
解压并转码 body 数据
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 |
# File 'lib/http_crawler/http/response.rb', line 6
#
# Decompresses the response body and transcodes it to UTF-8.
#
# The result is memoized in @decoding_body. Decompression is chosen from the
# Content-Encoding header ("gzip" through Zlib, "br" through Brotli; any other
# value leaves the body untouched). The source charset is read from the
# Content-Type header and, when that yields nothing, guessed with CharDet.
#
# @return [String, nil] the decoded body (always the memoized String, also on
#   the first call), or nil when the response has no body
# @raise [StandardError] the original transcoding error when CharDet offers
#   no alternative encoding to retry with
def decoding_body
  return @decoding_body if @decoding_body
  return nil unless body

  raw = body.to_s

  # Decompress. Content codings are case-insensitive, so normalise the header
  # value before matching. "deflate" is deliberately not handled: the original
  # author noted that Zlib::Inflate.inflate did not work reliably for it.
  @decoding_body =
    case headers['Content-Encoding'].to_s.strip.downcase
    when 'gzip'
      # The block form of `wrap` closes the reader even when `read` raises.
      Zlib::GzipReader.wrap(StringIO.new(raw), &:read)
    when 'br'
      Brotli.inflate(raw)
    else
      raw
    end

  # force_encoding/encode! below mutate in place; never touch a frozen string.
  @decoding_body = @decoding_body.dup if @decoding_body.frozen?

  # Charset from the header. The value may be quoted (charset="utf-8") and the
  # parameter name may use any letter case. The charset is not sniffed from
  # the HTML itself because the bytes are not transcoded yet and a regexp
  # match could raise "invalid byte sequence in UTF-8".
  encoding = headers['Content-Type'].to_s[/charset="?([^, ;"]*)/i, 1]
  encoding = nil if encoding.nil? || encoding.empty?

  # Fall back to statistical detection.
  encoding ||= CharDet.detect(@decoding_body)["encoding"]

  begin
    @decoding_body.force_encoding(encoding).encode!('utf-8') if encoding
  rescue StandardError
    # The declared charset was wrong or unknown: ask CharDet and retry once
    # with its answer, provided it differs from what already failed.
    cd = CharDet.detect(@decoding_body)["encoding"]
    unless cd && cd != encoding
      # Nothing else to try: log the context and re-raise the original error.
      Rails.logger.debug "encoding => #{encoding}"
      Rails.logger.debug "cd => #{cd}"
      Rails.logger.debug "@decoding_body[0..200] => #{@decoding_body[0..200]}"
      raise
    end
    @decoding_body.force_encoding(cd).encode!('utf-8')
  end

  # Explicit return so the first call yields the body even when no
  # transcoding was performed.
  @decoding_body
end
#get_date(str) ⇒ Object
92 93 94 95 96 97 98 99 100 101 102 103 104 105 |
# File 'lib/http_crawler/http/response.rb', line 92
#
# Converts a Chinese date/time label into a local Time.
#
# Recognised forms (anything else, including nil, yields the current time):
#   "3小时前"         -> now minus 3 hours
#   "9月30日"         -> midnight of that day in the current year
#   "2020年9月30日"   -> midnight of that day
#   "09月30日 12:04"  -> that day and time in the current year
#
# Hours are subtracted as plain seconds so the method does not depend on
# ActiveSupport's Integer#hours being loaded.
#
# @param str [String, nil] the label to interpret
# @return [Time] the interpreted time, or the current time when unrecognised
# @raise [ArgumentError] when Time.local rejects the captured numbers
#   (for example month 13)
def get_date(str)
  now = Time.now

  case str
  when /^(\d{1,2})小时前$/
    now - (Regexp.last_match(1).to_i * 3600)
  when /^(\d{1,2})月(\d{1,2})日$/
    month, day = Regexp.last_match.captures.map(&:to_i)
    Time.local(now.year, month, day)
  when /^(\d{4})年(\d{1,2})月(\d{1,2})日$/
    year, month, day = Regexp.last_match.captures.map(&:to_i)
    Time.local(year, month, day)
  when /^(\d{1,2})月(\d{1,2})日[ ]{0,3}(\d{1,2}):(\d{1,2})$/
    # e.g. "09月30日 12:04"
    month, day, hour, minute = Regexp.last_match.captures.map(&:to_i)
    Time.local(now.year, month, day, hour, minute)
  else
    now
  end
end
#html ⇒ Object
将解码后的 body 解析为 Nokogiri HTML 文档(结果会被缓存)
63 64 65 |
# File 'lib/http_crawler/http/response.rb', line 63
#
# Parses the decoded body into a Nokogiri HTML document and caches it in @html.
#
# @return [Object] the value returned by Nokogiri::HTML for #decoding_body
def html
  return @html if @html

  @html = Nokogiri::HTML(decoding_body)
end
#json ⇒ Object
67 68 69 70 71 |
# File 'lib/http_crawler/http/response.rb', line 67
#
# Parses the decoded body as JSON and caches the result in @json.
#
# When the parsed value is itself a String (a JSON document wrapped in a JSON
# string), it is parsed once more on each call until it stops being a String.
#
# @return [Object] the parsed JSON value
# @raise [JSON::ParserError] when the body (or the wrapped string) is not JSON
def json
  @json = JSON.parse(decoding_body) unless @json
  @json = JSON.parse(@json) if @json.is_a?(String)
  @json
end
#parsing ⇒ Object
解析
83 84 85 |
# File 'lib/http_crawler/http/response.rb', line 83
#
# Parses the response; this implementation simply delegates to #json.
#
# @return [Object] whatever #json returns
def parsing
  json
end
#readability ⇒ Object
通过readability 解析数据
- Readability::Document
75 76 77 |
# File 'lib/http_crawler/http/response.rb', line 75
#
# Builds a Readability::Document from the decoded body and caches it in
# @readability. Encoding guessing is switched off via the
# do_not_guess_encoding option.
#
# @return [Readability::Document]
def readability
  return @readability if @readability

  options = {do_not_guess_encoding: true}
  @readability = Readability::Document.new(decoding_body, options)
end
#results ⇒ Object
获取解析结果
88 89 90 |
# File 'lib/http_crawler/http/response.rb', line 88
#
# Returns the parse result, computing it with #parsing on first use and
# caching it in @results. A nil or false result is not cached, so #parsing is
# called again on the next access.
#
# @return [Object] whatever #parsing returns
def results
  @results = parsing unless @results
  @results
end
#validation_page? ⇒ Boolean
是否验证码界面
116 117 118 119 120 121 122 123 124 125 126 127 |
# File 'lib/http_crawler/http/response.rb', line 116
#
# Tells whether the decoded body looks like a verification (CAPTCHA) page.
#
# Each entry of #validations is searched for in the decoded body. On the first
# hit a warning and up to 301 characters of the body starting at the match are
# written to Rails.logger, and true is returned.
#
# An unset (nil) validations list is treated as "no patterns", and a missing
# body never matches. Patterns may be Regexp or String.
#
# @return [Boolean] true when any pattern is found in the body, else false
def validation_page?
  patterns = Array(validations)
  # Checked before decoding so an empty list never triggers body decoding.
  return false if patterns.empty?

  # Decode once; the value does not change between patterns.
  text = decoding_body
  return false unless text

  patterns.each do |pattern|
    offset = text.index(pattern)
    next unless offset

    Rails.logger.warn("触发验证信息")
    Rails.logger.warn(text[offset..(offset + 300)])
    return true
  end

  false
end