Class: Net::HTTPResponse

Inherits:
Object show all
Defined in:
lib/http_crawler/net/response.rb

Instance Method Summary collapse

Instance Method Details

#decoding_body ⇒ Object

解压并转码 body 数据



5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# File 'lib/http_crawler/net/response.rb', line 5

# Decompresses the response body according to its Content-Encoding header and
# transcodes the result to UTF-8.
#
# The charset is looked up, in order, in the Content-Type header, in a
# `charset=` declaration inside the decompressed data, and finally via
# CharDet. When no charset can be determined at all, the decompressed bytes
# are returned untouched (binary-tagged) instead of raising.
#
# The result is memoized only after every step succeeded, and the original
# `body` string is never mutated.
#
# @return [String, nil] the decoded body, or nil when there is no body
# @raise [ArgumentError, EncodingError] when transcoding fails and CharDet
#   offers no alternative charset
def decoding_body
  return @decoding_body if @decoding_body
  return nil unless body

  # `.b` yields a private binary copy: the caller's body stays untouched and
  # the regex scan below cannot trip over invalid multibyte sequences.
  data = inflate_body_for_decoding(body, header['Content-Encoding']).b
  charset = detect_body_charset(header['Content-Type'], data)

  @decoding_body = charset ? transcode_body_to_utf8(data, charset) : data
end

# Undoes the transfer compression named by +content_encoding+.
private def inflate_body_for_decoding(raw, content_encoding)
  case content_encoding.to_s.strip.downcase
  when 'gzip', 'x-gzip'
    # The block form closes the reader (and its StringIO) when done.
    Zlib::GzipReader.wrap(StringIO.new(raw), &:read)
  when 'br'
    Brotli.inflate(raw)
  when 'deflate'
    begin
      Zlib::Inflate.inflate(raw)
    rescue Zlib::DataError
      # Some servers send a raw deflate stream without the zlib header;
      # negative window bits tell zlib to expect exactly that.
      raw_inflater = Zlib::Inflate.new(-Zlib::MAX_WBITS)
      begin
        raw_inflater.inflate(raw)
      ensure
        raw_inflater.close
      end
    end
  else
    raw
  end
end

# Finds the charset name in the Content-Type header, then in the data itself,
# then by statistical detection. Returns nil when nothing is found.
private def detect_body_charset(content_type, data)
  # Accepts quoted and unquoted values and never captures an empty name.
  pattern = /charset\s*=\s*["']?([^\s,;"'>]+)/i
  content_type.to_s[pattern, 1] || data[pattern, 1] || CharDet.detect(data)['encoding']
end

# Interprets +data+ as +charset+ and converts it to UTF-8, retrying once with
# the charset guessed by CharDet when the declared one does not work.
private def transcode_body_to_utf8(data, charset)
  data.dup.force_encoding(charset).encode('utf-8')
rescue ArgumentError, EncodingError => e
  guess = CharDet.detect(data)['encoding']
  # Nothing new to try: surface the original transcoding error.
  raise e if guess.nil? || guess.downcase == charset.downcase

  data.dup.force_encoding(guess).encode('utf-8')
end

#get_date(str) ⇒ Object



75
76
77
78
79
80
81
82
83
84
85
86
87
88
# File 'lib/http_crawler/net/response.rb', line 75

# Converts a Chinese relative or absolute date label into a local Time.
#
# Recognised formats (any other input yields the current time):
#   "3小时前"          -> now minus 3 hours
#   "9月30日"          -> midnight of that day in the current year
#   "2019年9月30日"     -> midnight of that date
#   "09月30日 12:04"   -> that day of the current year at the given time
#
# @param str [String] the date label to interpret
# @return [Time]
# @raise [ArgumentError] from Time.local when a captured field is out of range
def get_date(str)
  now = Time.now
  case str
  when /^(\d{1,2})小时前$/
    # Plain seconds arithmetic; does not rely on ActiveSupport's Integer#hours.
    now - (Regexp.last_match(1).to_i * 3600)
  when /^(\d{1,2})月(\d{1,2})日$/
    Time.local(now.year, Regexp.last_match(1).to_i, Regexp.last_match(2).to_i)
  when /^(\d{4})年(\d{1,2})月(\d{1,2})日$/
    Time.local(Regexp.last_match(1).to_i, Regexp.last_match(2).to_i, Regexp.last_match(3).to_i)
  when /^(\d{1,2})月(\d{1,2})日[ ]{0,3}(\d{1,2}):(\d{1,2})$/ # 09月30日 12:04
    Time.local(now.year, *Regexp.last_match.captures.map(&:to_i))
  else
    now
  end
end

#html ⇒ Object

Parses the decoded body (see #decoding_body) into a Nokogiri HTML document.



55
56
57
# File 'lib/http_crawler/net/response.rb', line 55

# Returns the decoded body parsed by Nokogiri::HTML, building it on first use
# and reusing the cached document afterwards.
def html
  return @html if @html

  @html = Nokogiri::HTML(decoding_body)
end

#json ⇒ Object



59
60
61
62
63
# File 'lib/http_crawler/net/response.rb', line 59

# Parses the decoded body as JSON and caches the result.
#
# Some endpoints double-encode their payload (a JSON string whose content is
# itself JSON); in that case the inner document is parsed once more. When the
# string is not JSON, it is returned as the plain string it is.
#
# @return [Object] the parsed JSON value
# @raise [JSON::ParserError] when the body itself is not valid JSON
def json
  return @json if @json

  parsed = JSON.parse(decoding_body)
  if parsed.is_a?(String)
    begin
      parsed = JSON.parse(parsed)
    rescue JSON::ParserError
      # The body was an ordinary JSON string, not a double-encoded document.
    end
  end
  @json = parsed
end

#parsing ⇒ Object

解析



71
72
73
# File 'lib/http_crawler/net/response.rb', line 71

# Parsing hook. This implementation does nothing and returns nil.
def parsing
  # Intentionally empty: an empty method body evaluates to nil.
end

#readability ⇒ Object

通过readability 解析数据



66
67
68
# File 'lib/http_crawler/net/response.rb', line 66

# Returns a Readability::Document built from the decoded body, created on
# first use and cached afterwards. Encoding guessing is switched off via the
# do_not_guess_encoding option.
def readability
  return @readability if @readability

  options = { do_not_guess_encoding: true }
  @readability = Readability::Document.new(decoding_body, options)
end

#web_verify(*arg) ⇒ Object

是否是网站验证 true表示正常数据、false表示弹出网站验证



92
93
94
# File 'lib/http_crawler/net/response.rb', line 92

# Tells whether the response is regular data (true) rather than a site
# verification page (false). This implementation ignores its arguments and
# always answers true.
#
# @param arg [Array] ignored
# @return [Boolean] always true
def web_verify(*arg)
  true
end