Module: SpiderHelper

Defined in:
lib/spider_helper.rb

Constant Summary collapse

# Byte-order-mark prefix for each supported Unicode encoding, keyed by the
# encoding name. Every value is tagged with the encoding it belongs to so it
# can be compared against a string carrying that same encoding.
BomHeaderMap =
  {
    'UTF-8' => "\xEF\xBB\xBF",
    'UTF-16BE' => "\xFE\xFF",
    'UTF-16LE' => "\xFF\xFE",
    'UTF-32BE' => "\x00\x00\xFE\xFF",
    'UTF-32LE' => "\xFF\xFE\x00\x00"
  }.each_with_object({}) do |(encoding_name, bom_bytes), map|
    map[encoding_name] = bom_bytes.dup.force_encoding(encoding_name)
  end.freeze

Class Method Summary collapse

Class Method Details

.direct_http_get(href, local_path, params: nil, header: nil, convert_to_utf8: false) ⇒ Object



6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# File 'lib/spider_helper.rb', line 6

# Fetches +href+ with an HTTP GET request and stores the response body in
# +local_path+.
#
# @param href [String, Addressable::URI] target URL; anything that is not
#   already an Addressable::URI is converted through string_to_uri
# @param local_path [String] file to write; missing parent directories are
#   created first
# @param params [Hash, nil] when given, encoded with URI.encode_www_form and
#   assigned as the query string of +href+ (this replaces any existing query
#   and modifies an Addressable::URI passed in by the caller)
# @param header [Hash, nil] extra request headers, copied onto the request
# @param convert_to_utf8 [Boolean] run the body through to_utf8 before writing
# @return [Boolean] true when a success response was written to disk; false
#   for any other response or when a StandardError was rescued
def direct_http_get(href, local_path, params: nil,
                    header: nil, convert_to_utf8: false)
  href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)

  begin
    href.query = URI.encode_www_form(params) if params
    req = Net::HTTP::Get.new(href)
    header.each { |k, v| req[k] = v } if header

    # Net::HTTP speaks plain HTTP unless use_ssl is set, so https:// URLs
    # must opt in explicitly. With use_ssl and no explicit port, Net::HTTP
    # falls back to the HTTPS default port.
    use_ssl = href.scheme.to_s.downcase == 'https'
    res =
      Net::HTTP.start(href.hostname, href.port, use_ssl: use_ssl) do |http|
        http.request(req)
      end

    unless res.is_a?(Net::HTTPSuccess)
      puts res
      return false
    end

    # mkdir_p is a no-op when the directory already exists.
    FileUtils.mkdir_p(File.dirname(local_path))
    content = res.body
    content = to_utf8(content) if convert_to_utf8
    File.write(local_path, content)
    puts 'succeed'
    true
  rescue StandardError => e
    puts e.backtrace
    puts e
    false
  end
end

.direct_http_post(href, local_path, params, header: nil, convert_to_utf8: false) ⇒ Object



39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# File 'lib/spider_helper.rb', line 39

# Sends +params+ to +href+ as a form-encoded HTTP POST request and stores the
# response body in +local_path+.
#
# @param href [String, Addressable::URI] target URL; anything that is not
#   already an Addressable::URI is converted through string_to_uri
# @param local_path [String] file to write; missing parent directories are
#   created first
# @param params [Hash] form fields, passed to Net::HTTP::Post#set_form_data
# @param header [Hash, nil] extra request headers, copied onto the request
# @param convert_to_utf8 [Boolean] run the body through to_utf8 before writing
# @return [Boolean] true when a success response was written to disk; false
#   for any other response or when a StandardError was rescued
def direct_http_post(href, local_path, params,
                     header: nil, convert_to_utf8: false)
  href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)

  begin
    req = Net::HTTP::Post.new(href)
    req.set_form_data(params)
    header.each { |k, v| req[k] = v } if header

    # Net::HTTP speaks plain HTTP unless use_ssl is set, so https:// URLs
    # must opt in explicitly. With use_ssl and no explicit port, Net::HTTP
    # falls back to the HTTPS default port.
    use_ssl = href.scheme.to_s.downcase == 'https'
    res =
      Net::HTTP.start(href.hostname, href.port, use_ssl: use_ssl) do |http|
        http.request(req)
      end

    unless res.is_a?(Net::HTTPSuccess)
      puts res
      return false
    end

    # mkdir_p is a no-op when the directory already exists.
    FileUtils.mkdir_p(File.dirname(local_path))
    content = res.body
    content = to_utf8(content) if convert_to_utf8
    File.write(local_path, content)
    puts 'succeed'
    true
  rescue StandardError => e
    puts e
    false
  end
end

.extract_href_last(origin_href) ⇒ Object



71
72
73
# File 'lib/spider_helper.rb', line 71

# Returns the text after the final '/' separator of +origin_href+.
#
# @param origin_href [String] a URL or path
# @return [String, nil] the last '/'-separated piece, or nil when splitting
#   yields no pieces (for example an empty string)
def extract_href_last(origin_href)
  segments = origin_href.split('/')
  segments.last
end

.smart_to_utf8(str) ⇒ Object

此函数的编码判断有时有误，建议直接使用 to_utf8 函数进行转换。



89
90
91
92
# File 'lib/spider_helper.rb', line 89

# Converts +str+ to UTF-8 only when that is actually needed.
#
# The encoding tag alone is not trusted: a string can be labelled UTF-8 while
# holding bytes from another encoding. Such a string is therefore handed to
# to_utf8 as well, instead of being returned unconverted.
#
# @param str [String] text of unknown or unreliable encoding
# @return [String] +str+ itself when it is tagged UTF-8 and its bytes are
#   valid UTF-8; otherwise the result of to_utf8(str)
def smart_to_utf8(str)
  return str if str.encoding == Encoding::UTF_8 && str.valid_encoding?

  to_utf8(str)
end

.string_to_uri(href) ⇒ Object



75
76
77
78
79
80
# File 'lib/spider_helper.rb', line 75

# Parses +href+ into a normalized Addressable::URI.
#
# The first 'http:///' (three slashes) in the text is collapsed to 'http://'
# before parsing. The replacement is made on a copy, so the caller's string
# is left unmodified and frozen strings are accepted.
#
# @param href [String] URL text
# @return [Addressable::URI] whatever Addressable::URI#normalize! returns for
#   the parsed URI
def string_to_uri(href)
  cleaned = href.sub('http:///', 'http://')
  Addressable::URI.parse(cleaned).normalize!
end

.to_utf8(str) ⇒ Object



94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# File 'lib/spider_helper.rb', line 94

# Re-encodes +str+ to UTF-8 in place, using CharDet to guess its current
# encoding.
#
# When CharDet reports a confidence above 0.6 the string is re-tagged with the
# detected encoding and a leading byte-order mark for that encoding is
# removed. Otherwise the string stays tagged as binary. In both cases it is
# then transcoded to UTF-8, with invalid or undefined sequences replaced
# by '?'.
#
# @param str [String] text to convert; modified in place
# @return [String] the same +str+ object, now encoded as UTF-8
def to_utf8(str)
  # Hand CharDet raw bytes. Per the original note, this works around CharDet
  # being built for ASCII-8BIT on Windows and not accepting UTF-8 strings.
  str.force_encoding(Encoding::ASCII_8BIT)
  cd = CharDet.detect(str)
  # to_f keeps a missing confidence value from raising on comparison.
  if cd['confidence'].to_f > 0.6
    puts cd['encoding']
    str.force_encoding(cd['encoding'])
    # Remove the BOM header, but only when it really is at the start: the
    # same sequence inside the text is content and must be kept. The lookup
    # is case-insensitive because the map keys are upper-case names.
    bom_header = BomHeaderMap[cd['encoding'].to_s.upcase]
    str.slice!(0, bom_header.length) if bom_header && str.start_with?(bom_header)
  end
  str.encode!(Encoding::UTF_8, undef: :replace, replace: '?', invalid: :replace)

  str
end