Module: SpiderHelper
- Defined in:
- lib/spider_helper.rb
Constant Summary collapse
# Byte-order-mark (BOM) prefix for each supported encoding, keyed by encoding
# name. Each value is the single character U+FEFF encoded in (and tagged with)
# the named encoding, so it can be compared directly against text tagged with
# that same encoding.
#
# The values are produced by encoding U+FEFF rather than by calling
# force_encoding on string literals, so building the table never mutates a
# literal (which raises FrozenError when string literals are frozen). Both the
# hash and its values are frozen.
BomHeaderMap = %w[UTF-8 UTF-16BE UTF-16LE UTF-32BE UTF-32LE].each_with_object({}) do |name, map|
  map[name] = "\uFEFF".encode(name).freeze
end.freeze
Class Method Summary collapse
- .direct_http_get(href, local_path, params: nil, header: nil, convert_to_utf8: false) ⇒ Object
- .direct_http_post(href, local_path, params, header: nil, convert_to_utf8: false) ⇒ Object
- .extract_href_last(origin_href) ⇒ Object
-
.smart_to_utf8(str) ⇒ Object
此函数的编码判断有时有误,建议直接使用 to_utf8 函数进行转换。
- .string_to_uri(href) ⇒ Object
- .to_utf8(str) ⇒ Object
Class Method Details
.direct_http_get(href, local_path, params: nil, header: nil, convert_to_utf8: false) ⇒ Object
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
# File 'lib/spider_helper.rb', line 6
#
# Fetches +href+ with an HTTP GET request and writes the response body to
# +local_path+, creating the parent directory when needed.
#
# @param href [String, Addressable::URI] URL to fetch; anything that is not an
#   Addressable::URI is converted with #to_s and parsed via string_to_uri
# @param local_path [String] file the response body is written to
# @param params [Hash, nil] when given, form-encoded and used as the query
#   string (replacing any query already present on the URL)
# @param header [Hash, nil] request headers to set
# @param convert_to_utf8 [Boolean] pass the body through to_utf8 before writing
# @return [Boolean] true when the response is a Net::HTTPSuccess and the file
#   was written; false otherwise (any StandardError is printed and swallowed)
def direct_http_get(href, local_path, params: nil, header: nil, convert_to_utf8: false)
  # Work on a private copy so neither the caller's URI object nor the caller's
  # string is modified by URL clean-up or by setting the query below.
  href = href.is_a?(Addressable::URI) ? href.dup : string_to_uri(href.to_s.dup)

  begin
    href.query = URI.encode_www_form(params) if params
    req = Net::HTTP::Get.new(href)
    header.each { |k, v| req[k] = v } if header

    # inferred_port falls back to the scheme's default port when the URL has
    # no explicit one; use_ssl is required for https URLs to be fetched at all.
    res = Net::HTTP.start(href.hostname, href.inferred_port, use_ssl: href.scheme == 'https') do |http|
      http.request(req)
    end

    if res.is_a?(Net::HTTPSuccess)
      FileUtils.mkdir_p(File.dirname(local_path))
      # A successful response may carry no body; treat that as empty content.
      content = res.body.to_s
      content = to_utf8(content) if convert_to_utf8
      File.write(local_path, content)
      puts 'succeed'
      return true
    end

    puts res
  rescue StandardError => e
    puts e.backtrace
    puts e
  end

  false
end
.direct_http_post(href, local_path, params, header: nil, convert_to_utf8: false) ⇒ Object
39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
# File 'lib/spider_helper.rb', line 39
#
# Sends +params+ to +href+ as a form-encoded HTTP POST and writes the response
# body to +local_path+, creating the parent directory when needed.
#
# @param href [String, Addressable::URI] URL to post to; anything that is not
#   an Addressable::URI is converted with #to_s and parsed via string_to_uri
# @param local_path [String] file the response body is written to
# @param params [Hash] form fields passed to Net::HTTP::Post#set_form_data
# @param header [Hash, nil] request headers to set
# @param convert_to_utf8 [Boolean] pass the body through to_utf8 before writing
# @return [Boolean] true when the response is a Net::HTTPSuccess and the file
#   was written; false otherwise (any StandardError is printed and swallowed)
def direct_http_post(href, local_path, params, header: nil, convert_to_utf8: false)
  # Parse from a copy so URL clean-up never modifies the caller's string.
  href = string_to_uri(href.to_s.dup) unless href.is_a?(Addressable::URI)

  begin
    req = Net::HTTP::Post.new(href)
    req.set_form_data(params)
    header.each { |k, v| req[k] = v } if header

    # inferred_port falls back to the scheme's default port when the URL has
    # no explicit one; use_ssl is required for https URLs to be reached at all.
    res = Net::HTTP.start(href.hostname, href.inferred_port, use_ssl: href.scheme == 'https') do |http|
      http.request(req)
    end

    if res.is_a?(Net::HTTPSuccess)
      FileUtils.mkdir_p(File.dirname(local_path))
      # A successful response may carry no body; treat that as empty content.
      content = res.body.to_s
      content = to_utf8(content) if convert_to_utf8
      File.write(local_path, content)
      puts 'succeed'
      return true
    end

    puts res
  rescue StandardError => e
    puts e
  end

  false
end
.extract_href_last(origin_href) ⇒ Object
71 72 73 |
# File 'lib/spider_helper.rb', line 71
#
# Returns the text after the final '/' separator of +origin_href+.
#
# @param origin_href [String] URL or path to split on '/'
# @return [String, nil] the last segment produced by String#split, or nil when
#   the split yields no segments (e.g. an empty string)
def extract_href_last(origin_href)
  segments = origin_href.split('/')
  segments.last
end
.smart_to_utf8(str) ⇒ Object
此函数的编码判断有时有误,建议直接使用 to_utf8 函数进行转换
89 90 91 92 |
# File 'lib/spider_helper.rb', line 89
#
# Converts +str+ to UTF-8 only when it does not already look like UTF-8.
#
# The string is returned untouched only when it is tagged UTF-8 AND its bytes
# are valid UTF-8. The encoding tag alone is not trusted: a string tagged UTF-8
# that holds bytes from another encoding is handed to to_utf8 instead of being
# returned as-is.
#
# NOTE: this remains a heuristic -- bytes from another encoding that happen to
# be valid UTF-8 are still returned unchanged. Call to_utf8 directly when the
# input is known not to be UTF-8.
#
# @param str [String] text to check
# @return [String] +str+ itself when already valid UTF-8, otherwise the result
#   of to_utf8(str)
def smart_to_utf8(str)
  return str if str.encoding == Encoding::UTF_8 && str.valid_encoding?

  to_utf8(str)
end
.string_to_uri(href) ⇒ Object
75 76 77 78 79 80 |
# File 'lib/spider_helper.rb', line 75
#
# Parses +href+ into a normalized Addressable::URI.
#
# The first occurrence of the malformed prefix 'http:///' (three slashes) is
# rewritten to 'http://' before parsing. The rewrite is done on a copy, so the
# caller's string is left untouched and frozen strings are accepted.
#
# @param href [String] URL text
# @return [Addressable::URI] the parsed URI after normalize!
def string_to_uri(href)
  cleaned = href.sub('http:///', 'http://')
  Addressable::URI.parse(cleaned).normalize!
end
.to_utf8(str) ⇒ Object
94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 |
# File 'lib/spider_helper.rb', line 94
#
# Detects the encoding of +str+ with CharDet and transcodes it to UTF-8.
#
# When the detector reports a confidence above 0.6, the bytes are interpreted
# in the detected encoding and a leading byte-order mark listed in BomHeaderMap
# is stripped. Otherwise the bytes stay tagged as binary. In both cases bytes
# that cannot be converted are replaced with '?'.
#
# A non-frozen +str+ is converted in place and returned; a frozen +str+ is
# copied first and the converted copy is returned.
#
# @param str [String] raw text in an unknown encoding
# @return [String] the text encoded as UTF-8
def to_utf8(str)
  # In-place conversion is impossible on a frozen string; work on a copy.
  str = str.dup if str.frozen?

  # Work around CharDet being built for ASCII-8BIT on Windows, which makes it
  # incompatible with UTF-8 tagged input.
  str.force_encoding(Encoding::ASCII_8BIT)
  cd = CharDet.detect(str)
  detected = cd['encoding']

  if detected && cd['confidence'] > 0.6
    puts detected
    begin
      str.force_encoding(detected)
    rescue ArgumentError
      # The detector reported an encoding name Ruby does not know; leave the
      # bytes tagged as binary and let the replacement rules below apply.
    else
      # Strip the BOM only when it is the leading character; the same character
      # appearing later in the text is content and must be kept.
      bom_header = BomHeaderMap[detected]
      str.delete_prefix!(bom_header) if bom_header
    end
  end

  str.encode!(Encoding::UTF_8, undef: :replace, replace: '?', invalid: :replace)
  str
end