Class: WebPageParser::HTTP::Session

Inherits:
Object
  • Object
show all
Defined in:
lib/web-page-parser/http.rb

Defined Under Namespace

Classes: CurlError

Instance Method Summary collapse

Instance Method Details

#curl ⇒ Object



19
20
21
22
23
24
25
26
27
28
29
30
31
# File 'lib/web-page-parser/http.rb', line 19

# Returns this session's Curl::Easy handle, creating and configuring it on
# first use and handing back the same instance on every later call.
#
# The handle is set up with 8 for both the overall and connect timeouts, a
# DNS cache timeout of 600, cookies enabled, redirect following capped at
# 6 hops with automatic Referer, a Firefox User-Agent string and an
# Accept-encoding header advertising gzip and deflate.
#
# @return [Curl::Easy] the memoized handle
def curl
  return @curl if @curl

  handle = Curl::Easy.new

  # Time limits
  handle.timeout = 8
  handle.connect_timeout = 8
  handle.dns_cache_timeout = 600

  # Cookie and redirect behaviour
  handle.enable_cookies = true
  handle.follow_location = true
  handle.max_redirects = 6
  handle.autoreferer = true

  # Request headers
  handle.headers["User-Agent"] = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.0.4) Gecko/20060508 Firefox/1.5.0.4'
  handle.headers["Accept-encoding"] = 'gzip, deflate'

  # Only memoize once the handle is fully configured.
  @curl = handle
end

#get(url) ⇒ Object



33
34
35
36
37
38
39
40
41
42
43
44
45
46
# File 'lib/web-page-parser/http.rb', line 33

# Fetches +url+ through the session's curl handle and wraps the decoded
# body in a Response.
#
# The body is tried as gzip first, then as deflate; if neither decoder
# yields a result the body is used as received.
#
# @param url [String] the address assigned to the curl handle
# @return [Response] built from the decoded body and the curl handle
# @raise [CurlError] when curl.perform returns false
def get(url)
  curl.url = url
  raise CurlError, "curl.perform returned false" if curl.perform == false

  # First decoder to return non-nil wins; otherwise keep the raw body.
  final_body = gunzip(curl.body_str) || inflate(curl.body_str) || curl.body_str

  # Not sure if this is right. works for BBC/Guardian/New York Times anyway
  final_body.force_encoding("utf-8") if final_body.respond_to?(:force_encoding)

  Response.new(final_body, curl)
end

#gunzip(s) ⇒ Object



54
55
56
57
58
59
60
# File 'lib/web-page-parser/http.rb', line 54

# Attempts to decode +s+ as a gzip stream.
#
# @param s [String] raw bytes that may or may not be gzip-compressed
# @return [String, nil] the decompressed data, or nil when zlib rejects
#   the input
def gunzip(s)
  # GzipReader.wrap closes the reader once the block returns, including
  # when reading raises part-way through, so no reader is left open.
  Zlib::GzipReader.wrap(StringIO.new(s), &:read)
rescue Zlib::Error
  # Zlib::GzipFile::Error and Zlib::DataError both descend from
  # Zlib::Error; any zlib failure here just means "not gzip".
  nil
end

#inflate(s) ⇒ Object



48
49
50
51
52
# File 'lib/web-page-parser/http.rb', line 48

# Attempts to decode +s+ as a zlib (deflate) stream.
#
# @param s [String] raw bytes that may or may not be deflate-compressed
# @return [String, nil] the decompressed data, or nil when zlib rejects
#   the input
def inflate(s)
  Zlib::Inflate.inflate(s)
rescue Zlib::Error
  # Rescue the whole Zlib::Error family rather than only Zlib::DataError:
  # a truncated stream raises Zlib::BufError, and callers use this method
  # as a "decode if possible" probe that must not raise on bad input.
  nil
end