Module: GrabberUtils

Defined Under Namespace

Classes: DownloadError

Constant Summary collapse

CACHE_DIR =
'cache'
USER_AGENT =
'Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20100101 Firefox/22.0'
AVG_DAYS_IN_MONTH =

Average number of days per month

365.25 / 12

Instance Method Summary collapse

Instance Method Details

#download(url, encoding = 'UTF-8', options = {}) ⇒ Object



10
11
12
13
14
15
16
17
# File 'lib/web_analytics_discovery/grabberutils.rb', line 10

# Fetches +url+ via #download_file and returns the file's contents as a
# UTF-8 string.
#
# @param url [String] address to fetch
# @param encoding [String] name of the encoding the downloaded bytes are in
# @param options [Hash] passed through unchanged to #download_file
# @return [String] contents transcoded to UTF-8; byte sequences that are
#   invalid in +encoding+, or that have no mapping from it, become '?'
def download(url, encoding = 'UTF-8', options = {})
  fn = download_file(url, options)

  # Hack to work around Ruby 1.9.2+ strict handling of invalid UTF-8
  # characters: the text is round-tripped through UTF-16 so that a real
  # conversion (and therefore replacement of bad bytes) happens even when
  # +encoding+ is already UTF-8.
  # :undef is needed in addition to :invalid, otherwise bytes that are
  # well-formed but unmapped in the source encoding raise
  # Encoding::UndefinedConversionError.
  s = File.read(fn)
  s.encode!('UTF-16', encoding, :invalid => :replace, :undef => :replace, :replace => '?')
  s.encode!('UTF-8', 'UTF-16', :invalid => :replace, :undef => :replace, :replace => '?')
end

#download_file(url, options = {}) ⇒ Object

Downloads a file unless a copy already exists in the cache directory; returns the path of the local copy inside the cache directory



20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# File 'lib/web_analytics_discovery/grabberutils.rb', line 20

# Downloads +url+ with wget into CACHE_DIR unless a file with the target
# name is already there, and returns the path of the local copy.
#
# wget is run with cookies loaded from and saved to 'cookies.txt' and with
# its output appended to 'wget.log' (both relative to the current working
# directory).
#
# @param url [String] address to fetch
# @param options [Hash] string-keyed options:
#   'localfile' - file name to use inside CACHE_DIR instead of the
#   mangled URL; 'Referer' - value sent as the HTTP referer
# @return [String] path of the downloaded (or previously cached) file
# @raise [DownloadError] if wget cannot be started or exits unsuccessfully
def download_file(url, options = {})
  FileUtils.mkdir_p(CACHE_DIR)
  localfile = options['localfile'] || mangle_url(url)
  fn = "#{CACHE_DIR}/#{localfile}"
  return fn if File.exist?(fn)

  # Arguments are passed as a list, so no shell is involved and quotes or
  # other metacharacters in the URL, referer or file name are harmless.
  args = [
    'wget',
    '--append-output=wget.log',
    '--keep-session-cookies',
    "--user-agent=#{USER_AGENT}",
    '--load-cookies=cookies.txt',
    '--save-cookies=cookies.txt',
  ]
  args << "--referer=#{options['Referer']}" if options['Referer']
  # '--' ends option parsing so a URL starting with '-' is not read as a flag.
  args.push('-O', fn, '--', url)

  # system returns false on a non-zero exit and nil when wget cannot be run.
  unless system(*args)
    # Do not leave a partial download behind to be mistaken for a cached
    # file; it may not exist at all if wget never started.
    File.delete(fn) if File.exist?(fn)
    raise DownloadError
  end

  fn
end

#mangle_url(url) ⇒ Object



44
45
46
47
48
49
50
# File 'lib/web_analytics_discovery/grabberutils.rb', line 44

# Turns a URL into a name usable as a single file name.
#
# @param url [String] URL to convert
# @return [String] the URL with every ':' and '/' replaced by '_' when it is
#   shorter than 200 characters, otherwise the MD5 hex digest of the URL
def mangle_url(url)
  return Digest::MD5.hexdigest(url) unless url.length < 200

  url.gsub(/[:\/]/, '_')
end