Method: Wmap::Utils::UrlMagic#normalize_url

Defined in:
lib/wmap/utils/url_magic.rb

#normalize_url(url) ⇒ Object

Normalize the URL into a consistent form so that it can be determined whether a link has already been visited or cached. See en.wikipedia.org/wiki/URL_normalization for more explanation.



271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
# File 'lib/wmap/utils/url_magic.rb', line 271

# Normalize a URL into a canonical form so that two spellings of the same
# link can be recognised as already visited / cached.
#
# Steps performed here:
#   1. Surrounding whitespace is removed (on a copy; the caller's string is
#      never mutated, so frozen strings are accepted).
#   2. A trailing dot after the hostname is dropped,
#      i.e. 'http://www.yahoo.com./' => 'http://www.yahoo.com/'.
#   3. Dot-segments in the path are resolved as described in RFC 3986
#      section 5.2.4: '.' is dropped and '..' removes the preceding segment,
#      i.e. 'http://www.example.com/../a/b/../c/./d.html'
#        => 'http://www.example.com/a/c/d.html'.
#      Only the path is touched; a query string or fragment is kept verbatim,
#      and dot-files such as '/.well-known/x' keep their leading dot.
#
# The site root comes from url_2_site and the remainder from url_2_path.
# NOTE(review): lower-casing of scheme and host is presumably done by
# url_2_site (defined elsewhere) - confirm there.
#
# @param url [String] the URL to normalize
# @return [String] the normalized URL; the site root alone when url_2_path
#   yields nil; the (stripped, if possible) input when any step raises
def normalize_url(url)
  # Non-destructive strip: do not modify the caller's string and do not fail on frozen input
  url = url.strip
  # Case#1, remove the trailing dot after the hostname. The path is appended
  # directly to this value, so it is expected to end with '/'.
  base = url_2_site(url).sub(%r{\./\z}, '/')
  raw_path = url_2_path(url)
  return base if raw_path.nil?

  # The base already supplies the separating '/', so leading slashes are dropped.
  # Query string / fragment are split off so that only the path is normalized.
  path, delimiter, suffix = raw_path.sub(%r{\A/+}, '').partition(/[?#]/)
  parts = path.split('/', -1)
  last_index = parts.size - 1
  segments = []
  parts.each_with_index do |segment, index|
    case segment
    when '.', '..'
      segments.pop if segment == '..'
      # A trailing dot-segment still denotes a directory: keep the final '/'
      segments << '' if index == last_index && !segments.empty?
    when ''
      # Empty segments ('//') are preserved, except at the very start of the path
      segments << segment unless segments.empty?
    else
      segments << segment
    end
  end
  base + segments.join('/') + delimiter + suffix
rescue => ee
  puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
  url
end