Method: Wmap::Utils::UrlMagic#normalize_url
- Defined in:
- lib/wmap/utils/url_magic.rb
#normalize_url(url) ⇒ Object
Normalize the URL into a consistent form in order to determine whether a link has been visited or cached before. See en.wikipedia.org/wiki/URL_normalization for more explanation.
271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 |
# File 'lib/wmap/utils/url_magic.rb', line 271
# Normalize a URL into a consistent form so that two spellings of the same
# link compare equal (e.g. when checking whether a link was already visited
# or cached). See en.wikipedia.org/wiki/URL_normalization for background.
#
# Steps performed here:
# 1. Surrounding whitespace is removed (on a copy; the caller's string is
#    never modified).
# 2. The site base comes from url_2_site, and a trailing dot after the
#    hostname is dropped: 'http://www.yahoo.com./' => 'http://www.yahoo.com/'.
# 3. Dot-segments in the path are resolved, i.e.
#    'http://www.example.com/../a/b/../c/./d.html' => 'http://www.example.com/a/c/d.html'.
#    The query string / fragment is carried over untouched.
#
# NOTE(review): url_2_site and url_2_path are defined elsewhere; this method
# presumably relies on url_2_site returning a lower-cased base that ends in
# '/' -- confirm against those helpers.
#
# @param url [String] the URL to normalize
# @return [String] the normalized URL, or the (stripped) input when
#   normalization raises
def normalize_url(url)
  # Rebind to a stripped copy instead of strip!: no side effect on the
  # caller's object, and frozen strings are accepted.
  url = url.strip
  # Case#1, remove the trailing dot after the hostname.
  base = url_2_site(url).sub(%r{\./\z}, '/')
  # A nil path is treated as an empty one, so the bare base is returned.
  raw_path = url_2_path(url).to_s
  # Only the path proper is normalized; anything from the first '?' or '#'
  # onwards is data and must not be rewritten.
  cut = raw_path.index(/[?#]/)
  path = cut ? raw_path[0...cut] : raw_path
  suffix = cut ? raw_path[cut..-1] : ''
  # The base supplies the leading '/', so drop it from the path here.
  parts = path.sub(%r{\A/+}, '').split('/', -1)
  last_index = parts.size - 1
  resolved = []
  parts.each_with_index do |segment, index|
    case segment
    when '.'
      # A trailing '.' still denotes a directory: keep the final slash.
      resolved << '' if index == last_index
    when '..'
      # Step up one level; popping an empty stack is a no-op, so the path
      # can never climb above the site root.
      resolved.pop
      resolved << '' if index == last_index
    else
      resolved << segment
    end
  end
  base + resolved.join('/') + suffix
rescue => ee
  puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
  url
end