Module: Wmap::Utils::UrlMagic

Extended by:
UrlMagic
Included in:
Wmap::Utils, UrlMagic
Defined in:
lib/wmap/utils/url_magic.rb

Instance Method Summary collapse

Instance Method Details

#create_absolute_url_from_base(potential_base, relative_url) ⇒ Object

Create / construct the absolute URL from a known URL and a relative file path. For example, ‘images.search.yahoo.com/images’ + ‘/search/images?p=raiders’ => ‘images.search.yahoo.com/search/images?p=raiders’



271
272
273
274
275
276
277
278
279
280
281
# File 'lib/wmap/utils/url_magic.rb', line 271

# Build an absolute URL by appending a root-relative path to the site base
# of a known URL.
#
# @param potential_base [String] a known URL; only its site part (as produced by url_2_site) is used
# @param relative_url [String] the path to append
# @return [String, nil] the combined URL, or nil when any step raises
def create_absolute_url_from_base(potential_base, relative_url)
  # url_2_site ends its result with '/'; chop drops that last character
  # before the relative path is appended.
  site_without_slash = url_2_site(potential_base).strip.chop
  combined = site_without_slash + relative_url
  puts "Found absolute URL: #{combined}" if @verbose
  combined
rescue => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  nil
end

#create_absolute_url_from_context(potential_base, relative_url) ⇒ Object

Construct the absolute URL by comparing a known URL and the relative file path



284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
# File 'lib/wmap/utils/url_magic.rb', line 284

# Construct an absolute URL by resolving a relative path against the
# directory context of a known URL.
#
# Resolution rules, applied to the whitespace-trimmed base:
# * base ends with '/'                        -> base + relative
# * base is a bare site ("http://host")       -> base + '/' + relative
# * last path segment has no '.' (directory)  -> base + '/' + relative
# * last path segment has a '.' (a file)      -> parent directory + relative
#
# Neither argument is modified, so frozen strings are accepted.
#
# @param potential_base [String] the known URL providing the context
# @param relative_url [String] the relative path; one leading '/' is ignored
# @return [String, nil] the absolute URL, or nil when it cannot be built
def create_absolute_url_from_context(potential_base, relative_url)
  puts "Determine the absolute URL from context:\n Known base: #{potential_base}, Relative path: #{relative_url}" if @verbose
  begin
    base = potential_base.strip
    # Work on a copy: the caller's string must not be altered (and may be frozen).
    relative = relative_url.strip.sub(%r{\A/}, '')
    absolute_url =
      if base.end_with?('/')
        base + relative
      else
        last_slash = base.rindex('/')
        raise "No '/' found in base URL: #{potential_base}" if last_slash.nil?
        last_dot = base.rindex('.')
        # ':/' right before the last slash means that slash belongs to "://",
        # i.e. the base has no path at all. A missing dot, or a dot located
        # before the last slash, means the last segment is treated as a directory.
        if base[last_slash - 2, 2] == ':/' || last_dot.nil? || last_dot < last_slash
          "#{base}/#{relative}"
        else
          # The last segment looks like a file: replace it with the relative path.
          base[0, last_slash + 1] + relative
        end
      end
    puts "Found absolute URL: #{absolute_url}" if @verbose
    absolute_url
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    nil
  end
end

#host_2_url(host, port = 80) ⇒ Object

Input is host and open port, output is a URL for valid http response code or nil



222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
# File 'lib/wmap/utils/url_magic.rb', line 222

# Probe a host/port pair for an http(s) service and return the first URL
# that answers.
#
# Port 80 is tried as http only and port 443 as https only; any other port
# is tried as http first, then https. A response code of 10000 from
# Wmap::UrlChecker#response_code is treated as "no usable response"
# (NOTE(review): sentinel meaning inferred from its use here - confirm
# against UrlChecker).
#
# @param host [String] host name or IP; surrounding whitespace is ignored
# @param port [Integer, String] TCP port to probe, defaults to 80
# @return [String, nil] the responding URL, or nil when nothing answers or an error occurs
def host_2_url(host, port = 80)
  puts "Perform simple http(s) service detection on host #{host}, port #{port}" if @verbose
  begin
    host = host.strip
    # Only URLs that were actually built are probed; no nil placeholder is
    # ever handed to the checker.
    candidates =
      case port.to_i
      when 80  then ["http://#{host}/"]
      when 443 then ["https://#{host}/"]
      else ["http://#{host}:#{port}/", "https://#{host}:#{port}/"]
      end
    puts "Please ensure your internet connection is active before running this method: #{__method__}" if @verbose
    checker = Wmap::UrlChecker.new
    found = candidates.find { |candidate| checker.response_code(candidate) != 10000 }
    if found
      puts "Found URL: #{found}" if @verbose
    else
      puts "No http(s) service found on: #{host}:#{port}" if @verbose
    end
    found
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    nil
  end
end

#is_site?(url) ⇒ Boolean

Simple sanity check on a ‘claimed’ web site base string.

Returns:

  • (Boolean)


55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# File 'lib/wmap/utils/url_magic.rb', line 55

# Check whether a string is a web site base, i.e. a valid URL that is
# identical to its own site form as computed by url_2_site.
#
# The input is trimmed and lower-cased before the comparison.
#
# @param url [String] the claimed site string
# @return [Boolean] true only for a site base; false for anything else,
#   including input that triggers an exception (kept consistent with
#   is_url? and is_ssl?, which also answer false on error)
def is_site?(url)
  puts "Validate the website string format for: #{url}" if @verbose
  begin
    candidate = url.strip.downcase
    unless is_url?(candidate)
      puts "Unknown site format: #{candidate}" if @verbose
      return false
    end
    candidate == url_2_site(candidate)
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    false
  end
end

#is_ssl?(url) ⇒ Boolean Also known as: is_https?

Simple sanity check on a ‘claimed’ SSL enabled URL string

Returns:

  • (Boolean)


38
39
40
41
42
43
44
45
46
47
48
49
50
51
# File 'lib/wmap/utils/url_magic.rb', line 38

# Check whether a URL string uses the https scheme.
#
# The decision is based on the first "http://" or "https://" marker in the
# string, so the word "https" appearing later in the path or query does not
# count.
#
# @param url [String] the claimed URL; surrounding whitespace is ignored
# @return [Boolean] true when is_url? accepts the string and its scheme is
#   https; false otherwise, including on any exception
def is_ssl?(url)
  puts "Validate if SSL is enabled on: #{url}" if @verbose
  begin
    url = url.strip
    return false unless is_url?(url)
    # Leftmost scheme marker, matched case-insensitively.
    scheme = url[%r{https?://}i]
    !scheme.nil? && scheme.downcase == 'https://'
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    false
  end
end

#is_url?(url) ⇒ Boolean

Simple sanity check on a ‘claimed’ URL string.

Returns:

  • (Boolean)


16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# File 'lib/wmap/utils/url_magic.rb', line 16

# Loose format check for a string that claims to be an http(s) URL.
#
# The string must contain "http://" or "https://" followed by at least one
# character. The host is taken as the text up to the first '/' and then up
# to the first ':'; it must be accepted by is_ip? or is_fqdn? (helpers
# defined outside this block).
#
# @param url [String] the claimed URL
# @return [Boolean] true when the host part passes; false otherwise,
#   including on any exception
def is_url?(url)
  puts "Validate the URL format is valid: #{url}" if @verbose
  begin
    unless url =~ /(http|https)\:\/\/((.)+)/i
      puts "Unknown URL format: #{url}" if @verbose
      return false
    end
    # Capture group 2 holds everything after "://".
    authority = Regexp.last_match(2).split('/')[0]
    hostname = authority.split(':')[0]
    (is_ip?(hostname) || is_fqdn?(hostname)) ? true : false
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    false
  end
end

#make_absolute(base, relative_url) ⇒ Object

Convert a relative URL to an absolute one. For example, from URL base ‘games.yahoo.com/’ and file path ‘/game/the-magic-snowman-flash.html’ => ‘games.yahoo.com/game/the-magic-snowman-flash.html’



253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
# File 'lib/wmap/utils/url_magic.rb', line 253

# Turn a relative URL into an absolute one using a known base URL.
#
# A relative URL matching /^\// is delegated to
# create_absolute_url_from_base; anything else is delegated to
# create_absolute_url_from_context.
#
# @param base [String] the known URL
# @param relative_url [String] the relative URL to resolve
# @return [String, nil] whatever the chosen helper returns, or nil on exception
def make_absolute(base, relative_url)
  puts "Determine and return the absolute URL:\n Base: #{base}, Relative: #{relative_url} " if @verbose
  begin
    resolved =
      if relative_url =~ /^\//
        create_absolute_url_from_base(base, relative_url)
      else
        create_absolute_url_from_context(base, relative_url)
      end
    puts "Found absolute URL: #{resolved}" if @verbose
    resolved
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    nil
  end
end

#normalize_url(url) ⇒ Object

Normalize the URL into a consistent form in order to determine if a link has been visited or cached before. See en.wikipedia.org/wiki/URL_normalization for more explanation.



315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
# File 'lib/wmap/utils/url_magic.rb', line 315

# Normalize a URL so that equivalent links compare equal (for visited /
# cached checks).
#
# Steps:
# 1. The site part comes from url_2_site (lower-cased scheme and host); a
#    trailing dot after the host name is removed:
#    'http://www.yahoo.com./' => 'http://www.yahoo.com/'.
# 2. Dot-segments in the path are resolved: '.' is dropped and '..' removes
#    the preceding segment, e.g.
#    'http://www.example.com/../a/b/../c/./d.html' =>
#    'http://www.example.com/a/c/d.html'.
# 3. Everything from the first '?' or '#' on is appended untouched.
#
# The argument itself is never modified, so frozen strings are accepted.
#
# @param url [String] the URL to normalize
# @return [String, nil] the normalized URL; when normalization fails the
#   input is returned as received (whitespace-trimmed if trimming succeeded)
def normalize_url(url)
  begin
    url = url.strip
    base = url_2_site(url).sub(%r{\./\z}, '/')
    raw = url_2_path(url)
    # Separate the hierarchical path from the query/fragment so that only
    # the path is rewritten.
    cut = raw.index(/[?#]/) || raw.length
    path = raw[0, cut].sub(%r{\A/+}, '') # base already ends with '/'
    suffix = raw[cut..-1]
    segments = path.split('/', -1)
    kept = []
    segments.each do |segment|
      case segment
      when '.'  then next
      when '..' then kept.pop # climbing above the root is simply ignored
      else kept << segment
      end
    end
    # A path ending in '.' or '..' denotes a directory: keep the trailing '/'.
    kept << '' if %w[. ..].include?(segments.last) && !kept.empty?
    base + kept.join('/') + suffix
  rescue => ee
    puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
    url
  end
end

#url_2_host(url) ⇒ Object

Extract the web server host’s Fully Qualified Domain Name (FQDN) from the url. For example: “login.yahoo.com/email/help” -> “login.yahoo.com”



99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# File 'lib/wmap/utils/url_magic.rb', line 99

# Extract the host name from a URL, e.g.
# "https://login.yahoo.com/email/help" -> "login.yahoo.com".
#
# The input is trimmed and lower-cased, "http://" / "https://" markers are
# removed, and the host is the text before the first '/', '?' or '#', with
# any ":port" suffix dropped.
#
# @param url [String] the URL to inspect
# @return [String, nil] the lower-cased host, or nil when none can be found
#   or an exception occurs
def url_2_host(url)
  begin
    stripped = url.strip.downcase.gsub(/(http:\/\/|https:\/\/)/, "")
    # '?' and '#' also end the host, so "host?query" does not leak the query
    # into the result.
    authority = stripped.split(%r{[/?#]})[0]
    if authority.nil?
      puts "Error process url: #{stripped}" if @verbose
      return nil
    end
    authority.split(':')[0]
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    nil
  end
end

#url_2_path(url) ⇒ Object

Wrapper to return the relative path component of the URL, e.g. www.yahoo.com/login.html => /login.html



194
195
196
197
198
199
200
201
202
203
204
205
206
# File 'lib/wmap/utils/url_magic.rb', line 194

# Return the part of a URL that follows the site (scheme, host and optional
# port), e.g. "http://www.yahoo.com/login.html" => "/login.html".
#
# url_2_site is used only to validate that a site can be derived. The prefix
# is then removed with a case-insensitive pattern, so upper-case hosts and
# explicit ports are handled and the path keeps its original case. The
# argument is not modified.
#
# @param url [String] the URL to inspect
# @return [String, nil] the path including any query/fragment ("" when the
#   URL has none), or nil when the site cannot be determined
def url_2_path(url)
  #puts "Retrieve the relative path component of the url: #{url}" if @verbose
  begin
    trimmed = url.strip
    raise "Unable to determine the site of #{url}" if url_2_site(trimmed).nil?
    # Drop anything before the scheme, the scheme itself and the authority
    # (everything up to the first '/', '?' or '#').
    trimmed.sub(%r{\A.*?https?://[^/?#]*}i, '')
  rescue => ee
    puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
    nil
  end
end

#url_2_port(url) ⇒ Object

Extract the web service port from the URL. For example: “https://login.yahoo.com/email/help” -> 443



117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# File 'lib/wmap/utils/url_magic.rb', line 117

# Extract the service port from a URL.
#
# An explicit "host:port" wins; otherwise the port follows the scheme:
# 443 for https and 80 for everything else. The scheme is taken from the
# first "http://" / "https://" marker only, and the host ends at the first
# '/', '?' or '#', so text later in the URL cannot influence the result.
#
# @param url [String] the URL to inspect
# @return [Integer, nil] the port number, or nil when no host can be found
#   or an exception occurs
def url_2_port(url)
  puts "Retrieve service port on URL: #{url}" if @verbose
  begin
    lowered = url.strip.downcase
    ssl = lowered[%r{https?://}] == 'https://'
    authority = lowered.gsub(/(http:\/\/|https:\/\/)/, "").split(%r{[/?#]})[0]
    raise "No host found in URL: #{url}" if authority.nil?
    pieces = authority.split(':')
    port =
      if pieces.length == 2
        pieces[1].to_i
      elsif ssl
        443
      else
        80
      end
    puts "The service port: #{port}" if @verbose
    port
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    nil
  end
end

#url_2_site(url) ⇒ Object

Extract site in (host:port) format from a url: “login.yahoo.com:8443/email/help” -> “login.yahoo.com:8443/”



141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
# File 'lib/wmap/utils/url_magic.rb', line 141

# Extract the site base ("scheme://host[:port]/") from a URL, e.g.
# "https://login.yahoo.com:8443/email/help" -> "https://login.yahoo.com:8443/".
#
# Rules:
# * The URL is trimmed and lower-cased, and anything in front of the first
#   "http" is discarded.
# * The host ends at the first '/', '?' or '#'. This covers uncommon inputs
#   such as "https://letmechoose.barclays.co.uk?source=btorganic/" without a
#   separate host-format check.
# * Without an explicit port, https means 443 and http means 80.
# * Port 443 is always reported with the https scheme.
# * The port is omitted only when it is the default for the reported scheme,
#   so "https://host:80/" keeps its ":80".
#
# @param url [String] the URL to inspect
# @return [String, nil] the site base, or nil when the URL has no http(s)
#   scheme, has no host, or an exception occurs
def url_2_site(url)
  puts "Retrieve the web site base for url: #{url}" if @verbose
  begin
    cleaned = url.strip.downcase.sub(/^(.*?)http/i, 'http')
    prot, remainder = cleaned.split(%r{//}, 2)
    raise "Unknown url format: #{url}" unless prot =~ /\Ahttps?:\z/ && remainder
    # step 1, extract the host:port pair from the url
    authority = remainder.split(%r{[/?#]})[0].to_s
    host, port_text = authority.split(':')
    raise "Problem encountered on method url_2_site: Unable to convert #{url}" if host.nil? || host.empty?
    port =
      if port_text.nil? || port_text.empty?
        prot == 'https:' ? 443 : 80
      else
        port_text.to_i
      end
    # step 2, put the host:port pair back to the normal site format
    prot = 'https:' if port == 443
    default_port = prot == 'https:' ? 443 : 80
    if port == default_port
      "#{prot}//#{host}/"
    else
      "#{prot}//#{host}:#{port}/"
    end
  rescue => ee
    puts "Exception on method #{__method__}: #{ee}" if @verbose
    nil
  end
end

#urls_on_same_domain?(url1, url2) ⇒ Boolean

Test if the two URLs are both under the same domain: login.yahoo.com, mail.yahoo.com => true

Returns:

  • (Boolean)


209
210
211
212
213
214
215
216
217
218
219
# File 'lib/wmap/utils/url_magic.rb', line 209

# Tell whether two URLs belong to the same domain, e.g. login.yahoo.com and
# mail.yahoo.com => true.
#
# Each URL is reduced to its host with url_2_host, each host to its domain
# root with get_domain_root (a helper defined outside this block), and the
# two roots are compared.
#
# @param url1 [String] first URL
# @param url2 [String] second URL
# @return [Boolean, nil] result of comparing the two domain roots, or nil
#   when an exception occurs
def urls_on_same_domain?(url1, url2)
  puts "Determine if two URLs under the same domain: #{url1}, #{url2}" if @verbose
  begin
    first_host, second_host = [url1, url2].map { |link| url_2_host(link) }
    first_root, second_root = [first_host, second_host].map { |name| get_domain_root(name) }
    first_root == second_root
  rescue => ee
    puts "Error searching the object content: #{ee}" if @verbose
    nil
  end
end