Module: Wmap::Utils::UrlMagic

Extended by:
UrlMagic
Included in:
Wmap::Utils, UrlMagic
Defined in:
lib/wmap/utils/url_magic.rb

Constant Summary collapse

Max_http_timeout =

Hard stop limit for HTTP time-outs, in milliseconds (15000 ms = 15 seconds), in order to avoid a severe performance penalty on certain ‘weird’ site(s)

15000
User_agent =
"OWASP WMAP Spider"

Instance Method Summary collapse

Instance Method Details

#create_absolute_url_from_base(potential_base, relative_url) ⇒ Object

Create / construct the absolute URL from a known URL and a relative file path. For example, ‘images.search.yahoo.com/images’ + ‘/search/images?p=raiders’ => ‘images.search.yahoo.com/search/images?p=raiders’



232
233
234
235
236
237
238
239
240
# File 'lib/wmap/utils/url_magic.rb', line 232

# Build an absolute URL by appending a root-relative path to the site base
# (scheme://host[:port]) of a known URL.
#
# @param potential_base [String] a known absolute URL
# @param relative_url [String] path to append to the site base
# @return [String, nil] the absolute URL, or nil when anything goes wrong
def create_absolute_url_from_base(potential_base, relative_url)
  site_root = url_2_site(potential_base).strip
  # url_2_site ends its result with '/'; drop that last character before joining
  absolute = site_root.chop + relative_url
  puts "Found absolute URL: #{absolute}" if @verbose
  absolute
rescue => error
  puts "Exception on method #{__method__}: #{error}" if @verbose
  nil
end

#create_absolute_url_from_context(potential_base, relative_url) ⇒ Object

Construct the absolute URL by comparing a known URL and the relative file path



243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
# File 'lib/wmap/utils/url_magic.rb', line 243

# Construct an absolute URL by resolving a relative path against a known URL.
#
# Resolution rules:
# * base ends with '/'                         -> base + relative
# * base is only scheme://host (no path)       -> base + '/' + relative
# * last path segment has no '.' (a directory) -> base + '/' + relative
# * last path segment looks like a file        -> directory of base + relative
#
# Neither argument is modified.
#
# @param potential_base [String] a known absolute URL
# @param relative_url [String] relative path; one leading '/' is ignored
# @return [String, nil] the absolute URL, or nil when it cannot be built
def create_absolute_url_from_context(potential_base, relative_url)
  puts "Determine the absolute URL from context:\n Known base: #{potential_base}, Relative path: #{relative_url}" if @verbose
  base = potential_base.strip
  # Work on a copy: the caller's string must not be altered (and may be frozen)
  relative = relative_url.strip.sub(/\A\//, '')
  absolute_url =
    if base.end_with?('/')
      base + relative
    else
      last_slash = base.rindex('/')
      raise "No '/' found in base URL: #{potential_base}" if last_slash.nil?
      last_dot = base.rindex('.')
      # True when the last '/' belongs to the '://' scheme separator
      host_only = base[0..last_slash].end_with?('://')
      if host_only || last_dot.nil? || last_dot < last_slash
        "#{base}/#{relative}"
      else
        # Last segment looks like a file name: replace it with the relative path
        base[0, last_slash + 1] + relative
      end
    end
  puts "Found absolute URL: #{absolute_url}" if @verbose
  absolute_url
rescue => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  nil
end

#host_2_url(host, port = 80) ⇒ Object

Input is host and open port, output is a URL for valid http response code or nil



187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
# File 'lib/wmap/utils/url_magic.rb', line 187

# Detect an http(s) service on host:port and return its base URL.
#
# Port 80 is probed over http only and port 443 over https only; any other
# port is probed over http first, then https. A candidate counts as found when
# Wmap::UrlChecker#response_code returns anything other than 10000.
#
# @param host [String] host name or IP address
# @param port [Integer, String] service port (default 80)
# @return [String, nil] the first responding URL, or nil if none responds
def host_2_url(host, port = 80)
  puts "Perform simple http(s) service detection on host #{host}, port #{port}" if @verbose
  host = host.strip
  candidates =
    case port.to_i
    when 80  then ["http://#{host}/"]
    when 443 then ["https://#{host}/"]
    else ["http://#{host}:#{port}/", "https://#{host}:#{port}/"]
    end
  puts "Please ensure your internet connection is active before running this method: #{__method__}" if @verbose
  checker = Wmap::UrlChecker.new
  # Only real candidates are probed; the checker is never handed a nil URL
  found = candidates.find { |candidate| checker.response_code(candidate) != 10000 }
  if found
    puts "Found URL: #{found}" if @verbose
  else
    puts "No http(s) service found on: #{host}:#{port}" if @verbose
  end
  found
rescue => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  nil
end

#is_site?(url) ⇒ Boolean

Simple sanity check on a ‘claimed’ web site base string.

Returns:

  • (Boolean)


56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# File 'lib/wmap/utils/url_magic.rb', line 56

# Check whether a string is a web site base, i.e. a valid URL that is identical
# to the site string url_2_site derives from it.
#
# @param url [String] candidate site string; stripped and downcased before checking
# @return [Boolean] true for a site base; false otherwise, including on any error
def is_site?(url)
  puts "Validate the website string format for: #{url}" if @verbose
  url = url.strip.downcase
  unless is_url?(url)
    puts "Unknown site format: #{url}" if @verbose
    return false
  end
  url == url_2_site(url)
rescue => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  # A predicate should answer false, not nil, when the input cannot be checked
  false
end

#is_ssl?(url) ⇒ Boolean Also known as: is_https?

Simple sanity check on a ‘claimed’ SSL enabled URL string

Returns:

  • (Boolean)


41
42
43
44
45
46
47
48
49
50
51
52
# File 'lib/wmap/utils/url_magic.rb', line 41

# Check whether a URL string claims the https scheme.
#
# Only the scheme is inspected (the first "http://" or "https://" in the
# string), so "https" appearing in the path or query does not count.
#
# @param url [String] candidate URL
# @return [Boolean] true for a valid https URL; false otherwise, including on any error
def is_ssl?(url)
  puts "Validate if SSL is enabled on: #{url}" if @verbose
  url = url.strip
  return false unless is_url?(url)
  scheme = url[/https?(?=:\/\/)/i]
  !scheme.nil? && scheme.casecmp('https').zero?
rescue => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  false
end

#is_url?(url) ⇒ Boolean

Simple sanity check on a ‘claimed’ URL string.

Returns:

  • (Boolean)


21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# File 'lib/wmap/utils/url_magic.rb', line 21

# Check whether a string looks like an http(s) URL whose host part is an IP
# address or a fully qualified domain name.
#
# @param url [String] candidate URL
# @return [Boolean] true when the format is accepted; false otherwise, including on any error
def is_url?(url)
  puts "Validate the URL format is valid: #{url}" if @verbose
  unless url =~ /(http|https)\:\/\/((.)+)/i
    puts "Unknown URL format: #{url}" if @verbose
    return false
  end
  # Second capture is everything after '://'; keep the part before the first '/'
  authority = Regexp.last_match(2).split('/').first
  # Drop an optional ':port' suffix
  hostname = authority.split(':').first
  return true if is_ip?(hostname)
  return true if is_fqdn?(hostname)
  false
rescue => error
  puts "Exception on method #{__method__}: #{error}" if @verbose
  false
end

#landing_location(depth = 5, url) ⇒ Object

Test the URL / site and return the landing URL location (recursive; with the default depth of 5 it follows at most 4 redirects)



439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
# File 'lib/wmap/utils/url_magic.rb', line 439

# Follow 3xx redirects from a URL and return the URL where they end.
#
# The depth is decremented on entry and the recursion stops once it drops
# below 1, so the default of 5 follows at most 4 redirects.
#
# @param depth [Integer] remaining recursion budget (default 5)
# @param url [String] URL to test
# @return [String, nil] the landing URL; the last known URL when the budget is
#   exhausted or a redirect carries no usable location; nil on error
def landing_location(depth = 5, url)
  depth -= 1
  return url if depth < 1
  # Validation only: a malformed URL raises here and yields nil via the rescue
  URI.parse(url)
  code = response_code(url)
  return url unless code >= 300 && code < 400
  target = redirect_location(url)
  # A redirect without a location cannot be followed; keep the last good URL
  return url if target.nil? || target.empty?
  landing_location(depth, target)
rescue => ee
  # StandardError only, so Interrupt / SystemExit still reach the caller
  puts "Exception on method #{__method__} on URL #{url}: #{ee}" if @verbose
  nil
end

#make_absolute(base, relative_url) ⇒ Object

Convert a relative URL to an absolute one. For example, from URL base ‘games.yahoo.com/’ and file path ‘/game/the-magic-snowman-flash.html’ => ‘games.yahoo.com/game/the-magic-snowman-flash.html’



216
217
218
219
220
221
222
223
224
225
226
227
228
229
# File 'lib/wmap/utils/url_magic.rb', line 216

# Convert a relative URL to an absolute one.
#
# A path starting with '/' is resolved against the site base of +base+
# (create_absolute_url_from_base); anything else is resolved against the
# location of +base+ itself (create_absolute_url_from_context).
#
# @param base [String] a known absolute URL
# @param relative_url [String] the relative URL to resolve
# @return [String, nil] the absolute URL, or nil on failure
def make_absolute(base, relative_url)
  puts "Determine and return the absolute URL:\n Base: #{base}, Relative: #{relative_url} " if @verbose
  root_relative = relative_url =~ /^\//
  absolute_url =
    if root_relative
      create_absolute_url_from_base(base, relative_url)
    else
      create_absolute_url_from_context(base, relative_url)
    end
  puts "Found absolute URL: #{absolute_url}" if @verbose
  absolute_url
rescue => error
  puts "Exception on method #{__method__}: #{error}" if @verbose
  nil
end

#normalize_url(url) ⇒ Object

Normalize the URL to a consistent form in order to determine whether a link has been visited or cached before. See en.wikipedia.org/wiki/URL_normalization for more explanation.



272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
# File 'lib/wmap/utils/url_magic.rb', line 272

# Normalize a URL so that equivalent links compare equal (used to decide
# whether a link has been visited or cached before).
#
# Steps:
# 1. the site base comes from url_2_site; a trailing dot after the host name
#    is removed ('http://www.yahoo.com./' => 'http://www.yahoo.com/')
# 2. leading '/' and '.' characters of the path are removed
# 3. dot-segments in the path are resolved: '.' is dropped and '..' removes the
#    preceding segment, e.g. 'a/b/../c/./d.html' => 'a/c/d.html'. The query
#    string and fragment are left untouched.
#
# The argument is not modified.
#
# @param url [String] URL to normalize
# @return [String] the normalized URL; on any error the input is returned
#   (stripped when stripping was possible)
def normalize_url(url)
  cleaned = url
  cleaned = url.strip
  # Case#1, remove the trailing dot after the hostname
  base = url_2_site(cleaned).sub(/\.\/$/, '/')
  # Retrieve the file path and remove the leading '/' or '.' characters,
  # i.e. 'http://www.example.com/./mypath' => 'mypath'
  raw_path = url_2_path(cleaned).sub(/^(\/|\.)*/, '')
  # Only the path proper may contain dot-segments; keep query/fragment verbatim
  path_part, delimiter, remainder = raw_path.partition(/[?#]/)
  segments = path_part.split('/', -1)
  kept = []
  segments.each_with_index do |segment, index|
    is_last = (index == segments.length - 1)
    case segment
    when '.'
      kept << '' if is_last   # a trailing '.' keeps the directory's trailing slash
    when '..'
      kept.pop
      kept << '' if is_last
    else
      kept << segment
    end
  end
  base + kept.join('/') + delimiter + remainder
rescue => ee
  puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
  cleaned
end

#open_page(url) ⇒ Object

Given a URL, open the page, then return the DOM text from a normal user perspective



380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
# File 'lib/wmap/utils/url_magic.rb', line 380

# Open a page and return its parsed document.
#
# The page is first fetched with open-uri. If its text asks for JavaScript to
# be enabled, it is loaded again in headless Chrome through Watir and the
# rendered HTML is parsed instead.
#
# NOTE(review): `open(url, args)` relies on open-uri patching Kernel#open,
# which Ruby 3.0 removed in favour of URI.open - confirm the supported Ruby
# versions before changing the call.
#
# @param url [String] page URL
# @return [Nokogiri::HTML::Document] the parsed page on success
# @return [String, nil] on error: the text of whatever document had already
#   been parsed, or nil if the first fetch itself failed
def open_page(url)
  doc = nil
  browser = nil
  args = {ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE, allow_redirections: :safe,
    read_timeout: Max_http_timeout/1000, "User-Agent"=>User_agent}
  doc = Nokogiri::HTML(open(url, args))
  if doc.text.include?("Please enable JavaScript to view the page content")
    puts "Invoke headless chrome through webdriver ..." if @verbose
    # http://watir.com/guides/chrome/
    browser = Watir::Browser.new :chrome, headless: true, switches: %w[--user-agent=OWASP\ WMAP\ Spider]
    browser.goto(url)
    sleep(2) # wait for the loading
    doc = Nokogiri::HTML(browser.html)
    browser.close
    browser = nil # already closed; keeps the rescue below from closing it twice
  end
  puts doc.text if @verbose
  return doc
rescue => ee
  puts "Exception on method #{__method__} for #{url}: #{ee}"
  browser.close unless browser.nil?
  # doc is still nil when the initial fetch failed; calling #text on it would
  # raise a second exception out of this handler
  return doc.nil? ? nil : doc.text
end

#redirect_location(url) ⇒ Object Also known as: location

Test the URL / site and return the redirection location (3xx response code only)



405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
# File 'lib/wmap/utils/url_magic.rb', line 405

# Return the redirection target of a URL.
#
# The response code is checked first (response_code); only for a 3xx code is
# the URL requested again to read the Location header.
#
# NOTE(review): the URL is downcased in full, path and query included, before
# it is requested - confirm this is intended for case-sensitive servers.
#
# @param url [String] URL to test
# @return [String] the Location header value; "" when the URL is invalid, is
#   not redirected, or any error occurs
def redirect_location(url)
  puts "Test the redirection location for the url: #{url}" if @verbose
  location = ""
  raise "Invalid url: #{url}" unless is_url?(url)
  url = url.strip.downcase
  timeo = Max_http_timeout/1000.0 # constant is in milliseconds, Net::HTTP wants seconds
  uri = URI.parse(url)
  code = response_code(url)
  return location unless code >= 300 && code < 400

  http = Net::HTTP.new(uri.host, uri.port)
  http.open_timeout = timeo
  http.read_timeout = timeo
  if url =~ /https\:/i
    http.use_ssl = true
    # Bypass the remote web server cert validation test
    http.verify_mode = OpenSSL::SSL::VERIFY_NONE
    # Reuse the SSL version remembered by response_code, when there is one
    http.ssl_version = @ssl_version if @ssl_version
  end
  response = http.request(Net::HTTP::Get.new(uri.request_uri))
  puts "Response: #{response}" if @verbose
  location = response['location'] if response.is_a?(Net::HTTPRedirection)
  return location
rescue => ee
  # StandardError only, so Interrupt / SystemExit still reach the caller
  puts "Exception on method redirect_location for URL #{url}: #{ee}" if @verbose
  return ""
end

#response_code(url) ⇒ Object

Test the URL and return the response code



297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
# File 'lib/wmap/utils/url_magic.rb', line 297

# Request a URL and return its HTTP response code.
#
# The result is also stored in the @url_code hash, keyed by the stripped,
# downcased URL.
#
# Failure handling:
# * 10000 is the catch-all code for an invalid URL or any connection error
# * 104 is returned for Errno::ECONNRESET
# * on a time-out of an https URL the request is retried once with
#   ssl_version :SSLv3; if the retry succeeds that version is remembered in
#   @ssl_version, and if it fails the result stays 10000
#
# NOTE(review): the URL is downcased in full, path and query included, before
# it is requested - confirm this is intended for case-sensitive servers.
#
# @param url [String] URL to test
# @return [Integer] the HTTP status code, or 104 / 10000 as described above
def response_code(url)
  puts "Check the http response code on the url: #{url}" if @verbose
  code = 10000	# All unknown url connection exceptions go here
  raise "Invalid url: #{url}" unless is_url?(url)
  url = url.strip.downcase
  timeo = Max_http_timeout/1000.0 # constant is in milliseconds, Net::HTTP wants seconds
  uri = URI.parse(url)
  http = Net::HTTP.new(uri.host, uri.port)
  http.open_timeout = timeo
  http.read_timeout = timeo
  if url =~ /https\:/i
    http.use_ssl = true
    # Bypass the remote web server cert validation test
    http.verify_mode = OpenSSL::SSL::VERIFY_NONE
  end
  request = Net::HTTP::Get.new(uri.request_uri)
  response = http.request(request)
  puts "Server response the following: #{response}" if @verbose
  code = response.code.to_i
  @url_code = Hash.new unless @url_code
  @url_code[url] = code
  puts "Response code on #{url}: #{code}" if @verbose
  return code
rescue => ee
  # StandardError only, so Interrupt / SystemExit still reach the caller
  puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
  case ee
  when Errno::ECONNRESET
    # "Connection reset by peer"
    code = 104
  when Timeout::Error
    # Quick fix: try again for an ssl time-out, in case the default protocol failed
    if http && request && url =~ /https\:/i
      begin
        http.ssl_version = :SSLv3
        code = http.request(request).code.to_i
        @ssl_version = http.ssl_version
      rescue => retry_error
        # The retry is best effort: a second failure must not escape this handler
        puts "Exception on method #{__method__} for #{url} (retry): #{retry_error}" if @verbose
      end
    end
  end
  # Every other error keeps the default code of 10000
  @url_code = Hash.new unless @url_code
  @url_code[url] = code
  return code
end

#response_headers(url) ⇒ Object

Test the URL and return the response headers



348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
# File 'lib/wmap/utils/url_magic.rb', line 348

# Request a URL and return its response headers.
#
# @param url [String] URL to test; stripped and downcased before the request
# @return [Hash, nil] header name => value, or nil when the URL is invalid or
#   any error occurs
def response_headers(url)
  puts "Check the http response headers on the url: #{url}" if @verbose
  raise "Invalid url: #{url}" unless is_url?(url)
  collected = Hash.new
  target = url.strip.downcase
  timeout_seconds = Max_http_timeout/1000.0
  parsed = URI.parse(target)
  connection = Net::HTTP.new(parsed.host, parsed.port)
  connection.open_timeout = timeout_seconds
  connection.read_timeout = timeout_seconds
  if target =~ /https\:/i
    connection.use_ssl = true
    # Bypass the remote web server cert validation test
    connection.verify_mode = OpenSSL::SSL::VERIFY_NONE
  end
  reply = connection.request(Net::HTTP::Get.new(parsed.request_uri))
  puts "Server response the following: #{reply}" if @verbose
  reply.each_header do |name, value|
    puts "#{name} => #{value}" if @verbose
    collected[name] = value
  end
  puts "Response headers on #{target}: #{collected}" if @verbose
  collected
rescue => error
  puts "Exception on method #{__method__}: #{error}" if @verbose
  nil
end

#url_2_host(url) ⇒ Object

Extract the web server host’s Fully Qualified Domain Name (FQDN) from the url. For example: “login.yahoo.com/email/help” -> “login.yahoo.com”



75
76
77
78
79
80
81
82
83
84
85
86
87
88
# File 'lib/wmap/utils/url_magic.rb', line 75

# Extract the host part of a URL, e.g.
# "https://login.yahoo.com/email/help" => "login.yahoo.com".
#
# The URL is stripped and downcased, every "http://" / "https://" occurrence
# is removed, and the text before the first '/' and before the first ':' is
# returned.
#
# @param url [String] URL to inspect
# @return [String, nil] the host, or nil when it cannot be extracted
def url_2_host(url)
  url = url.strip.downcase.gsub(/(http:\/\/|https:\/\/)/, "")
  authority = url.split('/').first
  if authority.nil?
    puts "Error process url: #{url}"
    return nil
  end
  # Drop an optional ':port' suffix
  authority.split(':').first
rescue => error
  puts "Exception on method #{__method__}: #{error}" if @verbose
  nil
end

#url_2_path(url) ⇒ Object

Wrapper to return relative path component of the URL. i.e. www.yahoo.com/login.html => /login.html



164
165
166
167
168
169
170
171
172
173
# File 'lib/wmap/utils/url_magic.rb', line 164

# Return everything that follows scheme://host[:port] in a URL, i.e.
# "http://www.yahoo.com/login.html" => "/login.html". A query string or
# fragment is part of the result.
#
# The site prefix is located in the URL itself, so upper-case schemes or
# hosts and explicit default ports ("http://host:80/x") are handled. The
# argument is not modified.
#
# @param url [String] an http(s) URL
# @return [String, nil] the path component ("" when there is none), or nil
#   when the string is not an http(s) URL or any error occurs
def url_2_path(url)
  cleaned = url.strip
  # Anything before the first http(s):// is ignored; the authority ends at the
  # first '/', '?' or '#'
  site_prefix = cleaned[/\A.*?https?:\/\/[^\/?#]+/i]
  raise "Unknown url format: #{url}" if site_prefix.nil?
  cleaned[site_prefix.length..-1]
rescue => ee
  puts "Exception on method #{__method__} for #{url}: #{ee}" if @verbose
  nil
end

#url_2_port(url) ⇒ Object

Extract the web service port from the url. For example: “https://login.yahoo.com/email/help” -> 443



91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# File 'lib/wmap/utils/url_magic.rb', line 91

# Extract the service port from a URL.
#
# An explicit "host:port" wins; otherwise the port follows the scheme: 443
# for https and 80 for anything else. Only the scheme (the first "http://" or
# "https://" in the string) decides, so "https" appearing in the path or
# query does not change the result.
#
# @param url [String] URL to inspect
# @return [Integer, nil] the port, or nil when it cannot be determined
def url_2_port(url)
  puts "Retrieve service port on URL: #{url}" if @verbose
  scheme = url[/https?(?=:\/\/)/i]
  ssl = !scheme.nil? && scheme.casecmp('https').zero?
  authority = url.downcase.gsub(/(http:\/\/|https:\/\/)/, "").split('/').first
  host_and_port = authority.split(':')
  if host_and_port.length == 2
    puts "The service port: #{host_and_port[1]}" if @verbose
    host_and_port[1].to_i
  elsif ssl
    puts "The service port: 443" if @verbose
    443
  else
    puts "The service port: 80" if @verbose
    80
  end
rescue => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  nil
end

#url_2_site(url) ⇒ Object

Extract site in (host:port) format from a url: “login.yahoo.com:8443/email/help” -> “login.yahoo.com:8443/”



113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
# File 'lib/wmap/utils/url_magic.rb', line 113

# Extract the site base (protocol, host and port, ending with '/') from a URL,
# e.g. "https://login.yahoo.com:8443/email/help" => "https://login.yahoo.com:8443/".
#
# The URL is downcased first. Ports 80 and 443 are left out of the result, and
# port 443 always yields the "https:" protocol.
#
# @param url [String] URL to inspect
# @return [String, nil] the site base, or nil when the URL cannot be converted
#   (every exception raised inside the method is rescued)
def url_2_site (url)
	puts "Retrieve the web site base for url: #{url}" if @verbose
	url = url.downcase
	# drop whatever precedes the first "http"
	url = url.sub(/^(.*?)http/i,'http')
	# split on "//": entry[0] is the protocol (e.g. "http:"), entry[1] the remainder
	entry = url.split(%r{\/\/})
	prot=entry[0]
	# step 1, extract the host:port pair from the url
	host_port=entry[1].split(%r{\/})[0]
	if host_port =~ /\:/
		# explicit port given
		host=host_port.split(%r{\:})[0]
		port=host_port.split(%r{\:})[1].to_i
	elsif prot =~ /https/i
		# no explicit port: default by protocol (https is tested before http)
		host=host_port
		port=443
	elsif prot =~ /http/i
		host=host_port
		port=80
	else
		# unknown protocol: port stays nil; the result is rejected by the /http/ check below
		host=host_port
		#raise "Unknown url format: #{url}"
	end
	# additional logic to handle uncommon url base structures
	unless is_fqdn?(host)
		case host
			# "https://letmechoose.barclays.co.uk?source=btorganic/" => "https://letmechoose.barclays.co.uk"
			when /\?|\#/
				# keep only the part before the query string / fragment marker
				host=host.split(%r{\?|\#})[0]
			else
				#do nothing
		end
	end
	# step 2, put the host:port pair back to the normal site format
	prot="https:" if port==443
	if port==80 || port==443
		# default ports are omitted from the site string
		site=prot+"//"+host+"/"
	else
		site=prot+"//"+host+":"+port.to_s+"/"
	end
	if site=~ /http/i
		#puts "Base found: #{site}" if @verbose
		return site
	else
		# the raise is caught by the rescue below, so the caller receives nil;
		# the `return nil` that follows it is never reached
		raise "Problem encountered on method url_2_site: Unable to convert #{url}"
		return nil
	end
rescue => ee
	puts "Exception on method #{__method__}: #{ee}" if @verbose
	return nil
end

#urls_on_same_domain?(url1, url2) ⇒ Boolean

Test if the two URLs are both under the same domain: login.yahoo.com, mail.yahoo.com => true

Returns:

  • (Boolean)


176
177
178
179
180
181
182
183
184
# File 'lib/wmap/utils/url_magic.rb', line 176

# Test whether two URLs are under the same domain, e.g.
# login.yahoo.com and mail.yahoo.com => true.
#
# The hosts are extracted with url_2_host and their domain roots (as reported
# by get_domain_root) are compared.
#
# @param url1 [String] first URL
# @param url2 [String] second URL
# @return [Boolean] true when both domain roots are equal; false otherwise,
#   including when a host cannot be extracted or any error occurs
def urls_on_same_domain?(url1, url2)
  puts "Determine if two URLs under the same domain: #{url1}, #{url2}" if @verbose
  host1 = url_2_host(url1)
  host2 = url_2_host(url2)
  # url_2_host returns nil on failure; two unparsable URLs must not compare as equal
  return false if host1.nil? || host2.nil?
  get_domain_root(host1) == get_domain_root(host2)
rescue => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  false
end