Class: GoogleSafeBrowsing::Canonicalize

Inherits:
Object
  • Object
show all
Defined in:
lib/google_safe_browsing/canonicalize.rb

Overview

Helpers to Canonicalize urls and generate url permutations for lookups

Constant Summary collapse

PROTOCOL_DELIMITER =
'://'
DEFAULT_PROTOCOL =
'http'

Class Method Summary collapse

Class Method Details

.cart_prod(a_one, a_two) ⇒ Array

Returns the cartesian product of two arrays, concatenating the string representations of each pair of elements

Parameters:

  • a_one (Array)

    array of strings

  • a_two (Array)

    array of strings

Returns:

  • (Array)

    cartesian product of the arrays with each pair of elements concatenated



131
132
133
134
135
136
137
138
139
# File 'lib/google_safe_browsing/canonicalize.rb', line 131

# Builds every pairwise concatenation of the elements of two collections.
#
# Each pair is interpolated into a new string, so non-string elements are
# rendered through their string representation. The result is ordered by
# +a_one+ first and by +a_two+ within each element of +a_one+.
#
# @param a_one [Array] elements used as the left-hand part of each string
# @param a_two [Array] elements used as the right-hand part of each string
# @return [Array<String>] one string per (a_one, a_two) pair
def self.cart_prod(a_one, a_two)
  a_one.each_with_object([]) do |prefix, combined|
    a_two.each { |suffix| combined << "#{prefix}#{suffix}" }
  end
end

.fix_host(host) ⇒ String

Apply initial fixes to host string

Parameters:

  • host (String)

    host string

Returns:

  • (String)

    standardized host string



183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
# File 'lib/google_safe_browsing/canonicalize.rb', line 183

# Apply initial fixes to the host part of a url.
#
# Steps, in order:
# 1. strip leading/trailing dots and collapse runs of dots into one;
# 2. lowercase the whole string;
# 3. split off credentials and port;
# 4. if the bare host looks like an IP address (a plain integer or a dotted
#    quad), replace it with the address string produced by the IP library;
# 5. re-attach credentials and port.
#
# The IP conversion is applied to the bare host only. Feeding it the full
# string (credentials and port included) turns "user@3279880203" into
# "0.0.0.0" and leaves "1.2.3.4:80" unnormalised.
#
# The argument is not modified; a new string is returned.
#
# NOTE(review): IP / IP::V4 are defined outside this block (presumably the
# ruby-ip gem) and #blank? comes from ActiveSupport - confirm both are loaded.
#
# @param host [String] host string, possibly with "creds@" and ":port"
# @return [String] standardized host string
def self.fix_host(host)
  # Non-destructive equivalents of the dot clean-up and downcasing.
  host = host.gsub(/\A\.+|\.+\Z/, '').gsub(/\.+/, '.').downcase

  host_splits = split_username_password_and_port(host)
  bare_host = host_splits[:host]

  if bare_host =~ /\A\d+\z/
    # Whole host is a decimal integer: treat it as a packed IPv4 address.
    bare_host = IP::V4.new(bare_host.to_i).to_addr
  elsif bare_host =~ /\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}/
    begin
      bare_host = IP.new(bare_host).to_addr
    rescue ArgumentError
      # Contains something quad-like but is not a parseable address;
      # deliberately keep the host exactly as written.
    end
  end

  result = bare_host
  result = "#{host_splits[:creds]}@#{result}" unless host_splits[:creds].blank?
  result = "#{result}:#{host_splits[:port]}" unless host_splits[:port].blank?
  result
end

.fix_path(path) ⇒ String

Apply initial fixes to path string

Parameters:

  • path (String)

    path string

Returns:

  • (String)

    standardized path string



212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
# File 'lib/google_safe_browsing/canonicalize.rb', line 212

# Apply initial fixes to the path part of a url.
#
# The returned path has no leading slash. Everything from the first '?'
# onwards (the query string) is carried over untouched. In the path portion,
# runs of slashes are collapsed, "." segments are dropped and ".." segments
# remove the preceding segment. A trailing slash on the path portion is kept
# as long as the resolved path is not empty.
#
# Fixes over the previous implementation:
# * a path starting with '?' no longer has its query string duplicated
#   (the old inclusive slice 0..-1 returned the whole string);
# * the trailing slash is detected on the path portion, not on the last
#   character of the query string;
# * all leading slashes are stripped, not just the first one;
# * the argument is never mutated, so frozen strings are accepted.
#
# @param path [String] path string, optionally followed by "?query"
# @return [String] standardized path string
def self.fix_path(path)
  path = path.sub(%r{\A/+}, '')

  # Separate the query string first so it cannot influence path handling.
  params = nil
  query_start = path.index('?')
  if query_start
    params = path[query_start..-1]
    path = path[0...query_start]
  end

  preserve_trailing_slash = path.end_with?('/')

  # Resolve "." and ".." while rebuilding the list of segments.
  segments = []
  path.squeeze('/').split('/').each do |segment|
    case segment
    when '.' then next
    when '..' then segments.pop
    else segments << segment
    end
  end

  fixed = segments.join('/')
  # An empty path must stay empty: the caller supplies the root slash.
  fixed += '/' if preserve_trailing_slash && !fixed.empty?
  fixed += params if params
  fixed
end

.generate_path_strings(raw_path) ⇒ Array

Generates the path permutations from the raw path string

Parameters:

  • raw_path (String)

    path split from the full url string

Returns:

  • (Array)

    array of path permutation strings



87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# File 'lib/google_safe_browsing/canonicalize.rb', line 87

# Generates the path permutations from the raw path string.
#
# The raw path is split at the FIRST '?' into a path and a query string, so
# a query that itself contains '?' is kept whole. Permutations are built from
# at most the first three path components: the root "/", then the
# progressively shorter prefixes. A candidate that contains no '.' gets a
# trailing slash appended, runs of slashes are collapsed and duplicates are
# removed. When a non-blank query string is present, every candidate that
# does not end in '/' is additionally offered with "?query" appended.
#
# @param raw_path [String] path split from the full url string (no leading
#   slash), optionally followed by "?query"
# @return [Array<String>] array of path permutation strings
def self.generate_path_strings(raw_path)
  return ['/', ''] if raw_path == ''

  # Limit of 2: only the first '?' separates path from query.
  path, params = raw_path.split('?', 2)
  path ||= ''
  params ||= ''

  components = path.split('/').first(3)
  path_strings = ['/']
  components.length.downto(1) do |count|
    path_strings << "/#{components.first(count).join('/')}"
  end

  path_strings = path_strings.map do |candidate|
    # No dot means no file extension, so a trailing slash is added.
    candidate += '/' unless candidate.include?('.')
    candidate.gsub(%r{/+}, '/')
  end.uniq

  # Whitespace-only query strings count as absent.
  return path_strings if params.strip.empty?

  with_query = path_strings.map do |candidate|
    candidate.end_with?('/') ? candidate : "#{candidate}?#{params}"
  end
  path_strings | with_query
end

.recursively_unescape(url) ⇒ String

Continues to unescape the url until unescaping has no effect

Parameters:

  • url (String)

    url string

Returns:

  • (String)

    fully unescaped url string



169
170
171
172
173
174
175
176
177
# File 'lib/google_safe_browsing/canonicalize.rb', line 169

# Percent-decodes the url repeatedly until a pass leaves it unchanged, so
# multiply-encoded input such as "%2541" ends up fully decoded ("A").
#
# Uses URI::DEFAULT_PARSER.unescape because URI.unescape no longer exists
# on Ruby 3.0 and later.
#
# @param url [String] url string
# @return [String] fully unescaped url string (always a new string)
def self.recursively_unescape(url)
  loop do
    decoded = URI::DEFAULT_PARSER.unescape(url)
    # A pass that changes nothing means there is nothing left to decode.
    return decoded if decoded == url

    url = decoded
  end
end

.remove_fragment(string) ⇒ String

Strips the fragment portion of the url string (the first ‘#’ and everything after)

Parameters:

  • string (String)

    url

Returns:

  • (String)

    parameter with the fragment removed



160
161
162
163
# File 'lib/google_safe_browsing/canonicalize.rb', line 160

# Strips the fragment portion of the url string: the first '#' and
# everything after it.
#
# An exclusive range is used so that a string that starts with '#' yields
# an empty string. The previous inclusive slice ended at index -1 in that
# case and returned the input unchanged.
#
# @param string [String] url
# @return [String] the url without its fragment; the argument itself when it
#   contains no '#'
def self.remove_fragment(string)
  fragment_start = string.index('#')
  return string unless fragment_start

  string[0...fragment_start]
end

.remove_port(host_string) ⇒ String

Strip port number from host string

Parameters:

  • host_string (String)

    host portion of the url

Returns:

  • (String)

    host part without the port number



282
283
284
# File 'lib/google_safe_browsing/canonicalize.rb', line 282

# Strip the port number from a host string.
#
# Delegates the parsing to split_port and returns only its :host entry.
#
# @param host_string [String] host portion of the url
# @return [String] host part without the port number
def self.remove_port(host_string)
  host_and_port = split_port(host_string)
  host_and_port[:host]
end

.remove_protocol(cann) ⇒ String

Strip the leading protocol from the url string

Parameters:

  • cann (String)

    url string

Returns:

  • (String)

    url string without the protocol



259
260
261
262
263
264
265
266
267
# File 'lib/google_safe_browsing/canonicalize.rb', line 259

# Strip the leading protocol from the url string.
#
# When the string contains PROTOCOL_DELIMITER, the text before its first
# occurrence is remembered in @protocol and the text after it is returned.
# A string without the delimiter is returned as-is and @protocol is left
# untouched.
#
# If nothing precedes the delimiter ("://host"), @protocol is set to nil.
# The previous inclusive slice ended at index -1 in that case and stored
# the entire input as the protocol.
#
# @param cann [String] url string
# @return [String] url string without the protocol
def self.remove_protocol(cann)
  scheme, delimiter, remainder = cann.partition(PROTOCOL_DELIMITER)
  # partition yields an empty separator when the delimiter is absent.
  return cann if delimiter.empty?

  @protocol = scheme.empty? ? nil : scheme
  remainder
end

.remove_username_and_password(host_string) ⇒ String

Strip user name and password from host part of url

Parameters:

  • host_string (String)

    host portion of the url

Returns:

  • (String)

    host part of url without user name or password



290
291
292
# File 'lib/google_safe_browsing/canonicalize.rb', line 290

# Strip user name and password from the host part of a url.
#
# Delegates the parsing to split_username_and_password and returns only its
# :host entry.
#
# @param host_string [String] host portion of the url
# @return [String] host part of the url without user name or password
def self.remove_username_and_password(host_string)
  creds_and_host = split_username_and_password(host_string)
  creds_and_host[:host]
end

.split_host_path(cann) ⇒ Hash

Takes the canonicalized url and splits the host and the path apart

Parameters:

  • cann (String)

    canonicalized url string

Returns:

  • (Hash)

    { :host => host_part, :path => path_part }



145
146
147
148
149
150
151
152
153
154
# File 'lib/google_safe_browsing/canonicalize.rb', line 145

# Takes the canonicalized url (without protocol) and splits it at the first
# '/' into host and path. The separating slash belongs to neither part.
#
# A string without any '/' is all host with an empty path. A string that
# starts with '/' has an empty host. The previous inclusive slice ended at
# index -1 in that case and returned the whole input as the host.
#
# @param cann [String] canonicalized url string
# @return [Hash] { :host => host_part, :path => path_part }
def self.split_host_path(cann)
  host, _slash, path = cann.partition('/')
  { :host => host, :path => path }
end

.split_port(host_string) ⇒ Hash

Split port number and host string into a hash

Parameters:

Returns:

  • (Hash)

    :host has the host string, :port holds the port number



316
317
318
319
320
321
322
323
324
325
326
327
328
# File 'lib/google_safe_browsing/canonicalize.rb', line 316

# Split the port number and the host string into a hash.
#
# The split happens at the LAST ':' in the string, which is also the colon
# the previous implementation looked for with rindex. It then split on every
# colon and took the first two pieces, so "a:b:80" produced host "a" and
# port "b".
#
# An empty port ("host:") is reported as nil, as before.
#
# @param host_string [String] host portion of the url
# @return [Hash] :host has the host string, :port holds the port number
#   string, or nil when there is none
def self.split_port(host_string)
  host, colon, port = host_string.rpartition(':')
  # rpartition yields an empty separator when there is no colon at all.
  return { :host => host_string, :port => nil } if colon.empty?

  { :host => host, :port => (port.empty? ? nil : port) }
end

.split_username_and_password(host_string) ⇒ Hash

Split the user name and password from the host

Parameters:

Returns:

  • (Hash)

    :host has the host string, :creds holds the username and password string



298
299
300
301
302
303
304
305
306
307
308
309
310
# File 'lib/google_safe_browsing/canonicalize.rb', line 298

# Split the user name and password ("creds") from the host.
#
# The split happens at the LAST '@'. Everything before it is the
# credentials and everything after it is the host, so credentials that
# themselves contain '@' no longer leak into the host. A string ending in
# '@' yields an empty host string rather than nil.
#
# @param host_string [String] host portion of the url
# @return [Hash] :host has the host string, :creds holds the username and
#   password string, or nil when there is no '@'
def self.split_username_and_password(host_string)
  creds, at_sign, host = host_string.rpartition('@')
  # rpartition yields an empty separator when there is no '@' at all.
  return { :host => host_string, :creds => nil } if at_sign.empty?

  { :host => host, :creds => creds }
end

.split_username_password_and_port(host_string) ⇒ Hash

Split the user name, password and port from the host string

Parameters:

  • host_string (String)

    host portion of the url

Returns:

  • (Hash)

    :host as the host string; :creds has the username and password; :port holds the port number



334
335
336
337
# File 'lib/google_safe_browsing/canonicalize.rb', line 334

# Split the user name, password and port from the host string.
#
# Credentials are separated first; the port is then separated from what
# remains, and the two result hashes are merged.
#
# @param host_string [String] host portion of the url
# @return [Hash] :host has the host string; :creds has the username and
#   password; :port holds the port number
def self.split_username_password_and_port(host_string)
  creds_and_host = split_username_and_password(host_string)
  host_and_port = split_port(creds_and_host[:host])
  creds_and_host.merge(host_and_port)
end

.strict_escape(url) ⇒ String

Escape the url, but do not escape certain characters, such as the caret

Parameters:

  • url (String)

    url string

Returns:

  • (String)

    escaped url string



246
247
248
249
250
251
252
253
# File 'lib/google_safe_browsing/canonicalize.rb', line 246

# Percent-escape the url, but leave certain characters unescaped; currently
# only the caret ('^') is restored after escaping.
#
# Uses URI::DEFAULT_PARSER.escape because URI.escape no longer exists on
# Ruby 3.0 and later.
#
# @param url [String] url string
# @return [String] escaped url string (a new string)
def self.strict_escape(url)
  # Restore the caret; other optionally-escapable characters may need the
  # same treatment.
  URI::DEFAULT_PARSER.escape(url).gsub('%5E', '^')
end

.strip_username_password_and_port_from_host(host_string) ⇒ String

Strip the user name, password and port number from the url

Parameters:

  • host_string (String)

    host portion of the url

Returns:

  • (String)

    host portion of the url without the username, password and port



273
274
275
276
# File 'lib/google_safe_browsing/canonicalize.rb', line 273

# Strip the user name, password and port number from the host part of a url.
#
# Credentials are removed BEFORE the port. Credentials of the form
# "user:password" contain a colon of their own, so removing the port first
# cut "user:pw@host:80" down to "user" and lost the real host.
#
# @param host_string [String] host portion of the url
# @return [String] host portion of the url without the username, password
#   and port
def self.strip_username_password_and_port_from_host(host_string)
  without_creds = remove_username_and_password(host_string)
  remove_port(without_creds)
end

.url(raw_url) ⇒ String

Base Canonicalizer method

Parameters:

  • raw_url (String)

    uncanonicalized url string

Returns:

  • (String)

    canonicalized url string



16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# File 'lib/google_safe_browsing/canonicalize.rb', line 16

# Base canonicalizer method.
#
# Pipeline: take a binary-encoded private copy of the input, drop tabs /
# carriage returns / line feeds, trim surrounding whitespace, remove the
# fragment, unescape until stable, split off the protocol, fix host and path
# separately, re-attach the protocol (DEFAULT_PROTOCOL when the input had
# none) and finally escape the result.
#
# Fixes over the previous implementation:
# * the caller's string is no longer re-encoded or edited in place (and a
#   frozen string no longer raises);
# * @protocol is reset on every call, so a url without a protocol gets
#   DEFAULT_PROTOCOL instead of whatever scheme an earlier call left behind.
#
# NOTE(review): @protocol is still shared state on the receiver; concurrent
# calls can observe each other's value - confirm whether that matters here.
#
# @param raw_url [#to_s] uncanonicalized url string
# @return [String] canonicalized url string
def self.url(raw_url)
  # ASCII-8BIT avoids InvalidByteSequenceError on non-UTF-8 input; dup keeps
  # the re-encoding away from the caller's object.
  cann = raw_url.to_s.dup.force_encoding("ASCII-8BIT")

  # remove tabs, carriage returns and line feeds, then surrounding whitespace
  cann = cann.delete("\t\r\n").gsub(/\A\s+|\s+\Z/, '')

  cann = remove_fragment(cann)

  # repeatedly unescape until no more escaping
  cann = recursively_unescape(cann)

  # Forget the protocol of any previous call before parsing this one.
  @protocol = nil
  cann = remove_protocol(cann)

  # split into host and path components and fix each on its own
  splits = split_host_path(cann)
  cann = fix_host(splits[:host]) + '/' + fix_path(splits[:path])

  # add leading protocol
  @protocol ||= DEFAULT_PROTOCOL
  strict_escape(@protocol + PROTOCOL_DELIMITER + cann)
end

.urls_for_lookup(lookup_url) ⇒ Array

Generate the url permutations for lookup

Parameters:

  • lookup_url (String)

    uncanonicalized url string

Returns:

  • (Array)

    array of canonicalized url permutation strings



54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# File 'lib/google_safe_browsing/canonicalize.rb', line 54

# Generate the url permutations for lookup.
#
# The url is canonicalized, its protocol removed, and the remainder split
# into host and path. Host candidates are the bare host (credentials and
# port stripped) plus progressively shorter suffixes built from at most the
# last five parts returned by TopLevelDomain.split_from_host; the final
# single part is never offered on its own. Every host candidate is combined
# with every path permutation.
#
# NOTE(review): TopLevelDomain.split_from_host is defined elsewhere;
# presumably it returns the host labels with the TLD kept as one part -
# confirm.
#
# @param lookup_url [String] uncanonicalized url string
# @return [Array<String>] canonicalized url permutation strings; empty when
#   canonicalization yields a blank value or the host contains no period
def self.urls_for_lookup(lookup_url)
  canonical = url(lookup_url)
  # blank canonical form means an invalid url
  return [] if canonical.blank?

  parts = split_host_path(remove_protocol(canonical))
  bare_host = strip_username_password_and_port_from_host(parts[:host])

  # a usable host has at least one period
  return [] unless bare_host.include?('.')

  labels = TopLevelDomain.split_from_host(bare_host).last(5)
  suffixes = (0...(labels.length - 1)).map { |start| labels[start..-1].join('.') }
  host_candidates = ([bare_host] + suffixes).uniq

  cart_prod(host_candidates, generate_path_strings(parts[:path]))
end