Class: GoogleSafeBrowsing::Canonicalize

Inherits:
Object
  • Object
show all
Defined in:
lib/google_safe_browsing/canonicalize.rb

Overview

Helpers to Canonicalize urls and generate url permutations for lookups

Constant Summary collapse

PROTOCOL_DELIMITER =
'://'
DEFAULT_PROTOCOL =
'http'

Class Method Summary collapse

Class Method Details

.cart_prod(a_one, a_two) ⇒ Array

Returns the cartesian product of two arrays by concatenation of the string representation of the elements

Parameters:

  • a_one (Array)

    array of strings

  • a_two (Array)

    array of strings

Returns:

  • (Array)

    cartesian product of arrays with elements concatenated



122
123
124
125
126
127
128
129
130
# File 'lib/google_safe_browsing/canonicalize.rb', line 122

# Builds every pairwise concatenation of the elements of two arrays.
#
# Each element is converted through string interpolation, so non-string
# elements contribute their `to_s` form.
#
# @param a_one [Array] left-hand elements (outer iteration)
# @param a_two [Array] right-hand elements (inner iteration)
# @return [Array<String>] concatenated pairs, ordered by a_one and then a_two
def self.cart_prod(a_one, a_two)
  a_one.flat_map do |prefix|
    a_two.map { |suffix| "#{prefix}#{suffix}" }
  end
end

.fix_host(host) ⇒ String

Apply initial fixes to host string

Parameters:

  • host (String)

    host string

Returns:

  • (String)

    standardized host string



175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
# File 'lib/google_safe_browsing/canonicalize.rb', line 175

# Apply initial fixes to the host portion of a url.
#
# Strips leading/trailing dots, collapses runs of dots, lowercases the string,
# normalizes numeric hosts to dotted IP notation and re-attaches any
# credentials and port that were present.
#
# The argument is not modified; a new string is returned.
#
# @param host [String] host portion of the url (may carry "creds@" and ":port")
# @return [String] standardized host string
def self.fix_host(host)
  # Non-destructive equivalents of the cleanup steps so the caller's string
  # (possibly frozen) is left untouched.
  host = host.gsub(/\A\.+|\.+\Z/, '').gsub(/\.+/, '.').downcase

  host_splits = split_username_password_and_port(host)
  bare_host = host_splits[:host]

  # The IP conversions must only see the bare host: feeding them the full
  # string (with credentials or port) yields 0 from to_i or a parse failure.
  # NOTE(review): IP / IP::V4 are presumably provided by the ruby-ip gem -
  # confirm against the file's requires.
  if bare_host =~ /\A\d+\z/
    # a host made only of digits is treated as an integer IPv4 address
    host_splits[:host] = IP::V4.new(bare_host.to_i).to_addr
  elsif bare_host =~ /\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}/
    begin
      host_splits[:host] = IP.new(bare_host).to_addr
    rescue ArgumentError
      # Looks like a dotted quad but could not be parsed as an address;
      # deliberately keep the host unchanged.
    end
  end

  result = host_splits[:host]
  result = "#{host_splits[:creds]}@#{result}" unless host_splits[:creds].blank?
  result = "#{result}:#{host_splits[:port]}" unless host_splits[:port].blank?
  result
end

.fix_path(path) ⇒ String

Apply initial fixes to path string

Parameters:

  • path (String)

    path string

Returns:

  • (String)

    standardized path string



203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
# File 'lib/google_safe_browsing/canonicalize.rb', line 203

# Apply initial fixes to the path portion of a url.
#
# Removes one leading slash, drops empty and '.' segments, resolves '..'
# against the preceding segment and keeps the query string (everything from
# the first '?') untouched. A trailing slash on the path part is preserved
# unless the path collapses to nothing.
#
# The argument is not modified; a new string is returned.
#
# @param path [String] path string, optionally followed by "?query"
# @return [String] standardized path string without a leading slash
def self.fix_path(path)
  # remove leading slash
  path = path[1..-1] if path[0..0] == '/'

  query_start = path.index('?')
  if query_start
    params = path[query_start..-1]
    # exclusive range: with '?' at index 0 an inclusive "0..-1" range would
    # keep the whole string and duplicate the query
    path = path[0...query_start]
  end

  # decided after the query has been cut off so a '/' inside the query
  # cannot be mistaken for a trailing slash of the path
  preserve_trailing_slash = path.end_with?('/')

  segments = []
  path.split('/').each do |segment|
    case segment
    when '', '.'
      next # repeated slashes and "current directory" add nothing
    when '..'
      segments.pop # no-op when there is nothing left to go up from
    else
      segments << segment
    end
  end

  path = segments.join('/')
  # an empty path already stands for the root; adding '/' would double it
  # once the caller joins host and path with a slash
  path += '/' if preserve_trailing_slash && !path.empty?
  path += params if params

  path
end

.generate_path_strings(raw_path) ⇒ Array

Generates the path permutations from the raw path string

Parameters:

  • raw_path (String)

    path split from the full url string

Returns:

  • (Array)

    array of path permutation strings



85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# File 'lib/google_safe_browsing/canonicalize.rb', line 85

# Generates the path permutations used for lookups from the raw path string.
#
# The result always starts with '/', followed by the path built from at most
# the first three path components and each of its shorter prefixes. Entries
# without a '.' get a trailing slash. When a query string is present, every
# entry that does not end in '/' is additionally emitted with "?query"
# appended.
#
# @param raw_path [String] path split from the full url string
# @return [Array<String>] array of path permutation strings
def self.generate_path_strings(raw_path)
  return ['/', ''] if raw_path == ''

  # limit of 2 keeps any further '?' inside the query instead of dropping
  # everything after the second one
  path, params = raw_path.split('?', 2)
  path ||= ''
  params ||= ''

  components = path.split('/').first(3)
  path_strings = ['/']
  components.length.downto(1) do |count|
    path_strings << "/#{components.first(count).join('/')}"
  end

  path_strings.map! do |candidate|
    candidate += '/' unless candidate.include?('.')
    # non-bang gsub: never returns nil, so no compact! is needed afterwards
    candidate.gsub(%r{/+}, '/')
  end
  path_strings.uniq!

  return path_strings if params.blank?

  with_params = path_strings.map do |candidate|
    candidate.end_with?('/') ? candidate : "#{candidate}?#{params}"
  end
  path_strings | with_params
end

.recursively_unescape(url) ⇒ String

Continues to unescape the url until unescaping has no effect

Parameters:

  • url (String)

    url string

Returns:

  • (String)

    fully unescaped url string



161
162
163
164
165
166
167
168
169
# File 'lib/google_safe_browsing/canonicalize.rb', line 161

# Unescapes the url repeatedly until another pass no longer changes it.
#
# Uses URI::DEFAULT_PARSER.unescape because the module-level URI.unescape
# was removed in Ruby 3.0.
#
# @param url [String] url string, possibly percent-encoded several times over
# @return [String] fully unescaped url string
def self.recursively_unescape(url)
  loop do
    unescaped = URI::DEFAULT_PARSER.unescape(url)
    # a pass that changes nothing means there is nothing left to decode
    return unescaped if unescaped == url

    url = unescaped
  end
end

.remove_fragment(string) ⇒ String

Strips the fragment portion of the url string (the first ‘#’ and everything after)

Parameters:

  • string (String)

    url

Returns:

  • (String)

    parameter with the fragment removed



152
153
154
155
# File 'lib/google_safe_browsing/canonicalize.rb', line 152

# Strips the fragment portion of the url string: the first '#' and
# everything after it.
#
# @param string [String] url
# @return [String] the url without its fragment; the argument itself when it
#   contains no '#'
def self.remove_fragment(string)
  fragment_start = string.index('#')
  return string unless fragment_start

  # exclusive range: with '#' at index 0 an inclusive "0..-1" range would
  # return the whole string instead of an empty one
  string[0...fragment_start]
end

.remove_port(host_string) ⇒ String

Strip port number from host string

Parameters:

  • host_string (String)

    host portion of the url

Returns:

  • (String)

    host part without the port number



271
272
273
# File 'lib/google_safe_browsing/canonicalize.rb', line 271

# Strip the port number from a host string.
#
# @param host_string [String] host portion of the url
# @return [String] the :host entry produced by split_port
def self.remove_port(host_string)
  host_and_port = split_port(host_string)
  host_and_port[:host]
end

.remove_protocol(cann) ⇒ String

Strip the leading protocol from the url string

Parameters:

  • cann (String)

    url string

Returns:

  • (String)

    url string without the protocol



248
249
250
251
252
253
254
255
256
# File 'lib/google_safe_browsing/canonicalize.rb', line 248

# Strip the leading protocol from the url string.
#
# Side effect: when a protocol delimiter is found, the text in front of it is
# stored in @protocol; when none is found, @protocol is left as it was.
#
# @param cann [String] url string
# @return [String] url string without the protocol and its delimiter
def self.remove_protocol(cann)
  delimiting_index = cann.index(PROTOCOL_DELIMITER)
  return cann unless delimiting_index

  # exclusive range: with the delimiter at index 0 an inclusive "0..-1"
  # range would record the entire url as the protocol
  @protocol = cann[0...delimiting_index]
  cann[(delimiting_index + PROTOCOL_DELIMITER.length)..-1]
end

.remove_username_and_password(host_string) ⇒ String

Strip user name and password from host part of url

Parameters:

  • host_string (String)

    host portion of the url

Returns:

  • (String)

    host part of url without user name or password



279
280
281
# File 'lib/google_safe_browsing/canonicalize.rb', line 279

# Strip user name and password from the host part of a url.
#
# @param host_string [String] host portion of the url
# @return [String] the :host entry produced by split_username_and_password
def self.remove_username_and_password(host_string)
  host_and_creds = split_username_and_password(host_string)
  host_and_creds[:host]
end

.split_host_path(cann) ⇒ Hash

Takes the canonicalized url and splits the host and the path apart

Parameters:

  • cann (String)

    canonicalized url string

Returns:

  • (Hash)

    !{ host: host_part, path: path_part }



136
137
138
139
140
141
142
143
144
145
# File 'lib/google_safe_browsing/canonicalize.rb', line 136

# Splits a protocol-less url into host and path at the first '/'.
#
# The separating slash belongs to neither part. Without any slash the whole
# string is the host and the path is empty.
#
# @param cann [String] canonicalized url string without protocol
# @return [Hash] { host: host_part, path: path_part }
def self.split_host_path(cann)
  # partition also handles a '/' at index 0 (empty host); the previous
  # inclusive "0..split_point - 1" range returned the whole string there
  host, _slash, path = cann.partition('/')
  { host: host, path: path }
end

.split_port(host_string) ⇒ Hash

Split port number and host string into a hash

Parameters:

Returns:

  • (Hash)

    :host has the host string, :port holds the port number



305
306
307
308
309
310
311
312
313
314
315
316
317
# File 'lib/google_safe_browsing/canonicalize.rb', line 305

# Split the port number off a host string.
#
# When the string contains a ':', it is split on ':' and the first two pieces
# become :host and :port; otherwise :port is nil.
#
# @param host_string [String] host portion of the url
# @return [Hash] :host has the host string, :port holds the port number
def self.split_port(host_string)
  return { host: host_string, port: nil } unless host_string.rindex(':')

  host, port = host_string.split(':')
  { host: host, port: port }
end

.split_username_and_password(host_string) ⇒ Hash

Split the user name and password from the host

Parameters:

Returns:

  • (Hash)

    :host has the host string, :creds holds the username and password string



287
288
289
290
291
292
293
294
295
296
297
298
299
# File 'lib/google_safe_browsing/canonicalize.rb', line 287

# Split the user name and password (credentials) from the host.
#
# The split happens at the LAST '@', so credentials that themselves contain
# an '@' (for example after percent-decoding) stay intact and the host is
# never taken from the middle of the credentials.
#
# @param host_string [String] host portion of the url
# @return [Hash] :host has the host string, :creds holds the username and
#   password string (nil when there is no '@')
def self.split_username_and_password(host_string)
  creds, separator, host = host_string.rpartition('@')
  # rpartition yields an empty separator when no '@' is present
  return { host: host_string, creds: nil } if separator.empty?

  { host: host, creds: creds }
end

.split_username_password_and_port(host_string) ⇒ Hash

Split the user name, password and port from the host string

Parameters:

  • host_string (String)

    host portion of the url

Returns:

  • (Hash)

    :host as the host string; :creds has the username and password; :port holds the port number



323
324
325
326
# File 'lib/google_safe_browsing/canonicalize.rb', line 323

# Split the user name, password and port from the host string.
#
# Credentials are separated first; the port is then split off the remaining
# host, and both results are merged into one hash.
#
# @param host_string [String] host portion of the url
# @return [Hash] :host as the host string; :creds has the username and
#   password; :port holds the port number
def self.split_username_password_and_port(host_string)
  creds_split = split_username_and_password(host_string)
  port_split = split_port(creds_split[:host])
  creds_split.merge(port_split)
end

.strict_escape(url) ⇒ String

Escape the url, but do not escape certain characters, such as the caret

Parameters:

  • url (String)

    url string

Returns:

  • (String)

    escaped url string



235
236
237
238
239
240
241
242
# File 'lib/google_safe_browsing/canonicalize.rb', line 235

# Escape the url, but leave certain characters (currently the caret)
# unescaped.
#
# Uses URI::DEFAULT_PARSER.escape because the module-level URI.escape was
# removed in Ruby 3.0.
#
# @param url [String] url string
# @return [String] escaped url string
def self.strict_escape(url)
  escaped = URI::DEFAULT_PARSER.escape(url)

  # unescape caret, may need other optionally escapable chars
  escaped.gsub('%5E', '^')
end

.strip_username_password_and_port_from_host(host_string) ⇒ String

Strip the user name, password and port number from the url

Parameters:

  • host_string (String)

    host portion of the url

Returns:

  • (String)

    host portion of the url without the username, password and port



262
263
264
265
# File 'lib/google_safe_browsing/canonicalize.rb', line 262

# Strip the user name, password and port number from the host part of a url.
#
# Credentials are removed before the port: the "user:password" separator is
# also a ':', so splitting off the port first would cut the string inside the
# credentials and return the user name instead of the host.
#
# @param host_string [String] host portion of the url
# @return [String] host portion of the url without the username, password
#   and port
def self.strip_username_password_and_port_from_host(host_string)
  remove_port(remove_username_and_password(host_string))
end

.url(raw_url) ⇒ String

Base Canonicalizer method

Parameters:

  • uncanonicalized (String)

    url string

Returns:

  • (String)

    canonicalized url string



15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# File 'lib/google_safe_browsing/canonicalize.rb', line 15

# Base canonicalizer method.
#
# Works on a private binary copy of the input: removes tabs, carriage returns
# and line feeds, trims surrounding whitespace, drops the fragment,
# repeatedly unescapes, normalizes host and path, re-attaches the protocol
# (DEFAULT_PROTOCOL when the input had none) and escapes the result.
#
# @param raw_url [String, #to_s] uncanonicalized url
# @return [String] canonicalized url string
def self.url(raw_url)
  # Duplicate before changing the encoding so the caller's object is neither
  # re-tagged nor edited, and frozen input (string literals, nil.to_s on
  # recent Rubies) does not raise.
  # ASCII-8BIT avoids InvalidByteSequenceError on non UTF-8 input.
  cann = raw_url.to_s.dup.force_encoding('ASCII-8BIT')

  # remove tabs, carriage returns and line feeds
  cann = cann.delete("\t\r\n")
  cann = cann.gsub(/\A\s+|\s+\Z/, '')

  cann = remove_fragment(cann)

  # repeatedly unescape until no more escaping
  cann = recursively_unescape(cann)

  # Forget the protocol remembered from an earlier call; remove_protocol only
  # assigns @protocol when this url actually carries one, so without the
  # reset a protocol-less url would inherit the previous url's protocol.
  @protocol = nil

  # remove leading PROTOCOL
  cann = remove_protocol(cann)

  # split into host and path components
  splits = split_host_path(cann)

  cann = fix_host(splits[:host]) + '/' + fix_path(splits[:path])

  # add leading protocol
  @protocol ||= DEFAULT_PROTOCOL
  cann = @protocol + PROTOCOL_DELIMITER + cann

  strict_escape(cann)
end

.urls_for_lookup(lookup_url) ⇒ Array

Generate the url permutations for lookup

Parameters:

  • lookup_url (String)

    uncanonicalized url string

Returns:

  • (Array)

    array of canonicalized url permutation strings



54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# File 'lib/google_safe_browsing/canonicalize.rb', line 54

# Generate the url permutations for lookup.
#
# Canonicalizes the url, then combines the host candidates (the full host
# plus progressively shorter suffixes of at most its last five components)
# with every path permutation.
#
# @param lookup_url [String] uncanonicalized url string
# @return [Array<String>] canonicalized host+path permutation strings; empty
#   when the canonical url is blank or the host contains no period
def self.urls_for_lookup(lookup_url)
  canonical = url(lookup_url)
  # return empty array if url returns nil; for invalid url
  return [] if canonical.blank?

  splits = split_host_path(remove_protocol(canonical))
  host_string = strip_username_password_and_port_from_host(splits[:host])

  # return empty array unless host_string has at least one period
  return [] unless host_string.include?('.')

  components = TopLevelDomain.split_from_host(host_string).last(5)
  # every suffix except the final single component: drop 0, 1, ... leading parts
  suffixes = (0...(components.length - 1)).map do |skipped|
    components.drop(skipped).join('.')
  end
  host_strings = ([host_string] + suffixes).uniq

  cart_prod(host_strings, generate_path_strings(splits[:path]))
end