Class: GoogleSafeBrowsing::Canonicalize
- Inherits:
-
Object
- Object
- GoogleSafeBrowsing::Canonicalize
- Defined in:
- lib/google_safe_browsing/canonicalize.rb
Overview
Helpers to Canonicalize urls and generate url permutations for lookups
Constant Summary collapse
- PROTOCOL_DELIMITER =
'://'
- DEFAULT_PROTOCOL =
'http'
Class Method Summary collapse
-
.cart_prod(a_one, a_two) ⇒ Array
Returns the Cartesian product of two arrays by concatenation of the string representation of the elements.
-
.fix_host(host) ⇒ String
Apply initial fixes to host string.
-
.fix_path(path) ⇒ String
Apply initial fixes to path string.
-
.generate_path_strings(raw_path) ⇒ Array
Generates the path permutations from the raw path string.
-
.recursively_unescape(url) ⇒ String
Continues to unescape the url until unescaping has no effect.
-
.remove_fragment(string) ⇒ String
Strips the fragment portion of the url string (the first ‘#’ and everything after).
-
.remove_port(host_string) ⇒ String
Strip port number from host string.
-
.remove_protocol(cann) ⇒ String
Strip the leading protocol from the url string.
-
.remove_username_and_password(host_string) ⇒ String
Strip user name and password from host part of url.
-
.split_host_path(cann) ⇒ Hash
Takes the canonicalized url and splits the host and the path apart.
-
.split_port(host_string) ⇒ Hash
Split port number and host string into a hash.
-
.split_username_and_password(host_string) ⇒ Hash
Split user name and password from the host.
-
.split_username_password_and_port(host_string) ⇒ Hash
Split the user name, password and port from the host string.
-
.strict_escape(url) ⇒ String
Escape the url, but do not escape certain characters, such as the caret.
-
.strip_username_password_and_port_from_host(host_string) ⇒ String
Strip the user name, password and port number from the url.
-
.url(raw_url) ⇒ String
Base Canonicalizer method.
-
.urls_for_lookup(lookup_url) ⇒ Array
Generate the url permutations for lookup.
Class Method Details
.cart_prod(a_one, a_two) ⇒ Array
Returns the Cartesian product of two arrays by concatenation of the string representation of the elements
131 132 133 134 135 136 137 138 139 |
# File 'lib/google_safe_browsing/canonicalize.rb', line 131
#
# Pairs every element of +a_one+ with every element of +a_two+ and joins each
# pair into one string by interpolation.
#
# @param a_one [Array] left-hand elements; they vary slowest in the result
# @param a_two [Array] right-hand elements; they vary fastest in the result
# @return [Array<String>] one concatenated string per (left, right) pair
def self.cart_prod(a_one, a_two)
  a_one.flat_map do |left|
    a_two.map { |right| "#{left}#{right}" }
  end
end
.fix_host(host) ⇒ String
Apply initial fixes to host string
183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 |
# File 'lib/google_safe_browsing/canonicalize.rb', line 183
#
# Apply initial fixes to the host string: trim leading/trailing dots, collapse
# runs of dots, downcase, and normalise numeric hosts through the IP classes.
#
# The IP conversions operate on the bare host only. Feeding them the full
# string (credentials and port included) made an all-digit host behind
# "user@" evaluate to 0 and made dotted addresses with a port fail to parse.
# The argument itself is no longer modified.
#
# @param host [String] host part of a url; may carry "creds@" and ":port"
# @return [String] the fixed host with credentials and port re-attached
def self.fix_host(host)
  # Non-bang calls build new strings, leaving the caller's object alone.
  host = host.gsub(/\A\.+|\.+\Z/, '')
  host = host.gsub(/\.+/, '.').downcase

  host_splits = split_username_password_and_port(host)
  bare_host = host_splits[:host]

  if bare_host =~ /^\d+$/
    # All-digit host: treated as an integer IPv4 address.
    # NOTE(review): IP::V4 is defined outside this file (presumably the
    # ruby-ip gem) -- confirm #to_addr yields dotted-quad text.
    host_splits[:host] = IP::V4.new(bare_host.to_i).to_addr
  elsif bare_host =~ /\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}/
    begin
      host_splits[:host] = IP.new(bare_host).to_addr
    rescue ArgumentError
      # Not parsable as an address after all; keep the host text unchanged.
    end
  end

  result = host_splits[:host]
  result = "#{host_splits[:creds]}@#{result}" unless host_splits[:creds].blank?
  result = "#{result}:#{host_splits[:port]}" unless host_splits[:port].blank?
  result
end
.fix_path(path) ⇒ String
Apply initial fixes to path string
212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 |
# File 'lib/google_safe_browsing/canonicalize.rb', line 212
#
# Apply initial fixes to the path string: drop one leading slash, collapse
# repeated slashes, and resolve "." and ".." segments. Everything from the
# first "?" onward is carried through untouched.
#
# Fixes over the previous version:
# * a path starting with "?" no longer has its query appended twice;
# * the trailing slash is detected on the path, not on the query string;
# * empty segments are dropped, so a leftover leading "/" cannot survive;
# * a path that resolves to nothing yields "" rather than "/";
# * the argument is never mutated, so frozen strings are accepted.
#
# @param path [String] path (and optional query) without the host
# @return [String] the cleaned path with the original query re-attached
def self.fix_path(path)
  path = path[1..-1] if path[0..0] == '/'

  query_start = path.index('?')
  if query_start
    params = path[query_start..-1]
    # Length-based slice: correct even when the '?' sits at index 0.
    path = path[0, query_start]
  end

  preserve_trailing_slash = path.end_with?('/')

  segments = []
  path.split('/').each do |segment|
    next if segment.empty? || segment == '.'

    if segment == '..'
      segments.pop
    else
      segments << segment
    end
  end

  path = segments.join('/')
  path += '/' if preserve_trailing_slash && !path.empty?
  path += params if params
  path
end
.generate_path_strings(raw_path) ⇒ Array
Generates the path permutations from the raw path string
87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 |
# File 'lib/google_safe_browsing/canonicalize.rb', line 87
#
# Generates the path permutations from the raw path string.
#
# At most the first three path components are used. Candidates are "/" plus
# each progressively shorter prefix; a candidate with no "." gets a trailing
# slash. When a query is present, every candidate that does not end in "/"
# is additionally emitted with "?query" appended.
#
# Fixes over the previous version:
# * the query is everything after the FIRST "?", so a query that itself
#   contains "?" is no longer truncated;
# * slash collapsing uses gsub, not the nil-able return value of gsub!;
# * emptiness of the query is checked with plain Ruby (strip.empty?).
#
# @param raw_path [String] path and optional query, without the host
# @return [Array<String>] unique path strings, each starting with "/"
#   (for an empty +raw_path+ the result is ["/", ""])
def self.generate_path_strings(raw_path)
  return ['/', ''] if raw_path == ''

  path, params = raw_path.split('?', 2)
  path ||= ''
  params ||= ''

  components = path.split('/').first(3)
  path_strings = ['/']
  components.length.downto(1) do |depth|
    path_strings << "/#{components.first(depth).join('/')}"
  end

  path_strings.map! do |candidate|
    candidate += '/' unless candidate.include?('.')
    candidate.gsub(%r{/+}, '/')
  end
  path_strings.uniq!

  return path_strings if params.strip.empty?

  with_query = path_strings.map do |candidate|
    candidate.end_with?('/') ? candidate : "#{candidate}?#{params}"
  end
  path_strings | with_query
end
.recursively_unescape(url) ⇒ String
Continues to unescape the url until unescaping has no effect
169 170 171 172 173 174 175 176 177 |
# File 'lib/google_safe_browsing/canonicalize.rb', line 169
#
# Continues to unescape the url until unescaping has no effect.
#
# URI.unescape was removed in Ruby 3.0. URI::RFC2396_Parser#unescape is the
# implementation it delegated to, so decoding behaviour is unchanged.
#
# @param url [String] possibly (multiply) percent-encoded url
# @return [String] the url with every layer of percent-encoding removed
def self.recursively_unescape(url)
  parser = URI::RFC2396_Parser.new
  loop do
    decoded = parser.unescape(url)
    # A pass that changes nothing means the string is fully decoded.
    return decoded if decoded == url

    url = decoded
  end
end
.remove_fragment(string) ⇒ String
Strips the fragment portion of the url string (the first ‘#’ and everything after)
160 161 162 163 |
# File 'lib/google_safe_browsing/canonicalize.rb', line 160
#
# Strips the fragment portion of the url string: the first '#' and
# everything after it.
#
# The cut uses a (start, length) slice. The previous range slice
# string[0..idx-1] became string[0..-1] when the '#' was the very first
# character, returning the input unchanged instead of "".
#
# @param string [String] url text
# @return [String] text before the first '#'; the input itself if none
def self.remove_fragment(string)
  hash_at = string.index('#')
  hash_at ? string[0, hash_at] : string
end
.remove_port(host_string) ⇒ String
Strip port number from host string
282 283 284 |
# File 'lib/google_safe_browsing/canonicalize.rb', line 282
#
# Strip the port number from a host string.
#
# @param host_string [String] host, optionally followed by ":port"
# @return [String] the :host entry produced by split_port
def self.remove_port(host_string)
  parts = split_port(host_string)
  parts[:host]
end
.remove_protocol(cann) ⇒ String
Strip the leading protocol from the url string
259 260 261 262 263 264 265 266 267 |
# File 'lib/google_safe_browsing/canonicalize.rb', line 259
#
# Strip the leading protocol from the url string.
#
# Side effect: a non-empty scheme found before PROTOCOL_DELIMITER is stored
# in @protocol. When there is no delimiter, or the scheme is empty (the text
# starts with the delimiter), @protocol is left untouched. Previously the
# empty-scheme case stored the whole input, because the range slice
# cann[0..-1] covers the entire string.
#
# @param cann [String] url text, with or without a "scheme://" prefix
# @return [String] the text after the first PROTOCOL_DELIMITER, or +cann+
#   itself when no delimiter is present
def self.remove_protocol(cann)
  delimiter_at = cann.index(PROTOCOL_DELIMITER)
  return cann unless delimiter_at

  scheme = cann[0, delimiter_at]
  @protocol = scheme unless scheme.empty?
  cann[(delimiter_at + PROTOCOL_DELIMITER.length)..-1]
end
.remove_username_and_password(host_string) ⇒ String
Strip user name and password from host part of url
290 291 292 |
# File 'lib/google_safe_browsing/canonicalize.rb', line 290
#
# Strip user name and password from the host part of a url.
#
# @param host_string [String] host, optionally prefixed with "creds@"
# @return [String] the :host entry produced by split_username_and_password
def self.remove_username_and_password(host_string)
  parts = split_username_and_password(host_string)
  parts[:host]
end
.split_host_path(cann) ⇒ Hash
Takes the canonicalized url and splits the host and the path apart
145 146 147 148 149 150 151 152 153 154 |
# File 'lib/google_safe_browsing/canonicalize.rb', line 145
#
# Takes the canonicalized url (protocol already removed) and splits the host
# and the path apart at the first '/'. The separating slash is part of
# neither piece.
#
# The host is cut with a (start, length) slice. The previous range slice
# cann[0..split_point-1] returned the whole input as the host when the text
# started with '/'.
#
# @param cann [String] url text without the protocol
# @return [Hash] { :host => String, :path => String }; :path is '' when
#   there is no '/'
def self.split_host_path(cann)
  split_point = cann.index('/')
  return { host: cann, path: '' } unless split_point

  { host: cann[0, split_point], path: cann[(split_point + 1)..-1] }
end
.split_port(host_string) ⇒ Hash
Split port number and host string into a hash
316 317 318 319 320 321 322 323 324 325 326 327 328 |
# File 'lib/google_safe_browsing/canonicalize.rb', line 316
#
# Split port number and host string into a hash.
#
# The string is cut at the LAST ':' -- the same position that decides
# whether a port exists. Previously the pieces came from split(':'), so any
# input with more than one colon (e.g. "user:pw@host:80") returned the text
# before the first colon as the host and the middle piece as the port.
#
# @param host_string [String] host, optionally followed by ":port"
# @return [Hash] { :host => String, :port => String or nil }; :port is nil
#   when there is no ':' or nothing follows it
def self.split_port(host_string)
  port_sep = host_string.rindex(':')
  return { host: host_string, port: nil } unless port_sep

  port = host_string[(port_sep + 1)..-1]
  # A dangling "host:" has always been reported as having no port.
  { host: host_string[0, port_sep], port: (port.empty? ? nil : port) }
end
.split_username_and_password(host_string) ⇒ Hash
Split user name and password from the host
298 299 300 301 302 303 304 305 306 307 308 309 310 |
# File 'lib/google_safe_browsing/canonicalize.rb', line 298
#
# Split user name and password from the host.
#
# The string is cut at the LAST '@', so credentials that themselves contain
# '@' stay intact. Previously the pieces came from split('@'), which
# returned a nil host for "user@" and the wrong host for "u:p@ss@host".
#
# @param host_string [String] host, optionally prefixed with "creds@"
# @return [Hash] { :host => String, :creds => String or nil }; :creds is
#   nil when there is no '@'
def self.split_username_and_password(host_string)
  un_sep = host_string.rindex('@')
  return { host: host_string, creds: nil } unless un_sep

  { host: host_string[(un_sep + 1)..-1], creds: host_string[0, un_sep] }
end
.split_username_password_and_port(host_string) ⇒ Hash
Split the user name, password and port from the host string
334 335 336 337 |
# File 'lib/google_safe_browsing/canonicalize.rb', line 334
#
# Split the user name, password and port from the host string.
#
# Credentials are separated first; the remaining host is then split for a
# port, and that second result overrides the :host entry of the first.
#
# @param host_string [String] host with optional "creds@" and ":port"
# @return [Hash] the merged hashes of split_username_and_password and
#   split_port (keys :host, :creds, :port)
def self.split_username_password_and_port(host_string)
  creds_split = split_username_and_password(host_string)
  port_split = split_port(creds_split[:host])
  creds_split.merge(port_split)
end
.strict_escape(url) ⇒ String
Escape the url, but do not escape certain characters, such as the caret
246 247 248 249 250 251 252 253 |
# File 'lib/google_safe_browsing/canonicalize.rb', line 246
#
# Escape the url, but leave the caret ('^') unescaped.
#
# URI.escape was removed in Ruby 3.0. URI::RFC2396_Parser#escape is the
# implementation it delegated to, so the set of escaped characters is
# unchanged.
#
# @param url [String] url text to percent-encode
# @return [String] the escaped url with every "%5E" turned back into "^"
def self.strict_escape(url)
  escaped = URI::RFC2396_Parser.new.escape(url)
  # Unescape caret; other optionally-escapable characters may need the same.
  escaped.gsub('%5E', '^')
end
.strip_username_password_and_port_from_host(host_string) ⇒ String
Strip the user name, password and port number from the url
273 274 275 276 |
# File 'lib/google_safe_browsing/canonicalize.rb', line 273
#
# Strip the user name, password and port number from the host string.
#
# Credentials are removed BEFORE the port. In the previous order the ':'
# inside "user:pass@" was treated as the port separator, so
# "user:pass@host.com" was reduced to "user" instead of "host.com".
#
# @param host_string [String] host with optional "creds@" and ":port"
# @return [String] the bare host
def self.strip_username_password_and_port_from_host(host_string)
  remove_port(remove_username_and_password(host_string))
end
.url(raw_url) ⇒ String
Base Canonicalizer method
16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
# File 'lib/google_safe_browsing/canonicalize.rb', line 16
#
# Base canonicalizer method.
#
# Steps, in order: remove tabs/CR/LF, trim surrounding whitespace, drop the
# fragment, unescape repeatedly, remove the protocol, fix host and path
# separately, re-attach the protocol, and escape the result.
#
# Fixes over the previous version:
# * the work happens on a private copy; previously a String argument had its
#   encoding forced to ASCII-8BIT and its tabs/newlines removed in place,
#   and a frozen String raised;
# * @protocol is cleared before parsing, so a url without a scheme gets
#   DEFAULT_PROTOCOL rather than the scheme left over from an earlier call.
#
# @param raw_url [#to_s] the url to canonicalize
# @return [String] the canonicalized, escaped url
def self.url(raw_url)
  # Treat the bytes as ASCII-8BIT to avoid InvalidByteSequenceError.
  cann = raw_url.to_s.dup.force_encoding('ASCII-8BIT')

  # Remove tabs, carriage returns and line feeds anywhere in the string.
  cann.delete!("\t\r\n")
  cann.gsub!(/\A\s+|\s+\Z/, '')

  cann = remove_fragment(cann)

  # Repeatedly unescape until nothing changes.
  cann = recursively_unescape(cann)

  # remove_protocol records the scheme it finds in @protocol; start clean.
  @protocol = nil
  cann = remove_protocol(cann)

  # Split into host and path components and fix each on its own.
  splits = split_host_path(cann)
  cann = fix_host(splits[:host]) + '/' + fix_path(splits[:path])

  # Add the leading protocol back.
  @protocol ||= DEFAULT_PROTOCOL
  cann = @protocol + PROTOCOL_DELIMITER + cann

  strict_escape(cann)
end
.urls_for_lookup(lookup_url) ⇒ Array
Generate the url permutations for lookup
54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 |
# File 'lib/google_safe_browsing/canonicalize.rb', line 54
#
# Generate the url permutations for lookup: every host variant combined with
# every path variant.
#
# @param lookup_url [#to_s] the url to look up
# @return [Array<String>] host+path strings; [] when the canonical url is
#   blank or the bare host contains no '.'
def self.urls_for_lookup(lookup_url)
  canonical = url(lookup_url)
  return [] if canonical.blank?

  parts = split_host_path(remove_protocol(canonical))
  host_string = strip_username_password_and_port_from_host(parts[:host])

  # A host without at least one period is not looked up.
  return [] unless host_string.include?('.')

  # NOTE(review): TopLevelDomain is defined elsewhere; this code relies on
  # split_from_host returning an Array of host pieces -- confirm.
  labels = TopLevelDomain.split_from_host(host_string).last(5)

  # Every suffix of the pieces except the final single piece.
  suffixes = (0...(labels.length - 1)).map { |start| labels[start..-1].join('.') }
  host_strings = ([host_string] + suffixes).uniq

  cart_prod(host_strings, generate_path_strings(parts[:path]))
end