Class: FeedTools::URI

Inherits:
Object
  • Object
show all
Defined in:
lib/feed_tools/vendor/uri.rb

Overview

This is an implementation of a URI parser based on RFC 3986.

Defined Under Namespace

Modules: IDNA

Classes: InvalidOptionError, InvalidURIError

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(scheme, userinfo, host, port, path, query, fragment) ⇒ URI

Creates a new uri object from component parts. Passing nil for any of these parameters is acceptable.



166
167
168
# File 'lib/feed_tools/vendor/uri.rb', line 166

# Builds a URI from its already-separated components by passing all
# seven values, unchanged and in order, to assign_components.
# Per the surrounding documentation, nil is acceptable for any component.
def initialize(scheme, userinfo, host, port, path, query, fragment)
  assign_components(scheme, userinfo, host, port, path, query, fragment)
end

Class Method Details

.convert_path(path) ⇒ Object

Converts a path to a file protocol URI. If the path supplied is relative, it will be returned as a relative URI. If the path supplied is actually a URI, it will return the parsed URI.



61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# File 'lib/feed_tools/vendor/uri.rb', line 61

# Converts a filesystem path into a "file:" URI.
#
# * nil yields nil.
# * A path starting with "/" or with a drive letter ("C:\" or "C:/") is
#   prefixed so that it becomes a "file:///" URI.
# * Anything that ends up as a "file:" URI has "c|" drive notation and
#   backslashes rewritten, then is parsed and normalized.
# * Everything else (relative paths, other URIs) is simply parsed.
def self.convert_path(path)
  return nil if path.nil?

  candidate = path.strip
  unless candidate.empty?
    if candidate[0..0] == "/"
      candidate = "file://" + candidate
    end
    # Checked after the first prefixing step, exactly like the original.
    if candidate.scan(/^[a-zA-Z]:[\\\/]/).size > 0
      candidate = "file:///" + candidate
    end
  end

  # Collapse any number of slashes after "file:" into exactly three.
  candidate = candidate.gsub(/^file:\/*/i, "file:///")
  return self.parse(candidate) unless candidate =~ /^file:/i

  # Adjust windows-style uris
  candidate = candidate.gsub(/^file:\/\/\/([a-zA-Z])\|/i, 'file:///\1:')
  candidate = candidate.gsub(/\\/, '/')
  self.parse(candidate).normalize
end

.escape(uri) ⇒ Object

Correctly escapes a uri.



98
99
100
101
102
103
104
105
106
107
108
109
# File 'lib/feed_tools/vendor/uri.rb', line 98

# Returns the string form of +uri+ with its path, query and fragment
# passed through normalize_escaping.  Scheme, userinfo, host and the
# explicitly specified port are carried over as they are.
#
# +uri+ may be a URI object, or any object whose to_s can be given to
# parse.
def self.escape(uri)
  source = uri.kind_of?(self) ? uri : self.parse(uri.to_s)
  escaped = URI.new(
    source.scheme,
    source.userinfo,
    source.host,
    source.specified_port,
    self.normalize_escaping(source.path),
    self.normalize_escaping(source.query),
    self.normalize_escaping(source.fragment)
  )
  escaped.to_s
end

.extract(text, options = {}) ⇒ Object

Extracts uris from an arbitrary body of text.

Raises:

  • (InvalidOptionError) — if an option key other than :base or :parse is supplied



112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
# File 'lib/feed_tools/vendor/uri.rb', line 112

# Extracts URIs from an arbitrary body of text.
#
# Three scans are made over +text+ and their results are merged without
# duplicates: bare "scheme:..." tokens, the href="..." attribute of
# SGML/HTML tags, and Textile-style "label":target links.
#
# Options:
#   :base  - a URI object (or anything whose to_s can be parsed) that
#            every extracted URI is joined onto.  Defaults to nil.
#   :parse - when true, URI objects are returned instead of strings.
#            Defaults to false.
#
# Candidates that cannot be parsed or joined are skipped.  URIs whose
# scheme is on a small ignore list (xmlns, xml, thr, ...) or matches
# /T\d+/ are dropped from the result.
#
# Raises InvalidOptionError if any other option key is given.
def self.extract(text, options={})
  defaults = {:base => nil, :parse => false}
  options = defaults.merge(options)
  raise InvalidOptionError unless (options.keys - defaults.keys).empty?

  # This regular expression needs to be less forgiving or else it would
  # match virtually all text.  Which isn't exactly what we're going for.
  extract_regex = /((([a-z\+]+):)[^ \n\<\>\"\\]+[\w\/])/
  extracted_uris = text.scan(extract_regex).map { |match| match[0] }

  sgml_extract_regex = /<[^>]+href=\"([^\"]+?)\"[^>]*>/
  sgml_extracted_uris =
    text.scan(sgml_extract_regex).map { |match| match[0] }
  extracted_uris.concat(sgml_extracted_uris - extracted_uris)

  textile_extract_regex = /\".+?\":([^ ]+\/[^ ]+)[ \,\.\;\:\?\!\<\>\"]/i
  textile_extracted_uris =
    text.scan(textile_extract_regex).map { |match| match[0] }
  extracted_uris.concat(textile_extracted_uris - extracted_uris)

  base_uri = nil
  if options[:base] != nil
    base_uri = options[:base] if options[:base].kind_of?(self)
    base_uri = self.parse(options[:base].to_s) if base_uri == nil
  end

  parsed_uris = []
  extracted_uris.each do |uri_string|
    begin
      parsed = self.parse(uri_string)
      parsed = base_uri + parsed unless base_uri.nil?
      parsed_uris << parsed
    rescue StandardError
      # Best effort: a candidate that cannot be parsed or joined is
      # skipped.  Only StandardError is rescued so that Interrupt,
      # SystemExit and similar are not silently swallowed.
    end
  end

  ignored_schemes =
    ["xmlns", "xml", "thr", "this", "float", "user", "username", "out"]
  parsed_uris.reject! do |uri|
    uri.scheme =~ /T\d+/ || ignored_schemes.include?(uri.scheme)
  end

  return parsed_uris if options[:parse]
  parsed_uris.map { |uri| uri.to_s }
end

.ip_based_schemes ⇒ Object

Returns an array of known ip-based schemes. These schemes typically use a similar URI form: //<user>:<password>@<host>:<port>/<url-path>



225
226
227
# File 'lib/feed_tools/vendor/uri.rb', line 225

# Returns the names of the known IP-based schemes, i.e. the keys of the
# hash returned by scheme_mapping.
def self.ip_based_schemes
  self.scheme_mapping.keys
end

.join(*uris) ⇒ Object

Joins several uris together.



86
87
88
89
90
91
92
93
94
95
# File 'lib/feed_tools/vendor/uri.rb', line 86

# Joins several URIs together, left to right.
#
# Each argument that is not already a URI object is parsed from its
# to_s.  A duplicate of the first URI is made and every following URI is
# merged into it with merge!; the duplicate is returned, so the
# arguments themselves are not modified by this method.
def self.join(*uris)
  first, *rest = uris.map do |uri|
    uri.kind_of?(self) ? uri : self.parse(uri.to_s)
  end
  joined = first.dup
  rest.each { |uri| joined.merge!(uri) }
  joined
end

.parse(uri_string) ⇒ Object

Returns a URI object based on the parsed string.



12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# File 'lib/feed_tools/vendor/uri.rb', line 12

# Returns a URI object built from +uri_string+.
#
# nil yields nil, and an object that is already a URI is returned as-is.
# Otherwise the string is split into scheme, authority, path, query and
# fragment with a single regular expression, and the authority is split
# further into userinfo, host and port.  An empty port is treated as
# absent.
def self.parse(uri_string)
  return nil if uri_string.nil?

  # If a URI object is passed, just return itself.
  return uri_string if uri_string.kind_of?(self)

  components = uri_string.scan(
    /^(([^:\/?#]+):)?(\/\/([^\/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?/
  )[0]
  return nil if components.nil?

  scheme, authority, path, query, fragment =
    components.values_at(1, 3, 4, 6, 8)

  userinfo = host = port = nil
  unless authority.nil?
    userinfo = authority.scan(/^([^\[\]]*)@/).flatten[0]
    host = authority.gsub(/^([^\[\]]*)@/, "").gsub(/:([^:@\[\]]*?)$/, "")
    port = authority.scan(/:([^:@\[\]]*?)$/).flatten[0]
  end
  port = nil if port == ""

  # WARNING: Not standards-compliant, but follows the theme
  # of Postel's law:
  #
  # Special exception for the non-standard "feed:" pseudo-protocol
  # (e.g. "feed://http://example.com/").  Without this exception, the
  # parser would read the URI as having host "http" and a blank port
  # number, instead of as having a second URI embedded within.  The
  # inner URI is treated as an opaque path instead.
  if scheme == "feed" && host == "http"
    userinfo = host = port = nil
    path = authority + path
  end

  URI.new(scheme, userinfo, host, port, path, query, fragment)
end

.scheme_mapping ⇒ Object

Returns a hash of common IP-based schemes and their default port numbers. Adding new schemes to this hash, as necessary, will allow for better URI normalization.



232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
# File 'lib/feed_tools/vendor/uri.rb', line 232

# Returns the hash of common IP-based schemes and their default port
# numbers.  The hash is built on first use and the same object is
# returned afterwards, so entries added by callers persist.
def self.scheme_mapping
  @protocol_mapping ||= {
    "http" => 80,
    "https" => 443,
    "ftp" => 21,
    "tftp" => 69,
    "ssh" => 22,
    "svn+ssh" => 22,
    "telnet" => 23,
    "nntp" => 119,
    "gopher" => 70,
    "wais" => 210,
    "prospero" => 1525
  }
end

Instance Method Details

#+(uri) ⇒ Object

Joins two URIs together.



312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
# File 'lib/feed_tools/vendor/uri.rb', line 312

# Joins two URIs together: resolves +uri+ (the reference) against the
# receiver (the base) and returns the result as a new URI.  Neither
# operand is modified.
#
# +uri+ may be a URI object or anything whose to_s can be parsed.  An
# empty reference returns a duplicate of the receiver.
#
# The component selection follows section 5.2.2 of RFC 3986, and the
# path merge for relative references follows section 5.2.3:
# * if the base has an authority and an empty path, the reference path
#   is appended to "/";
# * otherwise everything after the last "/" of the base path is dropped
#   (the whole base path when it contains no "/") before appending.
def +(uri)
  if !uri.kind_of?(self.class)
    uri = URI.parse(uri.to_s)
  end
  if uri.to_s == ""
    return self.dup
  end

  joined_scheme = nil
  joined_userinfo = nil
  joined_host = nil
  joined_port = nil
  joined_path = nil
  joined_query = nil
  joined_fragment = nil

  # Section 5.2.2 of RFC 3986
  if uri.scheme != nil
    # The reference is absolute: it wins entirely.
    joined_scheme = uri.scheme
    joined_userinfo = uri.userinfo
    joined_host = uri.host
    joined_port = uri.specified_port
    joined_path = self.class.normalize_path(uri.path)
    joined_query = uri.query
  else
    if uri.authority != nil
      # Network-path reference: only the scheme comes from the base.
      joined_userinfo = uri.userinfo
      joined_host = uri.host
      joined_port = uri.specified_port
      joined_path = self.class.normalize_path(uri.path)
      joined_query = uri.query
    else
      if uri.path == nil || uri.path == ""
        # Same-document style reference: keep the base path, and the
        # base query unless the reference supplies its own.
        joined_path = self.path
        if uri.query != nil
          joined_query = uri.query
        else
          joined_query = self.query
        end
      else
        if uri.path[0..0] == "/"
          joined_path = self.class.normalize_path(uri.path)
        else
          # Section 5.2.3 of RFC 3986 (merging paths)
          base_path = self.path.nil? ? "" : self.path.dup
          base_path = self.class.normalize_path(base_path)
          if base_path == "" && self.authority != nil
            base_path = "/"
          else
            # Drop the last segment; a path with no "/" is dropped whole.
            base_path = base_path.sub(/[^\/]*\z/, "")
          end
          joined_path = self.class.normalize_path(base_path + uri.path)
        end
        joined_query = uri.query
      end
      joined_userinfo = self.userinfo
      joined_host = self.host
      joined_port = self.specified_port
    end
    joined_scheme = self.scheme
  end
  joined_fragment = uri.fragment

  return URI.new(
    joined_scheme,
    joined_userinfo,
    joined_host,
    joined_port,
    joined_path,
    joined_query,
    joined_fragment
  )
end

#==(uri) ⇒ Object

Returns true if the URI objects are equal. This method normalizes both URIs before doing the comparison.



549
550
551
552
# File 'lib/feed_tools/vendor/uri.rb', line 549

# Returns true if +uri+ is a URI object whose normalized string form is
# the same as the receiver's normalized string form.  Objects of any
# other kind are never equal.
def ==(uri)
  uri.kind_of?(self.class) && self.normalize.to_s == uri.normalize.to_s
end

#===(uri) ⇒ Object

Returns true if the URI objects are equal. This method normalizes both URIs before doing the comparison, and allows comparison against strings.



533
534
535
536
537
538
539
540
541
542
543
544
545
# File 'lib/feed_tools/vendor/uri.rb', line 533

# Returns true if the receiver and +uri+ are equal after normalization.
#
# Unlike ==, +uri+ does not have to be a URI object: anything that
# responds to normalize is normalized directly, and anything else is
# parsed from its to_s first.  If that parsing or normalizing raises a
# StandardError the comparison result is false.  More serious
# exceptions (Interrupt, SystemExit, NoMemoryError, ...) are no longer
# swallowed and propagate to the caller.
def ===(uri)
  if uri.respond_to?(:normalize)
    uri_string = uri.normalize.to_s
  else
    begin
      uri_string = URI.parse(uri.to_s).normalize.to_s
    rescue StandardError
      return false
    end
  end
  self.normalize.to_s == uri_string
end

#absolute? ⇒ Boolean

Returns true if this URI is known to be absolute.

Returns:

  • (Boolean)


307
308
309
# File 'lib/feed_tools/vendor/uri.rb', line 307

# Returns true if this URI is known to be absolute, which is defined
# here as simply the opposite of relative?.
def absolute?
  !relative?
end

#authority ⇒ Object

Returns the authority segment of this URI.



187
188
189
190
191
192
193
194
195
196
197
198
199
200
# File 'lib/feed_tools/vendor/uri.rb', line 187

# Returns the authority segment of this URI, "userinfo@host:port", where
# the userinfo and port parts appear only when present.  Only the
# explicitly specified port is used.  Returns nil when there is no host.
# A non-nil result is cached in @authority.
def authority
  return @authority if defined?(@authority) && !@authority.nil?
  return nil if self.host.nil?

  @authority = ""
  @authority << "#{self.userinfo}@" unless self.userinfo.nil?
  @authority << self.host
  @authority << ":#{self.specified_port}" unless self.specified_port.nil?
  @authority
end

#display_uri ⇒ Object

Creates a URI suitable for display to users. If semantic attacks are likely, the application should try to detect these and warn the user. See RFC 3986 section 7.6 for more information.



520
521
522
523
524
525
526
527
528
# File 'lib/feed_tools/vendor/uri.rb', line 520

# Creates a URI suitable for display to users: a normalized copy of the
# receiver whose host has been converted with URI::IDNA.to_unicode.
#
# The conversion is best effort.  If it raises a StandardError the
# normalized host is left as it was.  More serious exceptions
# (Interrupt, SystemExit, ...) are no longer swallowed.
#
# If semantic attacks are likely, the application should try to detect
# these and warn the user.  See RFC 3986 section 7.6.
def display_uri
  displayed = self.normalize
  begin
    displayed.instance_variable_set("@host",
      URI::IDNA.to_unicode(displayed.host))
  rescue StandardError
    # Keep the normalized (ASCII) host when it cannot be converted.
  end
  displayed
end

#dup ⇒ Object

Clones the URI object.



562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
# File 'lib/feed_tools/vendor/uri.rb', line 562

# Clones the URI object.
#
# Every non-nil string component is duplicated so the copy shares no
# component objects with the receiver.  The copy is constructed with
# the value of port, and afterwards its @specified_port is set to the
# receiver's @specified_port (nil if the receiver never had one).
def dup
  copy_part = lambda { |part| part.nil? ? nil : part.dup }
  duplicate = URI.new(
    copy_part.call(self.scheme),
    copy_part.call(self.userinfo),
    copy_part.call(self.host),
    self.port,
    copy_part.call(self.path),
    copy_part.call(self.query),
    copy_part.call(self.fragment)
  )
  @specified_port = nil unless defined?(@specified_port)
  duplicate.instance_variable_set("@specified_port", @specified_port)
  duplicate
end

#eql?(uri) ⇒ Boolean

Returns true if the URI objects are equal. This method does NOT normalize either URI before doing the comparison.

Returns:

  • (Boolean)


556
557
558
559
# File 'lib/feed_tools/vendor/uri.rb', line 556

# Returns true if +uri+ is a URI object with exactly the same string
# form as the receiver.  Neither URI is normalized first.
def eql?(uri)
  uri.kind_of?(self.class) && self.to_s == uri.to_s
end

#fragment ⇒ Object

Returns the fragment for this URI.



291
292
293
# File 'lib/feed_tools/vendor/uri.rb', line 291

# Returns the fragment for this URI (the stored @fragment value).
def fragment
  @fragment
end

#host ⇒ Object

Returns the host for this URI.



182
183
184
# File 'lib/feed_tools/vendor/uri.rb', line 182

# Returns the host for this URI (the stored @host value).
def host
  @host
end

#inspect ⇒ Object

Returns a string representation of the URI object’s state.



612
613
614
# File 'lib/feed_tools/vendor/uri.rb', line 612

# Returns a string representation of the URI object's state: the class
# name, the object id in hexadecimal, and the URI's string form.
def inspect
  format("#<%s:%#0x URL:%s>", self.class.to_s, self.object_id, self.to_s)
end

#ip_based? ⇒ Boolean

Returns true if the URI uses an IP-based protocol.

Returns:

  • (Boolean)


296
297
298
299
# File 'lib/feed_tools/vendor/uri.rb', line 296

# Returns true if the URI's scheme, stripped and lower-cased, is one of
# the class's ip_based_schemes.  A URI without a scheme is never
# IP-based.
def ip_based?
  current = self.scheme
  return false if current.nil?
  self.class.ip_based_schemes.include?(current.strip.downcase)
end

#merge(uri) ⇒ Object

Merges two URIs together.



382
383
384
# File 'lib/feed_tools/vendor/uri.rb', line 382

# Merges two URIs together.  This is the named form of the + operator:
# it returns whatever self + uri returns.
def merge(uri)
  self + uri
end

#merge!(uri) ⇒ Object

Destructive form of merge.



387
388
389
# File 'lib/feed_tools/vendor/uri.rb', line 387

# Destructive form of merge: computes merge(uri) and hands the result to
# replace_self, whose return value is returned.
def merge!(uri)
  merged = self.merge(uri)
  replace_self(merged)
end

#normalize ⇒ Object

Returns a normalized URI object.

NOTE: This method does not attempt to conform to specifications. It exists largely to correct other people’s failures to read the specifications, and also to deal with caching issues since several different URIs may represent the same resource and should not be cached multiple times.



398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
# File 'lib/feed_tools/vendor/uri.rb', line 398

# Returns a new, normalized URI object.  The receiver's components are
# only read, never replaced.
#
# NOTE: This method does not attempt to conform to specifications.  It
# exists largely to correct common mistakes in real-world URIs, and to
# deal with caching issues, since several different URIs may represent
# the same resource and should not be cached multiple times.
#
# Steps, in order:
# * scheme: stripped, lower-cased, "ssh+svn" rewritten to "svn+ssh";
#   a "feed:http://..." URI is unwrapped and the inner URI normalized.
# * userinfo: stripped.
# * host: stripped, lower-cased, converted with URI::IDNA.to_ascii
#   (best effort), and decimal / octal / hexadecimal IPv4 notations
#   rewritten as dotted decimal.
# * port: dropped when it equals the scheme's default port.
# * path: stripped, passed through normalize_path and
#   normalize_escaping, defaulted to "/" where appropriate, with a few
#   reserved characters un-escaped.
# * query and fragment: stripped and passed through normalize_escaping.
def normalize
  normalized_scheme = nil
  normalized_scheme = self.scheme.strip.downcase if self.scheme != nil
  normalized_scheme = "svn+ssh" if normalized_scheme == "ssh+svn"
  if normalized_scheme == "feed"
    # "feed:http://..." wraps a real URI; normalize the wrapped one.
    if self.to_s =~ /^feed:\/*http:\/*/
      return self.class.parse(
        self.to_s.scan(/^feed:\/*(http:\/*.*)/).flatten[0]).normalize
    end
  end
  normalized_userinfo = nil
  normalized_userinfo = self.userinfo.strip if self.userinfo != nil
  normalized_host = nil
  normalized_host = self.host.strip.downcase if self.host != nil
  if normalized_host != nil
    begin
      normalized_host = URI::IDNA.to_ascii(normalized_host)
    rescue StandardError
      # Best effort: keep the host as it is when it cannot be converted.
      # Only StandardError is rescued so that Interrupt, SystemExit and
      # similar are not silently swallowed.
    end
  end

  # Normalize IPv4 addresses that were written on the assumption that
  # inet_addr() would be used to parse the IP address.
  if normalized_host != nil && normalized_host.strip =~ /^\d+$/
    # Decimal IPv4 address.
    decimal = normalized_host.to_i
    if decimal < (256 ** 4)
      octets = [0,0,0,0]
      octets[0] = decimal >> 24
      decimal -= (octets[0] * (256 ** 3))
      octets[1] = decimal >> 16
      decimal -= (octets[1] * (256 ** 2))
      octets[2] = decimal >> 8
      decimal -= (octets[2] * (256 ** 1))
      octets[3] = decimal
      normalized_host = octets.join(".")
    end
  elsif (normalized_host != nil && normalized_host.strip =~
      /^0+[0-7]{3}\.0+[0-7]{3}\.0+[0-7]{3}\.0+[0-7]{3}$/)
    # Octal IPv4 address.  The separators must be literal dots, because
    # the host is split on "." below.
    octet_strings = normalized_host.split('.')
    octets = []
    octet_strings.each do |octet_string|
      decimal = octet_string.to_i(8)
      octets << decimal
    end
    normalized_host = octets.join(".")
  elsif (normalized_host != nil && normalized_host.strip =~
      /^0x[0-9a-f]{2}\.0x[0-9a-f]{2}\.0x[0-9a-f]{2}\.0x[0-9a-f]{2}$/i)
    # Hexadecimal IPv4 address.  The separators must be literal dots,
    # because the host is split on "." below.
    octet_strings = normalized_host.split('.')
    octets = []
    octet_strings.each do |octet_string|
      # Skip the leading "0x" and read the two hex digits.
      decimal = octet_string[2...4].to_i(16)
      octets << decimal
    end
    normalized_host = octets.join(".")
  end
  normalized_port = self.port
  if self.class.scheme_mapping[normalized_scheme] == normalized_port
    normalized_port = nil
  end
  normalized_path = nil
  normalized_path = self.path.strip if self.path != nil
  if normalized_scheme != nil && normalized_host == nil
    # A URI such as "http:example" has no authority; for IP-based
    # schemes the path is treated as the intended host, and ".com" is
    # appended when it contains no dot.
    if self.class.ip_based_schemes.include?(normalized_scheme) &&
        normalized_path =~ /[\w\.]+/
      normalized_host = normalized_path
      normalized_path = nil
      unless normalized_host =~ /\./
        normalized_host = normalized_host + ".com"
      end
    end
  end
  if normalized_path == nil &&
      normalized_scheme != nil &&
      normalized_host != nil
    normalized_path = "/"
  end
  if normalized_path != nil
    normalized_path = self.class.normalize_path(normalized_path)
    normalized_path = self.class.normalize_escaping(normalized_path)
  end
  if normalized_path == ""
    if ["http", "https", "ftp", "tftp"].include?(normalized_scheme)
      normalized_path = "/"
    end
  end
  # Un-escape characters that are written literally in paths.
  normalized_path.gsub!(/%3B/, ";") if normalized_path != nil
  normalized_path.gsub!(/%3A/, ":") if normalized_path != nil
  normalized_path.gsub!(/%40/, "@") if normalized_path != nil
  normalized_path.gsub!(/%2B/, "+") if normalized_path != nil

  normalized_query = nil
  normalized_query = self.query.strip if self.query != nil
  normalized_query = self.class.normalize_escaping(normalized_query)
  # Un-escape characters that are written literally in query strings.
  normalized_query.gsub!(/%3D/, "=") if normalized_query != nil
  normalized_query.gsub!(/%26/, "&") if normalized_query != nil
  normalized_query.gsub!(/%2B/, "+") if normalized_query != nil

  normalized_fragment = nil
  normalized_fragment = self.fragment.strip if self.fragment != nil
  normalized_fragment = self.class.normalize_escaping(normalized_fragment)
  return URI.new(
    normalized_scheme,
    normalized_userinfo,
    normalized_host,
    normalized_port,
    normalized_path,
    normalized_query,
    normalized_fragment
  )
end

#normalize! ⇒ Object

Destructively normalizes this URI object.



513
514
515
# File 'lib/feed_tools/vendor/uri.rb', line 513

# Destructively normalizes this URI object: computes normalize and hands
# the result to replace_self, whose return value is returned.
def normalize!
  normalized = self.normalize
  replace_self(normalized)
end

#password ⇒ Object

Returns the password for this URI.



213
214
215
216
217
218
219
220
# File 'lib/feed_tools/vendor/uri.rb', line 213

# Returns the password for this URI: the part of the userinfo after its
# first ":", stripped of surrounding whitespace.
#
# Returns nil when there is no userinfo, or when the userinfo contains
# no ":" (e.g. "ftp://user@host/"); the latter case previously raised
# NoMethodError.  A non-nil result is cached in @password.
def password
  if !defined?(@password) || @password.nil?
    @password = nil
    return @password if @userinfo.nil?
    after_colon = @userinfo.strip.scan(/:(.*)$/).flatten[0]
    @password = after_colon.strip unless after_colon.nil?
  end
  return @password
end

#path ⇒ Object

Returns the path for this URI.



281
282
283
# File 'lib/feed_tools/vendor/uri.rb', line 281

# Returns the path for this URI (the stored @path value).
def path
  @path
end

#port ⇒ Object

Returns the port number for this URI. This method will normalize to the default port for the URI’s scheme if the port isn’t explicitly specified in the URI.



254
255
256
257
258
259
260
261
262
263
264
265
266
# File 'lib/feed_tools/vendor/uri.rb', line 254

# Returns the port number for this URI as an Integer.
#
# When the stored port is missing or converts to 0, the default port
# for the URI's scheme (looked up in the class's scheme_mapping after
# stripping and lower-casing the scheme) is used instead; this is nil
# when there is no scheme or the scheme is unknown.  The result is also
# written back to @port.
def port
  unless @port.to_i == 0
    @port = @port.to_i
    return @port
  end

  @port =
    if self.scheme.nil?
      nil
    else
      self.class.scheme_mapping[self.scheme.strip.downcase]
    end
  @port
end

#query ⇒ Object

Returns the query string for this URI.



286
287
288
# File 'lib/feed_tools/vendor/uri.rb', line 286

# Returns the query string for this URI (the stored @query value).
def query
  @query
end

#relative? ⇒ Boolean

Returns true if this URI is known to be relative.

Returns:

  • (Boolean)


302
303
304
# File 'lib/feed_tools/vendor/uri.rb', line 302

# Returns true if this URI is known to be relative, i.e. it has no
# scheme.
def relative?
  self.scheme.nil?
end

#scheme ⇒ Object

Returns the scheme (protocol) for this URI.



171
172
173
174
# File 'lib/feed_tools/vendor/uri.rb', line 171

# Returns the scheme (protocol) for this URI.  A scheme that is nil or
# contains only whitespace is reported as nil; otherwise the stored
# value is returned unchanged (it is not stripped).
def scheme
  return nil if @scheme.nil?
  @scheme.strip == "" ? nil : @scheme
end

#specified_port ⇒ Object

Returns the port number that was actually specified in the URI string.



269
270
271
272
273
274
275
276
277
278
# File 'lib/feed_tools/vendor/uri.rb', line 269

# Returns the port number that was actually specified in the URI string,
# as an Integer.  Returns nil when no port was specified, or when the
# stored value converts to 0 (for example a non-numeric string).
def specified_port
  @specified_port = nil unless defined?(@specified_port)
  return nil if @specified_port.nil?

  numeric = @specified_port.to_s.to_i
  numeric == 0 ? nil : numeric
end

#to_s ⇒ Object

Returns the assembled URI as a string.



591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
# File 'lib/feed_tools/vendor/uri.rb', line 591

# Returns the assembled URI as a string.  Each component that is not nil
# is appended in order with its delimiter: "scheme:", "//authority",
# the path, "?query" and "#fragment".
def to_s
  assembled = ""
  assembled << "#{self.scheme}:" unless self.scheme.nil?
  assembled << "//#{self.authority}" unless self.authority.nil?
  assembled << self.path unless self.path.nil?
  assembled << "?#{self.query}" unless self.query.nil?
  assembled << "##{self.fragment}" unless self.fragment.nil?
  assembled
end

#user ⇒ Object

Returns the user for this URI.



203
204
205
206
207
208
209
210
# File 'lib/feed_tools/vendor/uri.rb', line 203

# Returns the user for this URI: the part of the userinfo before its
# first ":", stripped of surrounding whitespace.  When the userinfo
# contains no ":" the whole userinfo is the user (this case previously
# raised NoMethodError).
#
# Splitting at the first ":" keeps this method consistent with password,
# which returns everything after the first ":".
#
# Returns nil when there is no userinfo.  A non-nil result is cached in
# @user.
def user
  if !defined?(@user) || @user.nil?
    @user = nil
    return @user if @userinfo.nil?
    # The pattern can match the empty string, so it always matches.
    @user = @userinfo.strip[/\A([^:]*)/, 1].strip
  end
  return @user
end

#userinfo ⇒ Object

Returns the username and password segment of this URI.



177
178
179
# File 'lib/feed_tools/vendor/uri.rb', line 177

# Returns the username and password segment of this URI (the stored
# @userinfo value).
def userinfo
  @userinfo
end