Class: URIChunk

Inherits:
Chunk::Abstract show all
Includes:
URI::REGEXP::PATTERN
Defined in:
app/models/chunks/uri.rb

Overview

This wiki chunk matches arbitrary URIs, using patterns from the Ruby URI modules. It parses out a variety of fields that could be used by renderers to format the links in various ways (shortening domain names, hiding email addresses). It also matches email addresses and host.com.au domains without schemes (http://), but adds these on as required.

The heuristic used to match a URI is designed to err on the side of caution. That is, it is more likely to not autolink a URI than it is to accidentally autolink something that is not a URI. The reason behind this is that it is easier to force a URI link by prefixing ‘http://’ to it than it is to escape an incorrectly marked up non-URI.

I’m using a part of the [ISO 3166-1 Standard][iso3166] for country name suffixes. The generic names are from www.bnoack.com/data/countrycode2.html.

[iso3166]: http://geotags.com/iso3166/

Direct Known Subclasses

LocalURIChunk

Constant Summary collapse

# Generic (non-country) top-level domain labels, written as a regexp
# alternation so the string can be interpolated into TLDS.
GENERIC =
'aero|biz|com|coop|edu|gov|info|int|mil|museum|name|net|org'
# Two-letter country-code top-level domain labels, written as a regexp
# alternation so the string can be interpolated into TLDS.
COUNTRY =
'ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|az|ba|bb|bd|be|' + 
'bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cf|cd|cg|ch|ci|ck|cl|' + 
'cm|cn|co|cr|cs|cu|cv|cx|cy|cz|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|fi|' + 
'fj|fk|fm|fo|fr|fx|ga|gb|gd|ge|gf|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|' + 
'hk|hm|hn|hr|ht|hu|id|ie|il|in|io|iq|ir|is|it|jm|jo|jp|ke|kg|kh|ki|km|kn|' + 
'kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|mg|mh|mk|ml|mm|' + 
'mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nt|' + 
'nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|pt|pw|py|qa|re|ro|ru|rw|sa|sb|sc|' + 
'sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|' + 
'tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|um|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|' + 
'ws|ye|yt|yu|za|zm|zr|zw'
TLDS =

These are needed otherwise HOST will match almost anything

"(?:#{GENERIC}|#{COUNTRY})"
USERINFO =

Redefine USERINFO so that it must have non-zero length

"(?:[#{UNRESERVED};:&=+$,]|#{ESCAPED})+"
UNRESERVED_NO_ENDING =

unreserved_no_ending = alphanum | mark, but URI_ENDING [)!] excluded

"-_.~*'(#{ALNUM}"
URIC_NO_ENDING =

uric_no_ending = reserved | unreserved_no_ending | escaped

"(?:[#{UNRESERVED_NO_ENDING}#{RESERVED}]|#{ESCAPED})"
QUERY =

query = *uric

"#{URIC_NO_ENDING}*"
FRAGMENT =

fragment = *uric

"#{URIC_NO_ENDING}*"
INTERNET_HOSTNAME =

DOMLABEL is defined in the ruby uri library, TLDS is defined above

"(?:#{DOMLABEL}\\.)+#{TLDS}"
PORT =

Correct a typo bug in ruby 1.8.x lib/uri/common.rb

'\\d*'
# Regexp source for an internet URI: optional scheme and userinfo, a mandatory
# hostname ending in a known TLD, then optional port, path, query and fragment.
# The (\n) group numbers in the trailing comments count the groups of this
# string alone. INTERNET_URI_REGEXP prepends SUSPICIOUS_PRECEDING_CHARACTER,
# which adds one capture group in front, so in the compiled regexp every
# group listed here is shifted up by one.
INTERNET_URI =
"(?:(#{SCHEME}):/{0,2})?" +   # Optional scheme:        (\1)
"(?:(#{USERINFO})@)?" +       # Optional userinfo@      (\2)
"(#{INTERNET_HOSTNAME})" +    # Mandatory hostname      (\3)
"(?::(#{PORT}))?" +           # Optional :port          (\4)
"(#{ABS_PATH})?"  +           # Optional absolute path  (\5)
"(?:\\?(#{QUERY}))?" +        # Optional ?query         (\6)
"(?:\\#(#{FRAGMENT}))?"  +    # Optional #fragment      (\7)
'(?=\.?(?:\s|\)|\z))'         # Lookahead: optional dot, then whitespace, ")" or end of string
SUSPICIOUS_PRECEDING_CHARACTER =

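(This note describes the trailing lookahead of INTERNET_URI above, not this constant: a URI match ends only with an optional dot followed by whitespace, “)” or the end of the string.)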

'(!|\"\:|\"|\\\'|\]\()?'
INTERNET_URI_REGEXP =

(This note describes SUSPICIOUS_PRECEDING_CHARACTER above, not this constant: it matches any of !, “:, ”, ‘, ]( .)

Regexp.new(SUSPICIOUS_PRECEDING_CHARACTER + INTERNET_URI, Regexp::EXTENDED, 'N')

Instance Attribute Summary collapse

Attributes inherited from Chunk::Abstract

#text, #unmask_mode, #unmask_text

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from Chunk::Abstract

#escaped?, #id, inherited, #mask, mask_re, mask_string, #rendered?, #revert, #unmask

Constructor Details

#initialize(match_data, content) ⇒ URIChunk

Returns a new instance of URIChunk.



100
101
102
103
104
105
106
107
# File 'app/models/chunks/uri.rb', line 100

# Builds a chunk from one match of the URI pattern.
#
# match_data[0] is the whole matched text and match_data[1] the optional
# "suspicious" character found in front of the URI. The remaining groups
# are, in order: scheme, userinfo, host, port, path, query and fragment.
# After the trailing-punctuation fix-up, the HTML anchor used when the chunk
# is unmasked is stored in @unmask_text.
def initialize(match_data, content)
  super
  @link_text, @suspicious_preceding_character = match_data[0], match_data[1]
  components = match_data[2..-1]
  @original_scheme, @user, @host, @port, @path, @query, @fragment = components
  # Must run before the anchor is built: it may shorten @link_text and the
  # last URI component.
  treat_trailing_character
  @unmask_text = %(<a href="#{uri}">#{link_text}</a>)
end

Instance Attribute Details

#fragmentObject (readonly)

Returns the value of attribute fragment.



85
86
87
# File 'app/models/chunks/uri.rb', line 85

# Reader for @fragment: the text captured after "#" in the matched URI, as
# assigned in #initialize. nil when the match had no fragment part.
def fragment
  @fragment
end

#hostObject (readonly)

Returns the value of attribute host.



85
86
87
# File 'app/models/chunks/uri.rb', line 85

# Reader for @host: the hostname captured from the matched URI, as assigned
# in #initialize.
def host
  @host
end

#link_textObject (readonly)

Returns the value of attribute link_text.



85
86
87
# File 'app/models/chunks/uri.rb', line 85

# Reader for @link_text: the whole matched text (match_data[0] in
# #initialize), with a final ")" or "!" removed by #treat_trailing_character.
def link_text
  @link_text
end

#pathObject (readonly)

Returns the value of attribute path.



85
86
87
# File 'app/models/chunks/uri.rb', line 85

# Reader for @path: the absolute path captured from the matched URI, as
# assigned in #initialize. nil when the match had no path.
def path
  @path
end

#portObject (readonly)

Returns the value of attribute port.



85
86
87
# File 'app/models/chunks/uri.rb', line 85

# Reader for @port: the text captured after ":" following the host, as
# assigned in #initialize. nil when the match had no port part.
def port
  @port
end

#queryObject (readonly)

Returns the value of attribute query.



85
86
87
# File 'app/models/chunks/uri.rb', line 85

# Reader for @query: the text captured after "?" in the matched URI, as
# assigned in #initialize. nil when the match had no query part.
def query
  @query
end

#userObject (readonly)

Returns the value of attribute user.



85
86
87
# File 'app/models/chunks/uri.rb', line 85

# Reader for @user: the userinfo captured before "@" in the matched URI, as
# assigned in #initialize. nil when the match had no userinfo part.
def user
  @user
end

Class Method Details

.apply_to(content) ⇒ Object



87
88
89
90
91
92
93
94
95
96
97
98
# File 'app/models/chunks/uri.rb', line 87

# Scans +content+ in place for URIs and masks each acceptable one.
#
# A chunk is built for every match of the class pattern. When the chunk
# reports avoid_autolinking?, it is dropped and the matched text is put back
# unchanged. Otherwise the chunk is registered with content.add_chunk and the
# matched text is replaced by chunk.mask. Returns the result of content.gsub!.
def self.apply_to(content)
  content.gsub!(pattern) do |matched_text|
    candidate = new(Regexp.last_match, content)
    # Leave the text alone and do not register the chunk.
    next matched_text if candidate.avoid_autolinking?

    content.add_chunk(candidate)
    candidate.mask
  end
end

.patternObject



81
82
83
# File 'app/models/chunks/uri.rb', line 81

# The regexp that apply_to scans content with.
def self.pattern
  INTERNET_URI_REGEXP
end

Instance Method Details

#avoid_autolinking?Boolean

Returns:

  • (Boolean)


109
110
111
# File 'app/models/chunks/uri.rb', line 109

# True when the match was preceded by one of the "suspicious" characters,
# i.e. when the first capture group of the pattern matched something.
def avoid_autolinking?
  !@suspicious_preceding_character.nil?
end

#port_delimiterObject



139
140
141
# File 'app/models/chunks/uri.rb', line 139

# ":" when a port was captured, nil otherwise.
def port_delimiter
  @port.nil? ? nil : ':'
end

#query_delimiterObject



143
144
145
# File 'app/models/chunks/uri.rb', line 143

# "?" when a query was captured, nil otherwise.
def query_delimiter
  @query.nil? ? nil : '?'
end

#schemeObject



127
128
129
# File 'app/models/chunks/uri.rb', line 127

# The scheme to use for the link: the one written in the text if there was
# one, otherwise "mailto" when userinfo was captured and "http" when not.
def scheme
  return @original_scheme if @original_scheme

  @user ? 'mailto' : 'http'
end

#scheme_delimiterObject



131
132
133
# File 'app/models/chunks/uri.rb', line 131

# Separator placed after the scheme: ":" for mailto, "://" for anything else.
def scheme_delimiter
  if scheme == 'mailto'
    ':'
  else
    '://'
  end
end

#treat_trailing_characterObject



113
114
115
116
117
118
119
120
121
122
123
124
125
# File 'app/models/chunks/uri.rb', line 113

# Splits a final ")" or "!" off the matched text.
#
# Such a character may belong to the surrounding markup rather than to the
# URL. This could be expressed in the regexp, but it is simpler to handle
# here. When present, the character is remembered in @trailing_punctuation
# and removed both from @link_text and from the last URI component that was
# captured; otherwise @trailing_punctuation is set to nil.
def treat_trailing_character
  final_char = @link_text[-1, 1]
  unless [')', '!'].include?(final_char)
    @trailing_punctuation = nil
    return
  end

  @trailing_punctuation = final_char
  @link_text.chop!
  components = [@original_scheme, @user, @host, @port, @path, @query, @fragment]
  # The last non-nil component is the one that swallowed the character.
  components.compact.last.chop!
end

#uriObject



147
148
149
150
# File 'app/models/chunks/uri.rb', line 147

# Assembles the full URI used as the link target.
#
# Components that were not captured (and their delimiters) are nil and are
# skipped. The scheme falls back to the default chosen by #scheme, so the
# result always starts with a scheme.
#
# The fragment is included: it is parsed by the pattern and stays part of
# the visible link text, so leaving it out made "host/page#top" link to
# "host/page".
def uri
  parts = [scheme, scheme_delimiter, user, user_delimiter, host, port_delimiter, port, path,
           query_delimiter, query]
  parts.push('#', fragment) unless fragment.nil?
  parts.compact.join
end

#user_delimiterObject



135
136
137
# File 'app/models/chunks/uri.rb', line 135

# "@" when userinfo was captured, nil otherwise.
def user_delimiter
  @user.nil? ? nil : '@'
end