Module: Twitter::TwitterText::Validation
- Extended by:
- Deprecation, Validation
- Included in:
- Validation
- Defined in:
- lib/twitter-text/validation.rb
Defined Under Namespace
Classes: ParseResults
Constant Summary collapse
- DEFAULT_TCO_URL_LENGTHS =
{ :short_url_length => 23, }
- VALID_LIST_RE =
/\A#{Twitter::TwitterText::Regex[:valid_mention_or_list]}\z/o
- MAX_LENGTH_LEGACY =
140
Instance Method Summary collapse
- #contains_invalid?(text) ⇒ Boolean
-
#parse_tweet(text, options = {}) ⇒ Object
Parse input text and return hash with descriptive parameters populated.
-
#tweet_invalid?(text) ⇒ Boolean
DEPRECATED: Please use parse_tweet instead.
-
#tweet_length(text, options = {}) ⇒ Object
DEPRECATED: Please use parse_tweet instead.
- #valid_hashtag?(hashtag) ⇒ Boolean
- #valid_list?(username_list) ⇒ Boolean
- #valid_tweet_text?(text) ⇒ Boolean
- #valid_url?(url, unicode_domains = true, require_protocol = true) ⇒ Boolean
- #valid_username?(username) ⇒ Boolean
Methods included from Deprecation
Instance Method Details
#contains_invalid?(text) ⇒ Boolean
127 128 129 130 131 132 133 134 135 136 |
# File 'lib/twitter-text/validation.rb', line 127
#
# Reports whether +text+ contains any entry of
# Twitter::TwitterText::Regex::INVALID_CHARACTERS.
#
# @param text [String, nil] the text to inspect
# @return [Boolean] false for nil or empty input; true when a disallowed
#   character is present or when the scan raises ArgumentError (treated as
#   a non-Unicode value); false otherwise
def contains_invalid?(text)
  return false unless text
  return false if text.empty?

  begin
    Twitter::TwitterText::Regex::INVALID_CHARACTERS.any? { |forbidden| text.include?(forbidden) }
  rescue ArgumentError
    # non-Unicode value.
    true
  end
end
#parse_tweet(text, options = {}) ⇒ Object
Parse input text and return hash with descriptive parameters populated.
37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 |
# File 'lib/twitter-text/validation.rb', line 37
#
# Parse input text and return a ParseResults with descriptive parameters
# populated.
#
# The text is NFC-normalized and walked one code point at a time. URL
# entities are charged the configured transformed URL length, emoji
# entities (when emoji parsing is enabled) are charged the default weight,
# and every other code point is charged the weight of the first configured
# range that contains it (or the default weight when none does).
#
# @param text [String] the tweet text to measure
# @param options [Hash] merged over DEFAULT_TCO_URL_LENGTHS; +:config+ may
#   supply the configuration object, otherwise
#   Twitter::TwitterText::Configuration.default_configuration is used
# @return [ParseResults] built with +weighted_length+, +permillage+,
#   +valid+, +display_range_start+/+display_range_end+ and
#   +valid_range_start+/+valid_range_end+; ParseResults.empty when the
#   normalized text is empty
def parse_tweet(text, options = {})
  options = DEFAULT_TCO_URL_LENGTHS.merge(options)
  config = options[:config] || Twitter::TwitterText::Configuration.default_configuration
  normalized_text = text.to_nfc

  # The empty result has to be returned explicitly; merely evaluating it
  # lets empty input fall through and be reported as a valid tweet with
  # range ends of -1.
  return ParseResults.empty if normalized_text.empty?

  scale = config.scale
  max_weighted_tweet_length = config.max_weighted_tweet_length
  scaled_max_weighted_tweet_length = max_weighted_tweet_length * scale
  transformed_url_length = config.transformed_url_length * scale
  ranges = config.ranges

  url_entities = Twitter::TwitterText::Extractor.extract_urls_with_indices(normalized_text)
  emoji_entities = config.emoji_parsing_enabled ? Twitter::TwitterText::Extractor.extract_emoji_with_indices(normalized_text) : []

  has_invalid_chars = false
  weighted_count = 0
  offset = 0
  display_offset = 0
  valid_offset = 0

  while offset < normalized_text.codepoint_length
    # Reset the default char weight each pass through the loop
    char_weight = config.default_weight
    entity_length = 0

    url_entities.each do |url_entity|
      if url_entity[:indices].first == offset
        entity_length = url_entity[:indices].last - url_entity[:indices].first
        weighted_count += transformed_url_length
        offset += entity_length
        display_offset += entity_length
        if weighted_count <= scaled_max_weighted_tweet_length
          valid_offset += entity_length
        end
        # Finding a match breaks the loop
        break
      end
    end

    emoji_entities.each do |emoji_entity|
      if emoji_entity[:indices].first == offset
        entity_length = emoji_entity[:indices].last - emoji_entity[:indices].first
        weighted_count += char_weight # the default weight
        offset += entity_length
        display_offset += entity_length
        if weighted_count <= scaled_max_weighted_tweet_length
          valid_offset += entity_length
        end
        # Finding a match breaks the loop
        break
      end
    end

    # An entity was consumed at this offset; re-check the loop condition
    # before looking at individual code points.
    next if entity_length > 0

    if offset < normalized_text.codepoint_length
      code_point = normalized_text[offset]

      # The first configured range containing the code point sets its weight.
      ranges.each do |range|
        if range.contains?(code_point.unpack("U").first)
          char_weight = range.weight
          break
        end
      end

      weighted_count += char_weight

      # Once an invalid character has been seen the flag stays set.
      has_invalid_chars = contains_invalid?(code_point) unless has_invalid_chars
      codepoint_length = code_point.codepoint_length
      offset += codepoint_length
      display_offset += codepoint_length
      if !has_invalid_chars && (weighted_count <= scaled_max_weighted_tweet_length)
        valid_offset += codepoint_length
      end
    end
  end

  # Ranges are reported against the caller's original text, so add back the
  # code points that NFC normalization removed.
  normalized_text_offset = text.codepoint_length - normalized_text.codepoint_length
  scaled_weighted_length = weighted_count / scale
  is_valid = !has_invalid_chars && (scaled_weighted_length <= max_weighted_tweet_length)
  permillage = scaled_weighted_length * 1000 / max_weighted_tweet_length

  return ParseResults.new(weighted_length: scaled_weighted_length,
                          permillage: permillage,
                          valid: is_valid,
                          display_range_start: 0,
                          display_range_end: (display_offset + normalized_text_offset - 1),
                          valid_range_start: 0,
                          valid_range_end: (valid_offset + normalized_text_offset - 1))
end
#tweet_invalid?(text) ⇒ Boolean
DEPRECATED: Please use parse_tweet instead.
Check the text for any reason that it may not be valid as a Tweet. This is meant as a pre-validation before posting to api.twitter.com. There are several server-side reasons for Tweets to fail, but this pre-validation allows quicker feedback.
Returns false if this text is valid. Otherwise one of the following Symbols will be returned:
<tt>:too_long</tt>:: if the <tt>text</tt> is too long
<tt>:empty</tt>:: if the <tt>text</tt> is nil or empty
<tt>:invalid_characters</tt>:: if the <tt>text</tt> contains non-Unicode or any of the disallowed Unicode characters
223 224 225 226 227 228 229 230 231 232 233 234 |
# File 'lib/twitter-text/validation.rb', line 223
#
# DEPRECATED: Please use parse_tweet instead.
#
# Pre-validates +text+ as a Tweet. The length check runs before the
# invalid-character check, so text failing both reports +:too_long+.
#
# @param text [String, nil] the candidate tweet text
# @return [false, Symbol] false when the text passes; otherwise
#   +:empty+ (nil or empty text), +:too_long+ (tweet_length exceeds
#   MAX_LENGTH_LEGACY) or +:invalid_characters+ (a disallowed character is
#   present, or an ArgumentError was raised while measuring or scanning)
def tweet_invalid?(text)
  return :empty if !text || text.empty?

  begin
    return :too_long if tweet_length(text) > MAX_LENGTH_LEGACY
    # Delegates to the shared helper instead of repeating the
    # INVALID_CHARACTERS scan inline.
    return :invalid_characters if contains_invalid?(text)
  rescue ArgumentError
    # non-Unicode value.
    return :invalid_characters
  end

  false
end
#tweet_length(text, options = {}) ⇒ Object
DEPRECATED: Please use parse_tweet instead.
Returns the length of the string as it would be displayed. This is equivalent to the length of the string's Unicode NFC form (see: www.unicode.org/reports/tr15). Normalizing first is needed to calculate the length consistently, no matter which form was actually transmitted. For example:
U+0065 Latin Small Letter E
+ U+0301 Combining Acute Accent
2 bytes, 2 characters, displayed as é (1 visual glyph)
… The NFC of {U+0065, U+0301} is {U+00E9}, which is a single character and has a +display_length+ of 1
The string could also contain U+00E9 already, in which case the canonicalization will not change the value.
198 199 200 201 202 203 204 205 206 207 208 209 |
# File 'lib/twitter-text/validation.rb', line 198
#
# DEPRECATED: Please use parse_tweet instead.
#
# Returns the number of code points in the NFC form of +text+, with every
# extracted URL counted as +options[:short_url_length]+ instead of its
# literal length.
#
# @param text [String] the text to measure
# @param options [Hash] merged over DEFAULT_TCO_URL_LENGTHS;
#   +:short_url_length+ is the length charged for each URL
# @return [Integer] the adjusted length
def tweet_length(text, options = {})
  options = DEFAULT_TCO_URL_LENGTHS.merge(options)

  length = text.to_nfc.unpack("U*").length

  Twitter::TwitterText::Extractor.extract_urls_with_indices(text) do |url, start_position, end_position|
    # start - end is negative: remove the URL's literal span...
    length += start_position - end_position
    # ...and charge the fixed shortened-URL length in its place.
    length += options[:short_url_length] if url.length > 0
  end

  length
end
#valid_hashtag?(hashtag) ⇒ Boolean
153 154 155 156 157 158 159 |
# File 'lib/twitter-text/validation.rb', line 153
#
# Checks that +hashtag+ is exactly one hashtag and nothing else.
#
# @param hashtag [String, nil] the candidate, including its leading sign
# @return [Boolean] false for nil or empty input; otherwise true only when
#   the extractor finds a single hashtag equal to the input minus its
#   first character
def valid_hashtag?(hashtag)
  return false if !hashtag || hashtag.empty?

  extracted = Twitter::TwitterText::Extractor.extract_hashtags(hashtag)
  # Should extract the hashtag minus the # sign, hence the [1..-1]
  extracted.size == 1 && extracted.first == hashtag[1..-1]
end
#valid_list?(username_list) ⇒ Boolean
147 148 149 150 151 |
# File 'lib/twitter-text/validation.rb', line 147
#
# Checks that +username_list+ is exactly one list reference and nothing
# else.
#
# @param username_list [String, nil] the candidate list reference
# @return [Boolean] false for nil or empty input (matching the sibling
#   validators, rather than raising on nil); otherwise true only when
#   VALID_LIST_RE matches with an empty first capture and a non-empty
#   fourth capture
def valid_list?(username_list)
  return false if !username_list || username_list.empty?

  match = username_list.match(VALID_LIST_RE)
  # Must have matched and had nothing before or after
  # (capture 4 is presumably the list-slug part; confirm against
  # Regex[:valid_mention_or_list]).
  !!(match && match[1] == "" && match[4] && !match[4].empty?)
end
#valid_tweet_text?(text) ⇒ Boolean
237 238 239 |
# File 'lib/twitter-text/validation.rb', line 237
#
# Boolean inverse of tweet_invalid?.
#
# @param text [String, nil] the candidate tweet text
# @return [Boolean] true when tweet_invalid? reports no failure reason
def valid_tweet_text?(text)
  failure_reason = tweet_invalid?(text)
  !failure_reason
end
#valid_url?(url, unicode_domains = true, require_protocol = true) ⇒ Boolean
161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 |
# File 'lib/twitter-text/validation.rb', line 161
#
# Validates +url+ by splitting it into scheme, authority, path, query and
# fragment and checking each part against its own pattern.
#
# @param url [String, nil] the candidate URL
# @param unicode_domains [Boolean] when true the authority is checked
#   against Regex[:validate_url_unicode_authority], otherwise against
#   Regex[:validate_url_authority]
# @param require_protocol [Boolean] when true the scheme must be present
#   and be http or https (case-insensitive)
# @return [Boolean] false for nil or empty input, or when any part fails
#   its check
def valid_url?(url, unicode_domains=true, require_protocol=true)
  return false if !url || url.empty?

  url_parts = url.match(Twitter::TwitterText::Regex[:validate_url_unencoded])
  # The splitting pattern must consume the entire input.
  return false unless (url_parts && url_parts.to_s == url)

  scheme, authority, path, query, fragment = url_parts.captures

  # Query and fragment pass `true` as the third argument to valid_match?,
  # presumably marking them optional; confirm against its definition.
  return false unless ((!require_protocol ||
                        (valid_match?(scheme, Twitter::TwitterText::Regex[:validate_url_scheme]) && scheme.match(/\Ahttps?\Z/i))) &&
                       valid_match?(path, Twitter::TwitterText::Regex[:validate_url_path]) &&
                       valid_match?(query, Twitter::TwitterText::Regex[:validate_url_query], true) &&
                       valid_match?(fragment, Twitter::TwitterText::Regex[:validate_url_fragment], true))

  return (unicode_domains && valid_match?(authority, Twitter::TwitterText::Regex[:validate_url_unicode_authority])) ||
         (!unicode_domains && valid_match?(authority, Twitter::TwitterText::Regex[:validate_url_authority]))
end
#valid_username?(username) ⇒ Boolean
138 139 140 141 142 143 144 |
# File 'lib/twitter-text/validation.rb', line 138
#
# Checks that +username+ is exactly one mention and nothing else.
#
# @param username [String, nil] the candidate, including its leading sign
# @return [Boolean] false for nil or empty input; otherwise true only when
#   the extractor finds a single screen name equal to the input minus its
#   first character
def valid_username?(username)
  return false unless username
  return false if username.empty?

  screen_names = Twitter::TwitterText::Extractor.extract_mentioned_screen_names(username)
  return false unless screen_names.size == 1

  # Should extract the username minus the @ sign, hence the [1..-1]
  screen_names.first == username[1..-1]
end