Module: Twitter::TwitterText::Validation

Extended by:
Deprecation, Validation
Included in:
Validation
Defined in:
lib/twitter-text/validation.rb

Defined Under Namespace

Classes: ParseResults

Constant Summary collapse

# Default option values merged under caller-supplied options by
# +tweet_length+ and +parse_tweet+. +:short_url_length+ is the number of
# characters +tweet_length+ counts for each extracted URL, in place of the
# URL's literal length.
DEFAULT_TCO_URL_LENGTHS =
{
  :short_url_length => 23,
}
# Whole-string (\A ... \z) form of the mention-or-list pattern, used by
# +valid_list?+. The +o+ flag interpolates the inner regex only once.
VALID_LIST_RE =
/\A#{Twitter::TwitterText::Regex[:valid_mention_or_list]}\z/o
# Maximum length accepted by the deprecated +tweet_invalid?+ check.
MAX_LENGTH_LEGACY =
140

Instance Method Summary collapse

Methods included from Deprecation

deprecate

Instance Method Details

#contains_invalid?(text) ⇒ Boolean

Returns:

  • (Boolean)


127
128
129
130
131
132
133
134
135
136
# File 'lib/twitter-text/validation.rb', line 127

# Reports whether +text+ contains any of the disallowed characters listed in
# Twitter::TwitterText::Regex::INVALID_CHARACTERS.
#
# @param text [String, nil] the text to inspect
# @return [Boolean] false for nil/empty text; true when a disallowed
#   character is present or when the lookup raises ArgumentError
def contains_invalid?(text)
  return false unless text
  return false if text.empty?

  begin
    Twitter::TwitterText::Regex::INVALID_CHARACTERS.any? { |forbidden| text.include?(forbidden) }
  rescue ArgumentError
    # A non-Unicode value cannot be searched; treat it as invalid.
    true
  end
end

#parse_tweet(text, options = {}) ⇒ Object

Parse input text and return hash with descriptive parameters populated.



37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# File 'lib/twitter-text/validation.rb', line 37

# Parse input text and return a ParseResults with descriptive parameters
# populated (weighted length, permillage, validity, display/valid ranges).
#
# The text is NFC-normalized and walked one code point at a time. URLs count
# as +config.transformed_url_length+, emoji (when emoji parsing is enabled)
# count as +config.default_weight+, and every other code point counts as the
# weight of the first configured range containing it, or the default weight.
# Weights are accumulated in scaled units (multiplied by +config.scale+) and
# divided back down at the end.
#
# @param text [String] the tweet text to measure
# @param options [Hash] optional settings; +:config+ overrides the default
#   Twitter::TwitterText::Configuration
# @return [ParseResults] +ParseResults.empty+ when the normalized text is
#   empty, otherwise the computed results
def parse_tweet(text, options = {})
  options = DEFAULT_TCO_URL_LENGTHS.merge(options)
  config = options[:config] || Twitter::TwitterText::Configuration.default_configuration
  normalized_text = text.to_nfc
  # Nothing to measure. The empty result must be returned explicitly;
  # otherwise execution falls through and reports range ends of -1.
  return ParseResults.empty if normalized_text.empty?

  scale = config.scale
  max_weighted_tweet_length = config.max_weighted_tweet_length
  scaled_max_weighted_tweet_length = max_weighted_tweet_length * scale
  transformed_url_length = config.transformed_url_length * scale
  ranges = config.ranges

  url_entities = Twitter::TwitterText::Extractor.extract_urls_with_indices(normalized_text)
  emoji_entities = config.emoji_parsing_enabled ? Twitter::TwitterText::Extractor.extract_emoji_with_indices(normalized_text) : []

  has_invalid_chars = false
  weighted_count = 0
  offset = 0
  display_offset = 0
  valid_offset = 0

  while offset < normalized_text.codepoint_length
    # Reset the default char weight each pass through the loop
    char_weight = config.default_weight
    entity_length = 0

    # A URL starting at the current offset is consumed whole and charged the
    # fixed transformed URL weight, regardless of its literal length.
    url_entities.each do |url_entity|
      if url_entity[:indices].first == offset
        entity_length = url_entity[:indices].last - url_entity[:indices].first
        weighted_count += transformed_url_length
        offset += entity_length
        display_offset += entity_length
        if weighted_count <= scaled_max_weighted_tweet_length
          valid_offset += entity_length
        end
        # Finding a match breaks the loop
        break
      end
    end

    # An emoji starting at the (possibly advanced) offset is consumed whole
    # and charged a single default weight.
    emoji_entities.each do |emoji_entity|
      if emoji_entity[:indices].first == offset
        entity_length = emoji_entity[:indices].last - emoji_entity[:indices].first
        weighted_count += char_weight # the default weight
        offset += entity_length
        display_offset += entity_length
        if weighted_count <= scaled_max_weighted_tweet_length
          valid_offset += entity_length
        end
        # Finding a match breaks the loop
        break
      end
    end

    # An entity was consumed this pass; re-check the loop condition before
    # looking at the next code point.
    next if entity_length > 0

    if offset < normalized_text.codepoint_length
      code_point = normalized_text[offset]

      # The first configured range containing this code point sets its weight.
      ranges.each do |range|
        if range.contains?(code_point.unpack("U").first)
          char_weight = range.weight
          break
        end
      end

      weighted_count += char_weight

      # Once an invalid character has been seen the flag stays set.
      has_invalid_chars = contains_invalid?(code_point) unless has_invalid_chars
      codepoint_length = code_point.codepoint_length
      offset += codepoint_length
      display_offset += codepoint_length

      # The valid range only grows while the text is still within the limit
      # and no invalid character has been encountered.
      if !has_invalid_chars && (weighted_count <= scaled_max_weighted_tweet_length)
        valid_offset += codepoint_length
      end
    end
  end

  # Offsets were computed on the normalized text; shift the range ends by the
  # code point difference between the original and normalized text.
  normalized_text_offset = text.codepoint_length - normalized_text.codepoint_length
  scaled_weighted_length = weighted_count / scale
  is_valid = !has_invalid_chars && (scaled_weighted_length <= max_weighted_tweet_length)
  permillage = scaled_weighted_length * 1000 / max_weighted_tweet_length

  return ParseResults.new(weighted_length: scaled_weighted_length, permillage: permillage, valid: is_valid, display_range_start: 0, display_range_end: (display_offset + normalized_text_offset - 1), valid_range_start: 0, valid_range_end: (valid_offset + normalized_text_offset - 1))
end

#tweet_invalid?(text) ⇒ Boolean

DEPRECATED: Please use parse_tweet instead.

Check the text for any reason that it may not be valid as a Tweet. This is meant as a pre-validation before posting to api.twitter.com. There are several server-side reasons for Tweets to fail but this pre-validation will allow quicker feedback.

Returns false if this text is valid. Otherwise one of the following Symbols will be returned:

<tt>:too_long</tt>:: if the <tt>text</tt> is too long
<tt>:empty</tt>:: if the <tt>text</tt> is nil or empty
<tt>:invalid_characters</tt>:: if the <tt>text</tt> contains non-Unicode or any of the disallowed Unicode characters

Returns:

  • (Boolean)


223
224
225
226
227
228
229
230
231
232
233
234
# File 'lib/twitter-text/validation.rb', line 223

# Pre-validates +text+ as a Tweet using the legacy length limit.
#
# @param text [String, nil] the candidate Tweet text
# @return [false, Symbol] false when the text passes; otherwise +:empty+,
#   +:too_long+ or +:invalid_characters+. The length check runs before the
#   character check, so over-long text reports +:too_long+.
def tweet_invalid?(text)
  return :empty unless text
  return :empty if text.empty?

  begin
    if tweet_length(text) > MAX_LENGTH_LEGACY
      :too_long
    elsif Twitter::TwitterText::Regex::INVALID_CHARACTERS.any? { |forbidden| text.include?(forbidden) }
      :invalid_characters
    else
      false
    end
  rescue ArgumentError
    # A non-Unicode value cannot be measured or searched.
    :invalid_characters
  end
end

#tweet_length(text, options = {}) ⇒ Object

DEPRECATED: Please use parse_tweet instead.

Returns the length of the string as it would be displayed. This is equivalent to the length of the Unicode NFC form (see: www.unicode.org/reports/tr15). This is needed in order to consistently calculate the length of a string no matter which actual form was transmitted. For example:

U+0065  Latin Small Letter E

+ U+0301 Combining Acute Accent


2 bytes, 2 characters, displayed as é (1 visual glyph)

… The NFC of {U+0065, U+0301} is {U+00E9}, which is a single character and has a +display_length+ of 1

The string could also contain U+00E9 already, in which case the canonicalization will not change the value.



198
199
200
201
202
203
204
205
206
207
208
209
# File 'lib/twitter-text/validation.rb', line 198

# Computes the legacy display length of +text+: the number of code points in
# its NFC form, with each extracted URL counted as +:short_url_length+
# instead of its literal length.
#
# @param text [String] the text to measure
# @param options [Hash] overrides for DEFAULT_TCO_URL_LENGTHS
# @return [Integer] the adjusted length
def tweet_length(text, options = {})
  settings = DEFAULT_TCO_URL_LENGTHS.merge(options)
  total = text.to_nfc.unpack("U*").length

  Twitter::TwitterText::Extractor.extract_urls_with_indices(text) do |url, first_index, last_index|
    # Drop the URL's literal span, then charge the shortened length.
    total -= (last_index - first_index)
    total += settings[:short_url_length] unless url.length.zero?
  end

  total
end

#valid_hashtag?(hashtag) ⇒ Boolean

Returns:

  • (Boolean)


153
154
155
156
157
158
159
# File 'lib/twitter-text/validation.rb', line 153

# Checks that +hashtag+ is exactly one hashtag and nothing else.
#
# @param hashtag [String, nil] candidate text, including its leading marker
# @return [Boolean] true when the extractor finds exactly one hashtag and it
#   equals the input minus its first character
def valid_hashtag?(hashtag)
  return false unless hashtag
  return false if hashtag.empty?

  found = Twitter::TwitterText::Extractor.extract_hashtags(hashtag)
  return false unless found.size == 1

  # The extractor yields the tag without its leading sign, so compare
  # against everything after the first character.
  found.first == hashtag[1..-1]
end

#valid_list?(username_list) ⇒ Boolean

Returns:

  • (Boolean)


147
148
149
150
151
# File 'lib/twitter-text/validation.rb', line 147

# Checks that +username_list+ is exactly one list reference and nothing else.
#
# @param username_list [String, nil] candidate text
# @return [Boolean] false for nil or empty input (matching the other
#   validators in this module); otherwise true only when VALID_LIST_RE
#   matches with an empty first capture and a non-empty fourth capture
def valid_list?(username_list)
  # Guard nil/empty up front so the call does not raise on nil.
  return false if !username_list || username_list.empty?

  match = username_list.match(VALID_LIST_RE)
  # Must have matched and had nothing before or after
  !!(match && match[1] == "" && match[4] && !match[4].empty?)
end

#valid_tweet_text?(text) ⇒ Boolean

Returns:

  • (Boolean)


237
238
239
# File 'lib/twitter-text/validation.rb', line 237

# Boolean inverse of +tweet_invalid?+.
#
# @param text [String, nil] the candidate Tweet text
# @return [Boolean] true when +tweet_invalid?+ reports no problem
def valid_tweet_text?(text)
  problem = tweet_invalid?(text)
  problem ? false : true
end

#valid_url?(url, unicode_domains = true, require_protocol = true) ⇒ Boolean

Returns:

  • (Boolean)


161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
# File 'lib/twitter-text/validation.rb', line 161

# Validates +url+ by splitting it into components and checking each one
# against the corresponding validation pattern.
#
# @param url [String, nil] the candidate URL
# @param unicode_domains [Boolean] when truthy the authority is checked
#   against the Unicode authority pattern, otherwise the plain one
# @param require_protocol [Boolean] when truthy the scheme must be valid and
#   be http or https (case-insensitive)
# @return [Boolean] whether every checked component is acceptable
def valid_url?(url, unicode_domains=true, require_protocol=true)
  return false unless url
  return false if url.empty?

  # The pattern must consume the entire input, not merely a prefix.
  components = url.match(Twitter::TwitterText::Regex[:validate_url_unencoded])
  return false unless components
  return false unless components.to_s == url

  scheme, authority, path, query, fragment = components.captures

  if require_protocol
    scheme_ok = valid_match?(scheme, Twitter::TwitterText::Regex[:validate_url_scheme]) &&
                scheme.match(/\Ahttps?\Z/i)
    return false unless scheme_ok
  end

  return false unless valid_match?(path, Twitter::TwitterText::Regex[:validate_url_path])
  # Query and fragment are passed with the optional flag set.
  return false unless valid_match?(query, Twitter::TwitterText::Regex[:validate_url_query], true)
  return false unless valid_match?(fragment, Twitter::TwitterText::Regex[:validate_url_fragment], true)

  if unicode_domains
    valid_match?(authority, Twitter::TwitterText::Regex[:validate_url_unicode_authority]) || false
  else
    valid_match?(authority, Twitter::TwitterText::Regex[:validate_url_authority])
  end
end

#valid_username?(username) ⇒ Boolean

Returns:

  • (Boolean)


138
139
140
141
142
143
144
# File 'lib/twitter-text/validation.rb', line 138

# Checks that +username+ is exactly one mention and nothing else.
#
# @param username [String, nil] candidate text, including its leading marker
# @return [Boolean] true when the extractor finds exactly one screen name
#   and it equals the input minus its first character
def valid_username?(username)
  return false unless username
  return false if username.empty?

  found = Twitter::TwitterText::Extractor.extract_mentioned_screen_names(username)
  return false unless found.size == 1

  # The extractor yields the name without its leading sign, so compare
  # against everything after the first character.
  found.first == username[1..-1]
end