Module: Twitter::Extractor

Extended by:
Extractor
Included in:
Extractor
Defined in:
lib/twitter-text/extractor.rb

Overview

A module for including Tweet parsing in a class. This module provides functions for the extraction and processing of usernames, lists, URLs, hashtags and cashtags.

Instance Method Summary collapse

Instance Method Details

#extract_cashtags(text, &block) ⇒ Object

Extracts a list of all cashtags included in the Tweet text. If the text is nil or contains no cashtags an empty array will be returned. The array returned will not include the leading $ character.

If a block is given then it will be called for each cashtag.



298
299
300
301
302
# File 'lib/twitter-text/extractor.rb', line 298

# Returns the text of every cashtag found in +text+ (the :cashtag value of
# each entity produced by extract_cashtags_with_indices).
#
# When a block is supplied, each cashtag string is also passed to it. The
# array of cashtag strings is returned either way.
def extract_cashtags(text, &block) # :yields: cashtag_text
  symbols = extract_cashtags_with_indices(text).map { |entity| entity[:cashtag] }
  symbols.each(&block) if block
  symbols
end

#extract_cashtags_with_indices(text) ⇒ Object

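Extracts a list of all cashtags included in the Tweet text along with the indices for where each cashtag occurred. If the text is nil or contains no cashtags an empty array will be returned. Each element of the returned array is a hash with the cashtag text under :cashtag and its start and end positions under :indices; the start index is the position of the $ character.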

If a block is given, then it will be called with each cashtag, the start index, and the end index in the text.



310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
# File 'lib/twitter-text/extractor.rb', line 310

# Finds every cashtag in +text+ and returns an array of hashes of the form
#   {:cashtag => "TEXT", :indices => [start, end]}
# where +start+ is the position of capture group 2 (the "$") and +end+ is the
# end of capture group 3 (the cashtag text).
#
# Returns [] when +text+ is nil or contains no "$". When a block is given it
# is called with the cashtag text, start index and end index of each entry.
def extract_cashtags_with_indices(text) # :yields: cashtag_text, start, end
  # Cheap pre-check: skip the full regex scan when there is no "$" at all.
  return [] unless text =~ /\$/

  found = []
  text.scan(Twitter::Regex[:valid_cashtag]) do |_before, _dollar, symbol|
    match = Regexp.last_match
    found << {
      :cashtag => symbol,
      :indices => [match.char_begin(2), match.char_end(3)]
    }
  end

  if block_given?
    found.each do |entry|
      first_index, last_index = entry[:indices].first, entry[:indices].last
      yield entry[:cashtag], first_index, last_index
    end
  end
  found
end

#extract_entities_with_indices(text, options = {}, &block) ⇒ Object

Extracts all usernames, lists, hashtags, cashtags and URLs in the Tweet text along with the indices for where each entity occurred. If the text is nil or contains no entity an empty array will be returned.

If a block is given then it will be called for each entity.



68
69
70
71
72
73
74
75
76
77
78
79
80
81
# File 'lib/twitter-text/extractor.rb', line 68

# Collects URL, hashtag, mention/list and cashtag entities from +text+ into
# one array, passes it through remove_overlapping_entities, and returns it.
#
# +options+ is forwarded to extract_urls_with_indices. Hashtags are extracted
# with :check_url_overlap => false because overlap removal is done here for
# all entity kinds at once.
#
# Returns [] when no entity is found. When a block is given it is called with
# each remaining entity hash.
def extract_entities_with_indices(text, options = {}, &block)
  found = [
    extract_urls_with_indices(text, options),
    extract_hashtags_with_indices(text, :check_url_overlap => false),
    extract_mentions_or_lists_with_indices(text),
    extract_cashtags_with_indices(text)
  ].reduce(:+)

  return [] if found.empty?

  remove_overlapping_entities(found).tap do |kept|
    kept.each(&block) if block_given?
  end
end

#extract_hashtags(text, &block) ⇒ Object

Extracts a list of all hashtags included in the Tweet text. If the text is nil or contains no hashtags an empty array will be returned. The array returned will not include the leading # character.

If a block is given then it will be called for each hashtag.



247
248
249
250
251
# File 'lib/twitter-text/extractor.rb', line 247

# Returns the text of every hashtag found in +text+ (the :hashtag value of
# each entity produced by extract_hashtags_with_indices).
#
# When a block is supplied, each hashtag string is also passed to it.
def extract_hashtags(text, &block) # :yields: hashtag_text
  extract_hashtags_with_indices(text).map { |entity| entity[:hashtag] }.tap do |names|
    names.each(&block) if block_given?
  end
end

#extract_hashtags_with_indices(text, options = {:check_url_overlap => true}) ⇒ Object

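Extracts a list of all hashtags included in the Tweet text along with the indices for where each hashtag occurred. If the text is nil or contains no hashtags an empty array will be returned. Each element of the returned array is a hash with the hashtag text under :hashtag and its start and end positions under :indices; the start index is the position of the # character.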

If a block is given, then it will be called with each hashtag, the start index, and the end index in the text.



259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
# File 'lib/twitter-text/extractor.rb', line 259

# Finds every hashtag in +text+ and returns an array of hashes of the form
#   {:hashtag => "text", :indices => [start, end]}
# where +start+ is the position of capture group 2 (the hash sign) and +end+
# is the end of capture group 3 (the hashtag text).
#
# A candidate is dropped when the text immediately following it matches
# Twitter::Regex[:end_hashtag_match].
#
# When options[:check_url_overlap] is truthy, URLs are extracted as well and
# any hashtag that overlaps a URL is removed via remove_overlapping_entities;
# the URL entities themselves are not returned.
#
# Returns [] when +text+ is nil or contains no hash sign. When a block is
# given it is called with the hashtag text, start index and end index.
def extract_hashtags_with_indices(text, options = {:check_url_overlap => true}) # :yields: hashtag_text, start, end
  # Cheap pre-check before the full scan. Accept the ASCII "#" as well as the
  # full-width number sign (U+FF03); the class previously listed "#" twice,
  # so text using only the full-width sign never reached the real matcher.
  return [] unless text =~ /[#\uFF03]/

  tags = []
  text.scan(Twitter::Regex[:valid_hashtag]) do |_before, _hash, hash_text|
    # Capture the match before the next =~ overwrites the last-match state.
    match_data = Regexp.last_match
    next if match_data.post_match =~ Twitter::Regex[:end_hashtag_match]

    tags << {
      :hashtag => hash_text,
      :indices => [match_data.char_begin(2), match_data.char_end(3)]
    }
  end

  if options[:check_url_overlap]
    urls = extract_urls_with_indices(text)
    unless urls.empty?
      # Resolve overlaps against the URLs, then keep only the hashtag entities.
      tags = remove_overlapping_entities(tags + urls).select { |entity| entity[:hashtag] }
    end
  end

  tags.each { |tag| yield tag[:hashtag], tag[:indices].first, tag[:indices].last } if block_given?
  tags
end

#extract_mentioned_screen_names(text, &block) ⇒ Object

Extracts a list of all usernames mentioned in the Tweet text. If the text is nil or contains no username mentions an empty array will be returned.

If a block is given then it will be called for each username.



88
89
90
91
92
# File 'lib/twitter-text/extractor.rb', line 88

# Returns the screen name of every username mention found in +text+ (the
# :screen_name value of each entry produced by
# extract_mentioned_screen_names_with_indices).
#
# When a block is supplied, each screen name is also passed to it.
def extract_mentioned_screen_names(text, &block) # :yields: username
  names = extract_mentioned_screen_names_with_indices(text).map { |mention| mention[:screen_name] }
  names.each(&block) unless block.nil?
  names
end

#extract_mentioned_screen_names_with_indices(text) ⇒ Object

Extracts a list of all usernames mentioned in the Tweet text along with the indices for where the mention occurred. If the text is nil or contains no username mentions, an empty array will be returned.

If a block is given, then it will be called with each username, the start index, and the end index in the text.



101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# File 'lib/twitter-text/extractor.rb', line 101

# Returns the username mentions in +text+ as an array of hashes of the form
#   {:screen_name => "name", :indices => [start, end]}
#
# Built on extract_mentions_or_lists_with_indices; entries whose list slug is
# non-empty (list mentions) are skipped.
#
# Returns [] when +text+ is nil. When a block is given it is called with the
# screen name, start index and end index of each mention.
def extract_mentioned_screen_names_with_indices(text) # :yields: username, start, end
  return [] unless text

  mentions = []
  extract_mentions_or_lists_with_indices(text) do |name, slug, first_index, last_index|
    # An empty slug marks a plain username mention rather than a list.
    mentions << {:screen_name => name, :indices => [first_index, last_index]} if slug.empty?
  end

  mentions.each { |entry| yield entry[:screen_name], *entry[:indices] } if block_given?
  mentions
end

#extract_mentions_or_lists_with_indices(text) ⇒ Object

Extracts a list of all usernames or lists mentioned in the Tweet text along with the indices for where the mention occurred. If the text is nil or contains no username or list mentions, an empty array will be returned.

If a block is given, then it will be called with each username, list slug, the start index, and the end index in the text. The list_slug will be an empty string if this is a username mention.



130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
# File 'lib/twitter-text/extractor.rb', line 130

# Finds every username or list mention in +text+ and returns an array of
# hashes of the form
#   {:screen_name => "name", :list_slug => "/slug" or "", :indices => [start, end]}
#
# +start+ is one position before capture group 3 (the screen name), so the
# span includes the at sign. +end+ is the end of the list slug when one was
# captured, otherwise the end of the screen name.
#
# A candidate is dropped when the text immediately following it matches
# Twitter::Regex[:end_mention_match].
#
# Returns [] when +text+ is nil or contains no at sign. When a block is given
# it is called with the screen name, list slug, start index and end index.
def extract_mentions_or_lists_with_indices(text) # :yields: username, list_slug, start, end
  # Cheap pre-check before the full scan. Accept the ASCII "@" as well as the
  # full-width commercial at (U+FF20); the class previously listed "@" twice,
  # so text using only the full-width sign never reached the real matcher.
  return [] unless text =~ /[@\uFF20]/

  possible_entries = []
  text.to_s.scan(Twitter::Regex[:valid_mention_or_list]) do |_before, _at, screen_name, list_slug|
    # Capture the match before the next =~ overwrites the last-match state.
    match_data = Regexp.last_match
    next if match_data.post_match =~ Twitter::Regex[:end_mention_match]

    start_position = match_data.char_begin(3) - 1
    end_position = match_data.char_end(list_slug ? 4 : 3)
    possible_entries << {
      :screen_name => screen_name,
      :list_slug => list_slug || "",
      :indices => [start_position, end_position]
    }
  end

  if block_given?
    possible_entries.each do |mention|
      yield mention[:screen_name], mention[:list_slug], mention[:indices].first, mention[:indices].last
    end
  end

  possible_entries
end

#extract_reply_screen_name(text) {|screen_name| ... } ⇒ Object

Extracts the username replied to in the Tweet text. If the text is nil or is not a reply, nil will be returned.

If a block is given then it will be called with the username replied to (if any)

Yields:

  • (screen_name)


161
162
163
164
165
166
167
168
169
170
# File 'lib/twitter-text/extractor.rb', line 161

# Returns the screen name captured by Twitter::Regex[:valid_reply] in +text+,
# or nil when +text+ is nil, the pattern does not match, or the text right
# after the match matches Twitter::Regex[:end_mention_match].
#
# When a block is given and a screen name was found, the block is called
# with that screen name.
def extract_reply_screen_name(text) # :yields: username
  return nil unless text

  reply_match = text.match(Twitter::Regex[:valid_reply])
  # A failed match yields nil, which has no #captures.
  return unless reply_match.respond_to?(:captures)
  return if reply_match.post_match =~ Twitter::Regex[:end_mention_match]

  reply_match.captures.first.tap do |replied_to|
    yield replied_to if block_given?
  end
end

#extract_urls(text, &block) ⇒ Object

Extracts a list of all URLs included in the Tweet text. If the text is nil or contains no URLs an empty array will be returned.

If a block is given then it will be called for each URL.



177
178
179
180
181
# File 'lib/twitter-text/extractor.rb', line 177

# Returns every URL string found in +text+ (the :url value of each entity
# produced by extract_urls_with_indices).
#
# When a block is supplied, each URL string is also passed to it.
def extract_urls(text, &block) # :yields: url
  found = extract_urls_with_indices(text).map { |entity| entity[:url] }
  return found unless block_given?

  # Array#each returns its receiver, so the list is still the return value.
  found.each(&block)
end

#extract_urls_with_indices(text, options = {:extract_url_without_protocol => true}) ⇒ Object

Extracts a list of all URLs included in the Tweet text along with the indices. If the text is nil or contains no URLs an empty array will be returned.

If a block is given then it will be called for each URL.



188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
# File 'lib/twitter-text/extractor.rb', line 188

# Finds every URL in +text+ and returns an array of hashes of the form
#   {:url => "...", :indices => [start, end]}
# where the indices are the start and end of capture group 3 (the URL) of
# Twitter::Regex[:valid_url], adjusted as described below.
#
# URLs with a protocol are taken as matched, except that a URL matching
# Twitter::Regex[:valid_tco_url] is cut down to the part that pattern matches.
#
# URLs without a protocol are only considered when
# options[:extract_url_without_protocol] is truthy and the preceding text does
# not match Twitter::Regex[:invalid_url_without_protocol_preceding_chars].
# For those, each Twitter::Regex[:valid_ascii_domain] match inside the domain
# becomes its own entry unless it matches Twitter::Regex[:invalid_short_domain];
# when a path was captured, the last such domain is extended to the end of
# the URL (and is added even if it was a short domain).
#
# Returns [] when +text+ is nil, or when it lacks a "." (protocol-less URLs
# enabled) or a ":" (protocol-less URLs disabled). When a block is given it
# is called with the URL, start index and end index of each entry.
def extract_urls_with_indices(text, options = {:extract_url_without_protocol => true}) # :yields: url, start, end
  return [] unless text && (options[:extract_url_without_protocol] ? text.index(".") : text.index(":"))
  urls = []

  text.to_s.scan(Twitter::Regex[:valid_url]) do |_all, before, url, protocol, domain, _port, path, _query|
    # Capture the match now; the nested matches below overwrite the last-match state.
    valid_url_match_data = Regexp.last_match

    start_position = valid_url_match_data.char_begin(3)
    end_position = valid_url_match_data.char_end(3)

    if protocol
      # In the case of t.co URLs, don't allow additional path characters
      if url =~ Twitter::Regex[:valid_tco_url]
        url = Regexp.last_match(0)
        end_position = start_position + url.char_length
      end
      urls << {
        :url => url,
        :indices => [start_position, end_position]
      }
      next
    end

    # Protocol is missing: only ASCII-only domains are extracted.
    next if !options[:extract_url_without_protocol] || before =~ Twitter::Regex[:invalid_url_without_protocol_preceding_chars]

    last_url = nil
    last_url_invalid_match = nil
    domain.scan(Twitter::Regex[:valid_ascii_domain]) do |ascii_domain|
      ascii_match = Regexp.last_match
      last_url = {
        :url => ascii_domain,
        # Offsets inside the domain are relative to the URL start.
        :indices => [start_position + ascii_match.char_begin(0),
                     start_position + ascii_match.char_end(0)]
      }
      last_url_invalid_match = ascii_domain =~ Twitter::Regex[:invalid_short_domain]
      urls << last_url unless last_url_invalid_match
    end

    # no ASCII-only domain found. Skip the entire URL
    next unless last_url

    # last_url only contains domain. Need to add path and query if they exist.
    if path
      # A short domain was held back above; with a path it is accepted after all.
      urls << last_url if last_url_invalid_match
      last_url[:url] = url.sub(domain, last_url[:url])
      last_url[:indices][1] = end_position
    end
  end

  urls.each { |entry| yield entry[:url], entry[:indices].first, entry[:indices].last } if block_given?
  urls
end

#remove_overlapping_entities(entities) ⇒ Object

Remove overlapping entities. This returns a new array with no overlapping entities.



52
53
54
55
56
57
58
59
60
# File 'lib/twitter-text/extractor.rb', line 52

# Returns a new array of +entities+ ordered by start index, with every entity
# dropped whose start index lies before the end index of the most recently
# kept entity. The array passed in is not modified.
#
# Each entity is expected to expose its span as entity[:indices] => [start, end].
def remove_overlapping_entities(entities)
  last_kept = nil
  entities.sort_by { |entity| entity[:indices].first }.select do |entity|
    overlaps = last_kept && last_kept[:indices].last > entity[:indices].first
    # Only a kept entity becomes the reference for later comparisons.
    last_kept = entity unless overlaps
    !overlaps
  end
end