Module: RbLibText
- Defined in:
- lib/rb_lib_text.rb,
lib/rb_lib_text/version.rb
Constant Summary collapse
- VERSION = "0.0.1"
Class Method Summary collapse
Class Method Details
.or_pattern ⇒ Object
5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 |
# File 'lib/rb_lib_text.rb', line 5
#
# Builds the tokenizer regular expression: a single alternation (via
# Regexp.union) of every sub-pattern listed below.
#
# The order of the hash entries matters. At any position in the input the
# first alternative that matches wins, so the specific patterns (numbers,
# e-mails, URLs, ...) are listed before the generic ones, and `all_other`
# comes last as a single-character catch-all.
#
# The compiled expression is memoized, so repeated calls return the same
# Regexp object instead of recompiling every sub-pattern.
#
# @return [Regexp] the union of all token patterns
def self.or_pattern
  @or_pattern ||= begin
    patterns = {
      html_chars: '&\w+;', # HTML character entities such as &gt; &lt; &amp;
      numbers_commas: '[\-\$]?\d{1,3}(?:,\d{3})+', # like 2,000,000
      times: '\d?\d:\d{2}', # like 2:12
      money: '-?\$?\d+[.]\d+%?', # decimal numerics: money, percentages
      acronyms: '(?:\w{1}\.{1})+', # like U.T.
      possessive_mentions: '@\w+', # splits the possessive off of @jimbob's
      possessive_hashtags: '#\w+', # splits the possessive off of #tcot's
      tags_contractions: '[\w]+[\'‘’][\w]+', # keep don't, can't and it's whole
      emails: '[\w\.\d]+@[\w\.\d]+\.[\w]+', # e-mail addresses
      urls: 'https?://[-_/~%\w\d\.]*[_/~\w\d]', # http(s) URLs
      # Disabled in the original source:
      # sideways_text_emoji: '>?[:;=][\'\-D\)\]\(\[pPdoO/\*3\\]+',
      ellipses: '\.{3}',
      en_em_dash: '-{2,3}', # en and em dashes typed as -- or ---
      slashes: '[\w]+(?:[/\-][\w]+)+', # words joined by / or -
      punct: '[\"“”‘’\'\\.\\?!…,:;»«\(\)]', # punctuation to split on
      tags_mentions: '[\w#@\d%$\u00B0]+', # runs of word chars, #, @, %, $ and the degree sign
      # Ruby regexps have no \UXXXXXXXX escape: "\U" is read as a literal "U",
      # which turned these classes into ASCII sets that swallowed "<" before
      # `hearts` could match. \u{...} is the supported code-point syntax.
      emoji_block0: '[\u{2600}-\u{27BF}]',
      emoji_block1: '[\u{1F300}-\u{1F64F}]',
      emoji_block2: '[\u{1F680}-\u{1F6FF}]',
      hearts: '<+/?3+', # <3
      other_punct: '[\u2014\u2013]', # literal em dash and en dash
      all_other: '[^\s]' # any other non-whitespace character, one at a time
    }

    Regexp.union(patterns.values.map { |source| Regexp.new(source) })
  end
end
.tokens(text) ⇒ Object
34 35 36 37 38 |
# File 'lib/rb_lib_text.rb', line 34
#
# Splits +text+ into an array of tokens.
#
# Before scanning, every ellipsis is normalized to exactly three ASCII dots:
# first the single-character ellipsis (U+2026), then any run of two or more
# dots. The normalized text is then scanned with the pattern returned by
# `or_pattern`.
#
# @param text [String] the text to tokenize
# @return [Array<String>] the matched tokens, in order of appearance
def self.tokens(text)
  normalized = text.gsub("\u2026", "...").gsub(/\.{2,}/, "...")
  normalized.scan(self.or_pattern)
end