Class: Attentive::Tokenizer
- Inherits:
-
Object
- Object
- Attentive::Tokenizer
- Includes:
- Tokens
- Defined in:
- lib/attentive/tokenizer.rb
Instance Attribute Summary collapse
-
#chars ⇒ Object
readonly
Returns the value of attribute chars.
-
#message ⇒ Object
readonly
Returns the value of attribute message.
-
#options ⇒ Object
readonly
Returns the value of attribute options.
Class Method Summary collapse
- .tokenize(message, options = {}) ⇒ Object
Instance Method Summary collapse
-
#initialize(message, options = {}) ⇒ Tokenizer
constructor
A new instance of Tokenizer.
- #match_entities? ⇒ Boolean
- #match_regexps? ⇒ Boolean
- #perform_substitutions? ⇒ Boolean
- #tokenize ⇒ Object
Methods included from Tokens
#any_of, #emoji, #entity, #invocation, #punctuation, #regexp, #whitespace, #word
Constructor Details
#initialize(message, options = {}) ⇒ Tokenizer
Returns a new instance of Tokenizer.
18 19 20 21 22 |
# File 'lib/attentive/tokenizer.rb', line 18 def initialize(message, options={}) @message = message.downcase @chars = self.message.each_char.to_a @options = options end |
Instance Attribute Details
#chars ⇒ Object (readonly)
Returns the value of attribute chars.
10 11 12 |
# File 'lib/attentive/tokenizer.rb', line 10 def chars @chars end |
#message ⇒ Object (readonly)
Returns the value of attribute message.
10 11 12 |
# File 'lib/attentive/tokenizer.rb', line 10 def message @message end |
#options ⇒ Object (readonly)
Returns the value of attribute options.
10 11 12 |
# File 'lib/attentive/tokenizer.rb', line 10 def options @options end |
Class Method Details
.tokenize(message, options = {}) ⇒ Object
12 13 14 |
# File 'lib/attentive/tokenizer.rb', line 12 def self.tokenize(message, options={}) self.new(message, options).tokenize end |
Instance Method Details
#match_entities? ⇒ Boolean
24 25 26 |
# File 'lib/attentive/tokenizer.rb', line 24 def match_entities? options.fetch(:entities, false) end |
#match_regexps? ⇒ Boolean
28 29 30 |
# File 'lib/attentive/tokenizer.rb', line 28 def match_regexps? options.fetch(:regexps, false) end |
#perform_substitutions? ⇒ Boolean
32 33 34 |
# File 'lib/attentive/tokenizer.rb', line 32 def perform_substitutions? options.fetch(:substitutions, true) end |
#tokenize ⇒ Object
38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 |
# File 'lib/attentive/tokenizer.rb', line 38 def tokenize i = 0 @tokens = [] @leaves = [] while i < chars.length char = chars[i] char = CHARACTER_SUBSTITIONS.fetch(char, char) pos = tokens.any? ? tokens.last.end : 0 if WHITESPACE === char && string = match_whitespace_at(i) add_token whitespace(string, pos: pos) i += string.length elsif ENTITY_START === char && string = match_entity_at(i) add_token entity(string, pos: pos) i += string.length + 4 elsif NUMBER_START === char && string = match_number_at(i) add_token word(string, pos: pos) i += string.length elsif EMOJI_START === char && string = match_emoji_at(i) add_token emoji(string, pos: pos) i += string.length + 2 elsif REGEXP_START === char && string = match_regexp_at(i) add_token regexp(string, pos: pos) i += string.length elsif PUNCTUATION === char add_token punctuation(char, pos: pos) i += 1 else string = match_word_at(i) add_token word(string, pos: pos) i += string.length end end Attentive::Phrase.new(tokens) end |