Class: Attentive::Tokenizer
- Inherits:
-
Object
- Object
- Attentive::Tokenizer
show all
- Includes:
- Tokens
- Defined in:
- lib/attentive/tokenizer.rb
Instance Attribute Summary collapse
Class Method Summary
collapse
Instance Method Summary
collapse
Methods included from Tokens
#any_of, #emoji, #entity, #invocation, #punctuation, #regexp, #whitespace, #word
Constructor Details
#initialize(message, options = {}) ⇒ Tokenizer
Returns a new instance of Tokenizer.
18
19
20
21
22
|
# File 'lib/attentive/tokenizer.rb', line 18
# Builds a tokenizer for a single message.
#
# The message is lower-cased once up front; the per-character array is
# derived from that lower-cased copy (read back through the +message+
# reader, since the +message+ parameter shadows it here).
#
# @param message [#downcase] the raw text to tokenize
# @param options [Hash] flags read later through the predicate helpers
def initialize(message, options={})
  @options = options
  @message = message.downcase
  @chars = self.message.chars
end
|
Instance Attribute Details
#chars ⇒ Object
Returns the value of attribute chars.
10
11
12
|
# File 'lib/attentive/tokenizer.rb', line 10
# Reader for the +@chars+ instance variable, which the constructor fills
# with the individual characters of the lower-cased message.
#
# @return [Object] the current value of +@chars+
def chars
  @chars
end
|
#message ⇒ Object
Returns the value of attribute message.
10
11
12
|
# File 'lib/attentive/tokenizer.rb', line 10
# Reader for the +@message+ instance variable, which the constructor sets
# to the lower-cased input text.
#
# @return [Object] the current value of +@message+
def message
  @message
end
|
#options ⇒ Object
Returns the value of attribute options.
10
11
12
|
# File 'lib/attentive/tokenizer.rb', line 10
# Reader for the +@options+ instance variable, i.e. the options object
# handed to the constructor.
#
# @return [Object] the current value of +@options+
def options
  @options
end
|
Class Method Details
.tokenize(message, options = {}) ⇒ Object
12
13
14
|
# File 'lib/attentive/tokenizer.rb', line 12
# Convenience entry point: instantiates a tokenizer for +message+ and
# immediately runs its instance-level +tokenize+.
#
# @param message [Object] forwarded unchanged to the constructor
# @param options [Hash] forwarded unchanged to the constructor
# @return [Object] whatever the instance-level +tokenize+ returns
def self.tokenize(message, options={})
  tokenizer = new(message, options)
  tokenizer.tokenize
end
|
Instance Method Details
#fail_if_ambiguous? ⇒ Boolean
36
37
38
|
# File 'lib/attentive/tokenizer.rb', line 36
# Whether tokenizing should run the ambiguity check.
#
# Reads the +:ambiguous+ option (treated as +true+ when absent) and
# returns its negation, so the check is only requested when the option
# is explicitly falsy.
#
# @return [Boolean]
def fail_if_ambiguous?
  ambiguity_allowed = options.fetch(:ambiguous) { true }
  !ambiguity_allowed
end
|
#match_entities? ⇒ Boolean
24
25
26
|
# File 'lib/attentive/tokenizer.rb', line 24
# Value of the +:entities+ option; +false+ when the option is absent.
#
# @return [Boolean] (or whatever value the caller stored under +:entities+)
def match_entities?
  options.fetch(:entities) { false }
end
|
#match_regexps? ⇒ Boolean
28
29
30
|
# File 'lib/attentive/tokenizer.rb', line 28
# Value of the +:regexps+ option; +false+ when the option is absent.
#
# @return [Boolean] (or whatever value the caller stored under +:regexps+)
def match_regexps?
  options.fetch(:regexps) { false }
end
|
#perform_substitutions? ⇒ Boolean
32
33
34
|
# File 'lib/attentive/tokenizer.rb', line 32
# Value of the +:substitutions+ option; +true+ when the option is absent.
#
# @return [Boolean] (or whatever value the caller stored under +:substitutions+)
def perform_substitutions?
  options.fetch(:substitutions) { true }
end
|
#tokenize ⇒ Object
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
|
# File 'lib/attentive/tokenizer.rb', line 42
# Walks the message's characters from left to right and turns them into
# tokens, then wraps the collected tokens in an Attentive::Phrase.
#
# On every iteration the current character (after lookup in
# CHARACTER_SUBSTITIONS) selects the first matching branch, in this order:
# whitespace, entity, number, emoji, regexp, punctuation, and finally a
# plain word as the fallback. A branch whose +match_*_at+ helper returns a
# falsy value is skipped and the next branch is tried.
#
# When +fail_if_ambiguous?+ is true, +fail_if_ambiguous!+ is invoked with
# the message and the collected tokens before the phrase is built.
#
# @return [Attentive::Phrase]
def tokenize
  @tokens = []
  @leaves = []
  cursor = 0

  while cursor < chars.length
    raw = chars[cursor]
    char = CHARACTER_SUBSTITIONS.fetch(raw, raw)
    # A new token starts where the previous token ended (0 for the first).
    pos = tokens.any? ? tokens.last.end : 0

    # Each branch records its token and yields how far the cursor moves.
    step =
      if WHITESPACE === char && (text = match_whitespace_at(cursor))
        add_token whitespace(text, pos: pos)
        text.length
      elsif ENTITY_START === char && (text = match_entity_at(cursor))
        add_token entity(text, pos: pos)
        # NOTE(review): the extra 4 presumably skips the delimiters around
        # the entity name — confirm against match_entity_at.
        text.length + 4
      elsif NUMBER_START === char && (text = match_number_at(cursor))
        # Numbers are recorded as ordinary word tokens.
        add_token word(text, pos: pos)
        text.length
      elsif EMOJI_START === char && (text = match_emoji_at(cursor))
        add_token emoji(text, pos: pos)
        # NOTE(review): the extra 2 presumably skips the delimiters around
        # the emoji name — confirm against match_emoji_at.
        text.length + 2
      elsif REGEXP_START === char && (text = match_regexp_at(cursor))
        add_token regexp(text, pos: pos)
        text.length
      elsif PUNCTUATION === char
        # Uses the substituted character, not the raw one.
        add_token punctuation(char, pos: pos)
        1
      else
        text = match_word_at(cursor)
        add_token word(text, pos: pos)
        text.length
      end

    cursor += step
  end

  fail_if_ambiguous!(message, tokens) if fail_if_ambiguous?
  Attentive::Phrase.new(tokens)
end
|