Class: Attentive::Tokenizer

Inherits:
Object
  • Object
show all
Includes:
Tokens
Defined in:
lib/attentive/tokenizer.rb

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from Tokens

#any_of, #emoji, #entity, #invocation, #punctuation, #regexp, #whitespace, #word

Constructor Details

#initialize(message, options = {}) ⇒ Tokenizer

Returns a new instance of Tokenizer.



18
19
20
21
22
# File 'lib/attentive/tokenizer.rb', line 18

# Builds a tokenizer for +message+.
#
# The message is lower-cased once; the stored message and the array of
# its individual characters are both taken from that lower-cased copy.
#
# @param message [String] text to tokenize (must respond to #downcase)
# @param options [Hash] tokenizer settings, stored exactly as given
def initialize(message, options={})
  lowered = message.downcase
  @message = lowered
  @chars = lowered.chars
  @options = options
end

Instance Attribute Details

#chars ⇒ Object (readonly)

Returns the value of attribute chars.



10
11
12
# File 'lib/attentive/tokenizer.rb', line 10

# Reader for the @chars instance variable.
#
# @return [Object] whatever is currently stored in @chars
def chars; @chars; end

#message ⇒ Object (readonly)

Returns the value of attribute message.



10
11
12
# File 'lib/attentive/tokenizer.rb', line 10

# Reader for the @message instance variable.
#
# @return [Object] whatever is currently stored in @message
def message; @message; end

#options ⇒ Object (readonly)

Returns the value of attribute options.



10
11
12
# File 'lib/attentive/tokenizer.rb', line 10

# Reader for the @options instance variable.
#
# @return [Object] whatever is currently stored in @options
def options; @options; end

Class Method Details

.tokenize(message, options = {}) ⇒ Object



12
13
14
# File 'lib/attentive/tokenizer.rb', line 12

# Convenience entry point: constructs an instance with the given
# arguments and immediately calls #tokenize on it.
#
# @param message [Object] forwarded unchanged to .new
# @param options [Hash] forwarded unchanged to .new
# @return [Object] the result of the instance's #tokenize
def self.tokenize(message, options={})
  tokenizer = new(message, options)
  tokenizer.tokenize
end

Instance Method Details

#match_entities? ⇒ Boolean

Returns:

  • (Boolean)


24
25
26
# File 'lib/attentive/tokenizer.rb', line 24

# Reads the :entities setting from the options.
#
# @return [Object] the value stored under :entities, or false when
#   the key is absent
def match_entities?
  options.fetch(:entities) { false }
end

#match_regexps? ⇒ Boolean

Returns:

  • (Boolean)


28
29
30
# File 'lib/attentive/tokenizer.rb', line 28

# Reads the :regexps setting from the options.
#
# @return [Object] the value stored under :regexps, or false when
#   the key is absent
def match_regexps?
  options.fetch(:regexps) { false }
end

#perform_substitutions? ⇒ Boolean

Returns:

  • (Boolean)


32
33
34
# File 'lib/attentive/tokenizer.rb', line 32

# Reads the :substitutions setting from the options.
#
# @return [Object] the value stored under :substitutions, or true when
#   the key is absent
def perform_substitutions?
  options.fetch(:substitutions) { true }
end

#tokenize ⇒ Object



38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# File 'lib/attentive/tokenizer.rb', line 38

# Scans +chars+ from left to right, turning the input into tokens and
# returning them wrapped in an Attentive::Phrase.
#
# On every pass the current character is first mapped through
# CHARACTER_SUBSTITIONS, then classified by trying, in this order:
# whitespace, entity, number, emoji, regexp, punctuation, and finally a
# plain word. A "start" character only selects its branch when the
# corresponding match_*_at helper also returns a truthy string; otherwise
# the next branch is tried.
#
# Resets @tokens and @leaves to empty arrays before scanning.
#
# @return [Attentive::Phrase] phrase built from the collected tokens
def tokenize
  @tokens = []
  @leaves = []
  cursor = 0

  until cursor >= chars.length
    current = chars[cursor]
    current = CHARACTER_SUBSTITIONS.fetch(current, current)

    # Each new token starts where the previous token ended.
    offset = tokens.any? ? tokens.last.end : 0

    # Every branch records its token and yields how many characters of
    # input it consumed.
    consumed =
      if WHITESPACE === current && (text = match_whitespace_at(cursor))
        add_token whitespace(text, pos: offset)
        text.length
      elsif ENTITY_START === current && (text = match_entity_at(cursor))
        add_token entity(text, pos: offset)
        # NOTE(review): the extra 4 presumably covers entity delimiters
        # that match_entity_at leaves out of its result — confirm there.
        text.length + 4
      elsif NUMBER_START === current && (text = match_number_at(cursor))
        # Numbers are recorded as word tokens.
        add_token word(text, pos: offset)
        text.length
      elsif EMOJI_START === current && (text = match_emoji_at(cursor))
        add_token emoji(text, pos: offset)
        # NOTE(review): the extra 2 presumably covers emoji delimiters
        # that match_emoji_at leaves out of its result — confirm there.
        text.length + 2
      elsif REGEXP_START === current && (text = match_regexp_at(cursor))
        add_token regexp(text, pos: offset)
        text.length
      elsif PUNCTUATION === current
        # Uses the substituted character, not the raw one from chars.
        add_token punctuation(current, pos: offset)
        1
      else
        text = match_word_at(cursor)
        add_token word(text, pos: offset)
        text.length
      end

    cursor += consumed
  end

  Attentive::Phrase.new(tokens)
end