Class: TwitterCldr::Tokenizers::Tokenizer

Inherits: Object

Defined in: lib/twitter_cldr/tokenizers/tokenizer.rb

Instance Attribute Summary

  #custom_splitter ⇒ Object (readonly)
  #recognizers ⇒ Object (readonly)
  #remove_empty_entries ⇒ Object (readonly)

Class Method Summary

  .union(*tokenizers) ⇒ Object

Instance Method Summary

  #initialize(recognizers, splitter = nil, remove_empty_entries = true) ⇒ Tokenizer constructor
  #insert_before(token_type, *new_recognizers) ⇒ Object
  #recognizer_at(token_type) ⇒ Object
  #tokenize(text) ⇒ Object

Constructor Details

#initialize(recognizers, splitter = nil, remove_empty_entries = true) ⇒ Tokenizer

Returns a new instance of Tokenizer.



# File 'lib/twitter_cldr/tokenizers/tokenizer.rb', line 59

def initialize(recognizers, splitter = nil, remove_empty_entries = true)
  @recognizers = recognizers
  @custom_splitter = splitter
  @remove_empty_entries = remove_empty_entries
end
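
The constructor simply stores the ordered recognizer list, the optional splitting regexp, and the flag that controls whether empty tokens are dropped. Below is a minimal construction sketch; SimpleRecognizer is a hypothetical duck type (not part of this library) that provides the #token_type, #recognizes?, and #clean methods #tokenize calls on each recognizer:

# Hypothetical recognizer duck type, used only for illustration.
SimpleRecognizer = Struct.new(:token_type, :regex) do
  def recognizes?(text)
    !!(regex =~ text)   # does this fragment belong to us?
  end

  def clean(text)
    text                # no cleanup in this sketch
  end
end

word  = SimpleRecognizer.new(:word, /\w+/)
space = SimpleRecognizer.new(:space, /\s+/)

tokenizer = TwitterCldr::Tokenizers::Tokenizer.new(
  [word, space],  # recognizers, consulted in order
  /(\s+)/         # custom splitter; the capture group keeps the separators
)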

Instance Attribute Details

#custom_splitter ⇒ Object (readonly)

Returns the value of attribute custom_splitter.



# File 'lib/twitter_cldr/tokenizers/tokenizer.rb', line 36

def custom_splitter
  @custom_splitter
end

#recognizers ⇒ Object (readonly)

Returns the value of attribute recognizers.



# File 'lib/twitter_cldr/tokenizers/tokenizer.rb', line 36

def recognizers
  @recognizers
end

#remove_empty_entries ⇒ Object (readonly)

Returns the value of attribute remove_empty_entries.



# File 'lib/twitter_cldr/tokenizers/tokenizer.rb', line 36

def remove_empty_entries
  @remove_empty_entries
end

Class Method Details

.union(*tokenizers) ⇒ Object



# File 'lib/twitter_cldr/tokenizers/tokenizer.rb', line 38

def self.union(*tokenizers)
  recognizers = tokenizers.inject([]) do |ret, tokenizer|
    ret + tokenizer.recognizers.inject([]) do |recog_ret, recognizer|
      if (block_given? && yield(recognizer)) || !block_given?
        recog_ret << recognizer
      end
      recog_ret
    end
  end

  splitter = if tokenizers.all?(&:custom_splitter)
    Regexp.compile(
      tokenizers.map do |tokenizer|
        tokenizer.custom_splitter.source
      end.join("|")
    )
  end

  new(recognizers, splitter)
end
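
.union concatenates the recognizer lists of the given tokenizers, keeping only the recognizers for which the optional block returns a truthy value, and, when every tokenizer has a custom splitter, joins those splitters into a single alternation regexp. A hedged usage sketch, assuming tokenizer_a and tokenizer_b are Tokenizer instances built as in the constructor example above (names are illustrative):

# Combine two tokenizers, dropping :space recognizers via the filter block.
combined = TwitterCldr::Tokenizers::Tokenizer.union(tokenizer_a, tokenizer_b) do |recognizer|
  recognizer.token_type != :space
end

combined.recognizers.map(&:token_type)  # => e.g. [:word, :number]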

Instance Method Details

#insert_before(token_type, *new_recognizers) ⇒ Object



# File 'lib/twitter_cldr/tokenizers/tokenizer.rb', line 69

def insert_before(token_type, *new_recognizers)
  idx = recognizers.find_index { |rec| rec.token_type == token_type }
  recognizers.insert(idx, *new_recognizers)
  clear_splitter
  nil
end
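
#insert_before finds the first recognizer registered for token_type and splices the new recognizers in ahead of it (it assumes a recognizer with that token type exists), then clears the cached splitter so it is rebuilt on the next #tokenize call. Continuing the hypothetical setup from the constructor example:

# Make a :number recognizer take precedence over the :word recognizer.
number = SimpleRecognizer.new(:number, /\d+/)
tokenizer.insert_before(:word, number)

tokenizer.recognizers.map(&:token_type)  # => [:number, :word, :space]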

#recognizer_at(token_type) ⇒ Object



# File 'lib/twitter_cldr/tokenizers/tokenizer.rb', line 65

def recognizer_at(token_type)
  recognizers.find { |r| r.token_type == token_type }
end
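
#recognizer_at returns the first recognizer registered for the given token type, or nil if there is none. With the hypothetical tokenizer from the constructor example:

tokenizer.recognizer_at(:word)   # => the SimpleRecognizer for :word
tokenizer.recognizer_at(:emoji)  # => nil, since no recognizer has that token type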

#tokenize(text) ⇒ Object



# File 'lib/twitter_cldr/tokenizers/tokenizer.rb', line 76

def tokenize(text)
  text.split(splitter).inject([]) do |ret, token_text|
    recognizer = recognizers.find do |recognizer|
      recognizer.recognizes?(token_text)
    end

    if recognizer
      if recognizer.token_type == :composite
        content = token_text.match(recognizer.content)[1]
        ret << CompositeToken.new(tokenize(content))
      else
        cleaned_text = recognizer.clean(token_text)

        if (remove_empty_entries && cleaned_text.size > 0) || !remove_empty_entries
          ret << Token.new(
            value: cleaned_text,
            type: recognizer.token_type
          )
        end
      end
    end

    ret
  end
end
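
#tokenize splits the text with the (private) splitter, asks each recognizer in turn whether it recognizes the fragment, and collects Token objects; fragments matched by a :composite recognizer are re-tokenized and wrapped in a CompositeToken. When remove_empty_entries is true, fragments whose cleaned text is empty are dropped. A hedged sketch, continuing the hypothetical setup from the constructor example and assuming the private splitter method honours the custom splitter passed to the constructor and that Token exposes value and type readers:

tokens = tokenizer.tokenize("hello world")
tokens.map { |t| [t.type, t.value] }
# => [[:word, "hello"], [:space, " "], [:word, "world"]]
# (exact output depends on the recognizers and splitter supplied)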