Class: TwitterCldr::Tokenizers::NumberTokenizer

Inherits:
Object
  • Object
show all
Defined in:
lib/twitter_cldr/tokenizers/numbers/number_tokenizer.rb

Constant Summary collapse

# Maps each special number-pattern character to a brace-delimited placeholder.
# #tokenize substitutes the placeholder for a single-quoted occurrence of the
# character before tokenizing, then restores the bare character afterwards.
# The currency key is '¤' (U+00A4, CURRENCY SIGN).
SPECIAL_SYMBOLS_MAP = {
  '.' => '{DOT}',
  ',' => '{COMMA}',
  '0' => '{ZERO}',
  '#' => '{POUND}',
  '¤' => '{CURRENCY}',
  '%' => '{PERCENT}',
  'E' => '{SCIENTIFIC}'
}.freeze

# Matches one special symbol wrapped in single quotes, e.g. '.' or '#'
# (the quotes are part of the match).
SPECIAL_SYMBOLS_REGEX =
  /'(?:#{SPECIAL_SYMBOLS_MAP.keys.map { |s| Regexp.escape(s) }.join('|')})'/

# Placeholder => original character, used to undo the substitution.
INVERSE_SPECIAL_SYMBOLS_MAP = SPECIAL_SYMBOLS_MAP.invert.freeze

# Matches any one of the placeholders, e.g. {DOT}.
INVERSE_SPECIAL_SYMBOLS_REGEX =
  /#{INVERSE_SPECIAL_SYMBOLS_MAP.keys.map { |s| Regexp.escape(s) }.join('|')}/

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(data_reader) ⇒ NumberTokenizer

Returns a new instance of NumberTokenizer.



28
29
30
# File 'lib/twitter_cldr/tokenizers/numbers/number_tokenizer.rb', line 28

# Creates a tokenizer bound to the given data reader.
#
# @param data_reader the reader object to keep; it is stored in
#   @data_reader and not otherwise inspected here.
def initialize(data_reader)
  @data_reader = data_reader
end

Instance Attribute Details

#data_reader ⇒ Object (readonly)

Returns the value of attribute data_reader.



26
27
28
# File 'lib/twitter_cldr/tokenizers/numbers/number_tokenizer.rb', line 26

# Returns the data reader that was passed to the constructor.
def data_reader
  @data_reader
end

Instance Method Details

#tokenize(pattern) ⇒ Object



32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# File 'lib/twitter_cldr/tokenizers/numbers/number_tokenizer.rb', line 32

# Tokenizes a number pattern string.
#
# Single-quoted special symbols (e.g. '.') are first swapped for their
# placeholders from SPECIAL_SYMBOLS_MAP, the result is handed to
# PatternTokenizer, and the placeholders in each resulting token are then
# turned back into the bare symbol (without the quotes).
#
# @param pattern [String] the number pattern to tokenize
# @return the tokens produced by PatternTokenizer, with token values
#   updated in place and a leading empty-valued token removed; an empty
#   token list is returned unchanged
def tokenize(pattern)
  # match includes the surrounding quotes; [1..-2] strips them for the lookup.
  escaped_pattern = pattern.gsub(SPECIAL_SYMBOLS_REGEX) do |match|
    SPECIAL_SYMBOLS_MAP[match[1..-2]]
  end

  tokens = PatternTokenizer.new(data_reader, tokenizer).tokenize(escaped_pattern)

  tokens.each do |token|
    token.value = token.value.gsub(INVERSE_SPECIAL_SYMBOLS_REGEX) do |match|
      INVERSE_SPECIAL_SYMBOLS_MAP[match]
    end
  end

  # Guard against an empty token list: tokens.first is nil there, and calling
  # #value on it would raise NoMethodError.
  first_token = tokens.first

  if first_token && first_token.value == ""
    tokens[1..-1]
  else
    tokens
  end
end