Class: Company::Mapping::BasicTokenizer

Inherits:
Object
  • Object
show all
Defined in:
lib/company/mapping/document_utils/basic_tokenizer.rb

Instance Method Summary collapse

Constructor Details

#initialize(ignorePunctuation = true, ignoreCase = true) ⇒ BasicTokenizer

Returns a new instance of BasicTokenizer.



7
8
9
# File 'lib/company/mapping/document_utils/basic_tokenizer.rb', line 7

# Builds a tokenizer and records its two behavior flags.
#
# @param ignorePunctuation [Boolean] stored as @doIgnorePunctuation; when
#   truthy, #tokenize leaves punctuation characters out of its result
# @param ignoreCase [Boolean] stored as @doIgnoreCase; not read by #tokenize
#   or #to_s logic directly -- presumably consumed by the text transformation
#   step defined elsewhere in this class (confirm)
def initialize(ignorePunctuation = true, ignoreCase = true)
  @doIgnorePunctuation = ignorePunctuation
  @doIgnoreCase = ignoreCase
end

Instance Method Details

#to_s ⇒ Object



11
12
13
# File 'lib/company/mapping/document_utils/basic_tokenizer.rb', line 11

# Describes this tokenizer and the current value of its two flags.
#
# @return [String] for example
#   "{BasicTokenizer: (IgnoresPunctuation: true, IgnoresCase: true)}"
def to_s
  punctuation_flag = @doIgnorePunctuation
  case_flag = @doIgnoreCase
  "{BasicTokenizer: (IgnoresPunctuation: #{punctuation_flag}, IgnoresCase: #{case_flag})}"
end

#tokenize(text) ⇒ Object



15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# File 'lib/company/mapping/document_utils/basic_tokenizer.rb', line 15

# Splits text into word tokens and, optionally, one-character punctuation
# tokens.
#
# The text is first passed through #tranform, which is defined elsewhere in
# this class (presumably where case handling is applied -- confirm; the
# spelling matches the existing method name and is kept on purpose).
#
# Tokenization rules:
# * whitespace (\s) separates tokens and is never emitted;
# * each maximal run of word characters (\w) becomes one token;
# * every other character becomes its own one-character token, unless
#   @doIgnorePunctuation is truthy, in which case it is dropped.
#
# @param text [String] the text to tokenize
# @return [Array<String>] the tokens, in order of appearance
def tokenize(text)
  text = tranform(text)
  # A single scan examines every character exactly once, so the character
  # directly after a word is treated like any other: "foo,bar" keeps its
  # comma when punctuation is wanted instead of silently losing it.
  pattern = @doIgnorePunctuation ? /\w+/ : /\w+|[^\w\s]/
  text.scan(pattern)
end