Class: Company::Mapping::BasicTokenizer
- Inherits:
-
Object
- Object
- Company::Mapping::BasicTokenizer
- Defined in:
- lib/company/mapping/document_utils/basic_tokenizer.rb
Instance Method Summary collapse
-
#initialize(ignorePunctuation = true, ignoreCase = true) ⇒ BasicTokenizer
constructor
A new instance of BasicTokenizer.
- #to_s ⇒ Object
- #tokenize(text) ⇒ Object
Constructor Details
#initialize(ignorePunctuation = true, ignoreCase = true) ⇒ BasicTokenizer
Returns a new instance of BasicTokenizer.
7 8 9 |
# File 'lib/company/mapping/document_utils/basic_tokenizer.rb', line 7
#
# Creates a tokenizer and records its two behaviour switches.
#
# @param ignorePunctuation [Boolean] stored in @doIgnorePunctuation; when
#   truthy, #tokenize does not emit punctuation tokens.
# @param ignoreCase [Boolean] stored in @doIgnoreCase.
#   NOTE(review): this flag is not read by any method visible in this
#   listing; presumably the text-normalisation helper consumes it -- confirm.
# @return [BasicTokenizer] a new instance of BasicTokenizer
def initialize(ignorePunctuation = true, ignoreCase = true)
  @doIgnorePunctuation = ignorePunctuation
  @doIgnoreCase = ignoreCase
end
Instance Method Details
#to_s ⇒ Object
11 12 13 |
# File 'lib/company/mapping/document_utils/basic_tokenizer.rb', line 11
#
# Describes the tokenizer and the current values of its two switches.
#
# @return [String] text of the form
#   "{BasicTokenizer: (IgnoresPunctuation: <flag>, IgnoresCase: <flag>)}"
def to_s
  "{BasicTokenizer: (IgnoresPunctuation: #{@doIgnorePunctuation}, IgnoresCase: #{@doIgnoreCase})}"
end
#tokenize(text) ⇒ Object
15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 |
# File 'lib/company/mapping/document_utils/basic_tokenizer.rb', line 15
#
# Splits +text+ into an array of tokens.
#
# A token is either a maximal run of word characters (+\w+) or, when
# @doIgnorePunctuation is falsy, a single character that is neither a word
# character nor whitespace. Whitespace never produces a token.
#
# The previous hand-written index loop advanced one extra position after
# each word, so the character directly following a word was skipped without
# being looked at; punctuation such as the comma in "Hello, world" was lost
# even when punctuation was meant to be kept. Scanning with a regexp
# examines every character exactly once.
#
# @param text [String] the raw text to tokenize
# @return [Array<String>] the tokens in order of appearance
def tokenize(text)
  # NOTE(review): `tranform` (sic) is defined outside this listing, so the
  # spelling is kept as-is; presumably it normalises the text (e.g. case
  # folding) and returns a String -- confirm against its definition.
  text = tranform(text)

  token_pattern = @doIgnorePunctuation ? /\w+/ : /\w+|[^\w\s]/
  text.scan(token_pattern)
end