Class: Tokenizer
- Inherits:
-
Object
- Object
- Tokenizer
- Defined in:
- lib/libsvm_preprocessor/tokenizer.rb
Instance Method Summary
-
#initialize(options = {}) ⇒ Tokenizer
constructor
A new instance of Tokenizer.
- #process_text(string) ⇒ Object
-
#remove_stopwords(ary) ⇒ Object
Remove stopwords according to the selected language.
-
#stem_each(ary) ⇒ Object
Stem each word according to the selected language.
- #tokenize(string) ⇒ Object
Constructor Details
#initialize(options = {}) ⇒ Tokenizer
Returns a new instance of Tokenizer.
7 8 9 10 11 12 13 14 |
# File 'lib/libsvm_preprocessor/tokenizer.rb', line 7
#
# Builds a tokenizer together with the stop-word filter and the stemmer it
# delegates to.
#
# @param options [Hash] configuration; the keys read here are:
#   :stopword - kept as given when truthy, otherwise stored as false
#   :stemming - kept as given when truthy, otherwise stored as false
#   :lang     - language passed to the filter and the stemmer, "it" when absent
# @return [Tokenizer] a new instance of Tokenizer
def initialize(options = {})
  # Fill in the defaults on a copy so the caller's hash is never modified
  # (and a frozen hash can be passed in).
  @options = options.dup
  @options[:stopword] ||= false
  @options[:stemming] ||= false
  @options[:lang] ||= "it"
  @filter = Stopwords::Snowball::Filter.new(@options[:lang])
  @stemmer = Lingua::Stemmer.new(language: @options[:lang])
end
Instance Method Details
#process_text(string) ⇒ Object
23 24 25 26 27 28 29 30 31 32 |
# File 'lib/libsvm_preprocessor/tokenizer.rb', line 23
#
# Splits +string+ into lowercase, purely alphabetic tokens.
#
# The text is lowercased and NFD-normalised, every character that does not
# match [[:alpha:]] is replaced by a space, and the result is split on
# whitespace.
#
# @param string [String] text to tokenize; it is not modified
# @return [Array<String>] tokens in order of appearance, empty when none remain
def process_text(string)
  # Non-destructive downcase: the caller's string is left untouched, so frozen
  # strings are accepted as well.
  text = string.downcase.unicode_normalize(:nfd)
  # Digits do not match [[:alpha:]] and are blanked out here too, so no
  # separate letter/digit boundary handling is needed afterwards.
  text = text.gsub(/[^[:alpha:]]/, ' ')
  # split(' ') ignores leading/trailing whitespace and collapses runs of it,
  # which makes an explicit squeeze/strip unnecessary.
  text.split(' ')
end
#remove_stopwords(ary) ⇒ Object
Remove stopwords according to the selected language
35 36 37 |
# File 'lib/libsvm_preprocessor/tokenizer.rb', line 35
#
# Remove stopwords according to the selected language.
#
# Delegates to the filter object held in @filter and returns its result.
#
# @param ary [Array] tokens to filter
# @return [Object] whatever the filter's #filter returns for +ary+
def remove_stopwords(ary)
  stopword_filter = @filter
  stopword_filter.filter(ary)
end
#stem_each(ary) ⇒ Object
Stem each word according to the selected language
40 41 42 |
# File 'lib/libsvm_preprocessor/tokenizer.rb', line 40
#
# Stem each word according to the selected language.
#
# Every element of +ary+ is passed to the stemmer held in @stemmer; the
# results are collected in the same order.
#
# @param ary [Array] words to stem
# @return [Array] one stemmed entry per input element
def stem_each(ary)
  stemmer = @stemmer
  ary.map do |word|
    stemmer.stem(word)
  end
end
#tokenize(string) ⇒ Object
16 17 18 19 20 21 |
# File 'lib/libsvm_preprocessor/tokenizer.rb', line 16
#
# Turns +string+ into a list of tokens.
#
# The text first goes through #process_text; stop-word removal and stemming
# are then applied, in that order, when the matching option is truthy.
#
# @param string [String] text to tokenize
# @return [Object] the tokens produced by the last stage that ran
def tokenize(string)
  # Option flag => method applied when that flag is set (insertion order).
  stages = { stopword: :remove_stopwords, stemming: :stem_each }
  stages.reduce(process_text(string)) do |tokens, (flag, stage)|
    @options[flag] ? send(stage, tokens) : tokens
  end
end