Class: Tokenizer

Inherits:
Object
  • Object
show all
Defined in:
lib/libsvm_preprocessor/tokenizer.rb

Instance Method Summary collapse

Constructor Details

#initialize(options = {}) ⇒ Tokenizer

Returns a new instance of Tokenizer.



7
8
9
10
11
12
13
14
# File 'lib/libsvm_preprocessor/tokenizer.rb', line 7

# Builds a tokenizer together with its stopword filter and stemmer.
#
# Missing (or nil/false) settings fall back to defaults. The settings are kept
# in a private copy, so the hash handed in by the caller is never modified and
# may be frozen.
#
# @param options [Hash, nil] tokenizer settings
# @option options [Boolean] :stopword (false) when truthy, #tokenize removes stopwords
# @option options [Boolean] :stemming (false) when truthy, #tokenize stems each token
# @option options [String] :lang ("it") language handed to the stopword filter and the stemmer
def initialize(options = {})
  # Copy first: the ||= defaults below would otherwise be written into the
  # caller's hash (and raise FrozenError if that hash is frozen).
  @options = (options || {}).dup
  @options[:stopword] ||= false
  @options[:stemming] ||= false
  @options[:lang]     ||= "it"
  @filter  = Stopwords::Snowball::Filter.new(@options[:lang])
  @stemmer = Lingua::Stemmer.new(language: @options[:lang])
end

Instance Method Details

#process_text(string) ⇒ Object



23
24
25
26
27
28
29
30
31
32
# File 'lib/libsvm_preprocessor/tokenizer.rb', line 23

# Normalizes raw text and splits it into lowercase, letters-only tokens.
#
# The text is lowercased and passed through Unicode.nfd; every character not
# matched by [[:alpha:]] (digits, punctuation, whitespace, ...) then acts as a
# separator, and the remaining runs of letters are returned.
#
# The caller's string is never modified, so frozen strings are accepted.
#
# @param string [String] text to tokenize
# @return [Array<String>] tokens in order of appearance; empty when the text
#   contains no letters
def process_text(string)
  # downcase rather than downcase!, so the argument is left untouched.
  normalized = Unicode.nfd(string.downcase)
  # Digits are non-alphabetic and are blanked here as well, so no separate
  # letter/digit boundary splitting is needed afterwards.
  letters_only = normalized.gsub(/[^[:alpha:]]/, ' ')
  # split(' ') ignores leading/trailing whitespace and collapses runs of it.
  letters_only.split(' ')
end

#remove_stopwords(ary) ⇒ Object

Remove stopwords according to the selected language



35
36
37
# File 'lib/libsvm_preprocessor/tokenizer.rb', line 35

# Runs the given terms through the stopword filter that #initialize built for
# the configured language.
#
# @param ary [Array<String>] terms to filter
# @return [Object] whatever the filter's #filter returns for +ary+
#   (presumably the terms without stopwords -- confirm against the filter's docs)
def remove_stopwords(ary)
  stopword_filter = @filter
  stopword_filter.filter(ary)
end

#stem_each(ary) ⇒ Object

Stem each word according to the selected language



40
41
42
# File 'lib/libsvm_preprocessor/tokenizer.rb', line 40

# Stems every term with the stemmer that #initialize built for the configured
# language.
#
# @param ary [Array<String>] terms to stem
# @return [Array] one stemmed entry per input term, in the same order
def stem_each(ary)
  stemmer = @stemmer
  ary.map do |word|
    stemmer.stem(word)
  end
end

#tokenize(string) ⇒ Object



16
17
18
19
20
21
# File 'lib/libsvm_preprocessor/tokenizer.rb', line 16

# Turns a string into tokens, applying the optional steps enabled in the
# tokenizer's options.
#
# The text is first split by #process_text; stopwords are then removed when
# the :stopword option is truthy, and the surviving tokens are stemmed when
# the :stemming option is truthy.
#
# @param string [String] text to tokenize
# @return [Object] the token list after the enabled steps
def tokenize(string)
  words = process_text(string)
  if @options[:stopword]
    words = remove_stopwords(words)
  end
  return words unless @options[:stemming]

  stem_each(words)
end