Module: StuffClassifier::Tokenizer

Included in:
Base
Defined in:
lib/stuff-classifier/tokenizer.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Instance Attribute Details

#stemming=(value) ⇒ Object (writeonly)

Sets the attribute stemming

Parameters:

  • value

    the value to set the attribute stemming to.


4
5
6
# File 'lib/stuff-classifier/tokenizer.rb', line 4

# Turns stemming on or off for this tokenizer.
#
# The value is stored untouched in @stemming, which is what #stemming? reads.
#
# @param value [Object] truthy to make #each_word stem tokens, falsy otherwise
def stemming=(value)
  @stemming = value
end

Instance Method Details

#each_word(string) ⇒ Object


18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# File 'lib/stuff-classifier/tokenizer.rb', line 18

# Splits +string+ into lowercase word tokens, skipping ignored words.
#
# Apostrophes and backticks are deleted, every other run of characters outside
# a-z / A-Z acts as a separator, and tokens found in #ignore_words are dropped.
# When #stemming? is truthy each token is additionally passed through
# String#stem (not core Ruby; it has to be provided by whatever the host file
# loads) and the stemmed form is checked against #ignore_words once more.
#
# @param string [String] text to tokenize
# @yieldparam word [String] each surviving token; the block's result is
#   collected in place of the token
# @return [Array] the tokens (or block results) in order of appearance;
#   an empty Array when +string+ is empty or whitespace only
def each_word(string)
  text = string.strip
  # Blank input produces an empty list rather than nil, so callers can always
  # chain Array/Enumerable methods on the result.
  return [] if text.empty?

  words = []

  # Deleting quote characters first keeps contractions in one piece
  # ("don't" becomes "dont" instead of "don" + "t").
  text.gsub(/['`]/, '').split("\n").each do |line|
    line.gsub(/[^a-zA-Z]+/, ' ').split(/\s+/).each do |token|
      # A line starting with a separator makes split emit a leading "".
      lowered = token.downcase
      next if token.empty? || ignore_words.member?(lowered)

      if stemming?
        # Stem first, then lowercase; the stemmed form may itself be ignored.
        word = token.stem.downcase
        next if ignore_words.member?(word)
      else
        word = lowered
      end

      words << (block_given? ? yield(word) : word)
    end
  end

  words
end

#ignore_words ⇒ Object


10
11
12
# File 'lib/stuff-classifier/tokenizer.rb', line 10

# Words that #each_word leaves out of its result.
#
# @return [Object] the collection assigned through #ignore_words=, or
#   StuffClassifier::STOP_WORDS when nothing truthy has been assigned
def ignore_words
  return @ignore_words if @ignore_words

  StuffClassifier::STOP_WORDS
end

#ignore_words=(value) ⇒ Object


6
7
8
# File 'lib/stuff-classifier/tokenizer.rb', line 6

# Replaces the list of words that #each_word skips.
#
# The value is stored untouched in @ignore_words. #each_word queries it with
# member?, and #ignore_words falls back to StuffClassifier::STOP_WORDS while
# the stored value is nil or false.
#
# @param value [#member?, nil] the custom ignore list, or nil to use the default
def ignore_words=(value)
  @ignore_words = value
end

#stemming? ⇒ Boolean

Returns:

  • (Boolean)

14
15
16
# File 'lib/stuff-classifier/tokenizer.rb', line 14

# Tells whether #each_word should stem tokens.
#
# @return [Object] false while @stemming has never been assigned, otherwise
#   whatever was stored through #stemming= (returned as-is, not coerced)
def stemming?
  return false unless defined?(@stemming)

  @stemming
end