Class: StuffClassifier::Base

Inherits:
Object
  • Object
show all
Includes:
Tokenizer
Defined in:
lib/stuff-classifier/base.rb

Direct Known Subclasses

Bayes, TfIdf

Class Attribute Summary collapse

Instance Attribute Summary collapse

Attributes included from Tokenizer

#stemming

Class Method Summary collapse

Instance Method Summary collapse

Methods included from Tokenizer

#each_word, #ignore_words, #ignore_words=, #stemming?

Constructor Details

#initialize(name, opts = {}) ⇒ Base


5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
# File 'lib/stuff-classifier/base.rb', line 5

# Builds a classifier called +name+ with empty word and category counters,
# then asks the storage backend to either restore or discard saved state.
#
# @param name [Object] identifier kept in @name
# @param opts [Hash] optional settings:
#   :stemming    - kept in @stemming; true when the key is absent
#   :purge_state - when truthy the storage purges state instead of loading it
#   :storage     - storage backend; falls back to StuffClassifier::Base.storage
def initialize(name, opts={})
  @name = name
  @wcount = {}
  @ccount = {}
  @ignore_words = nil

  # fetch only substitutes true when the key is missing, so an explicit
  # false (or nil) passed by the caller is preserved.
  @stemming = opts.fetch(:stemming, true)
  @storage = opts[:storage] || StuffClassifier::Base.storage

  if opts[:purge_state]
    @storage.purge_state(self)
  else
    @storage.load_state(self)
  end
end

Class Attribute Details

.storage ⇒ Object


84
85
86
87
# File 'lib/stuff-classifier/base.rb', line 84

# Returns the class-level storage backend, creating an in-memory one the
# first time it is requested.
#
# `defined?` is used instead of `||=` on purpose: once @storage has been
# assigned (even to nil) that value is returned untouched.
def storage
  return @storage if defined?(@storage)

  @storage = StuffClassifier::InMemoryStorage.new
end

Instance Attribute Details

#name ⇒ Object (readonly)

Returns the value of attribute name


3
4
5
# File 'lib/stuff-classifier/base.rb', line 3

# Reader for the name this classifier was constructed with.
#
# @return [Object] the value held in @name
def name
  instance_variable_get(:@name)
end

Class Method Details

.open(name) ⇒ Object


89
90
91
92
93
94
95
96
97
# File 'lib/stuff-classifier/base.rb', line 89

# Instantiates a classifier called +name+.
#
# Without a block the new instance is returned. With a block the instance is
# yielded, its state is saved afterwards, and the result of save_state is
# returned.
#
# @param name [Object] passed straight to the constructor
def open(name)
  classifier = new(name)
  return classifier unless block_given?

  yield classifier
  classifier.save_state
end

Instance Method Details

#cat_count(category) ⇒ Object


38
39
40
# File 'lib/stuff-classifier/base.rb', line 38

# Number of times +category+ has been counted, as a Float.
#
# @param category [Object] key into the category counter
# @return [Float] the stored count, or 0.0 when nothing is stored
def cat_count(category)
  stored = @ccount[category]
  return 0.0 unless stored

  stored.to_f
end

#categories ⇒ Object


46
47
48
# File 'lib/stuff-classifier/base.rb', line 46

# Lists every category that has an entry in the category counter.
#
# @return [Array] the keys of @ccount, in insertion order
def categories
  @ccount.each_key.to_a
end

#incr_cat(category) ⇒ Object


28
29
30
31
# File 'lib/stuff-classifier/base.rb', line 28

# Adds one to the counter for +category+, starting from zero when the
# category has not been counted before.
#
# @param category [Object] key into the category counter
# @return [Integer] the updated count
def incr_cat(category)
  @ccount[category] = (@ccount[category] || 0) + 1
end

#incr_word(word, category) ⇒ Object


22
23
24
25
26
# File 'lib/stuff-classifier/base.rb', line 22

# Adds one to the counter kept for +word+ within +category+, creating the
# per-word hash and the per-category entry on first use.
#
# @param word [Object] key into the word counter
# @param category [Object] key into that word's per-category counts
# @return [Integer] the updated count
def incr_word(word, category)
  per_category = (@wcount[word] ||= {})
  per_category[category] = (per_category[category] || 0) + 1
end

#save_state ⇒ Object


77
78
79
# File 'lib/stuff-classifier/base.rb', line 77

# Hands this classifier to its storage backend for saving.
#
# @return [Object] whatever the backend's save_state returns
def save_state
  backend = @storage
  backend.save_state(self)
end

#total_count ⇒ Object


42
43
44
# File 'lib/stuff-classifier/base.rb', line 42

# Sum of all per-category counts, as a Float.
#
# @return [Float] total of the values in @ccount (0.0 when it is empty)
def total_count
  running = 0
  @ccount.each_value { |count| running += count }
  running.to_f
end

#train(category, text) ⇒ Object


50
51
52
53
# File 'lib/stuff-classifier/base.rb', line 50

# Records one training sample: every word yielded by each_word for +text+
# is counted under +category+, then the category counter is incremented.
#
# @param category [Object] category the sample belongs to
# @param text [Object] passed to each_word for tokenizing
# @return [Object] the result of incr_cat
def train(category, text)
  each_word(text) do |token|
    incr_word(token, category)
  end

  incr_cat(category)
end

#word_count(word, category) ⇒ Object


33
34
35
36
# File 'lib/stuff-classifier/base.rb', line 33

# Number of times +word+ has been counted under +category+, as a Float.
#
# @param word [Object] key into the word counter
# @param category [Object] key into that word's per-category counts
# @return [Float] the stored count, or 0.0 when either level is missing
def word_count(word, category)
  per_category = @wcount[word]
  stored = per_category && per_category[category]

  stored ? stored.to_f : 0.0
end

#word_prob(word, cat) ⇒ Object


55
56
57
58
# File 'lib/stuff-classifier/base.rb', line 55

# Ratio of the word's count in +cat+ to the count of +cat+ itself.
#
# @param word [Object] word to look up
# @param cat [Object] category to look up
# @return [Float] word_count / cat_count, or 0.0 when cat_count is zero
def word_prob(word, cat)
  in_category = cat_count(cat)

  # Guard against dividing by zero for a category with no count.
  if in_category == 0
    0.0
  else
    word_count(word, cat) / in_category
  end
end

#word_weighted_average(word, cat, opts = {}) ⇒ Object


60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# File 'lib/stuff-classifier/base.rb', line 60

# Blends an assumed probability with the observed probability of +word+ in
# +cat+, weighting the observed value by how often the word has been
# counted across all categories.
#
# @param word [Object] word to score
# @param cat [Object] category to score against
# @param opts [Hash] optional settings:
#   :func         - callable taking (word, cat) used in place of word_prob
#   :weight       - weight given to the assumed probability (default 1.0)
#   :assumed_prob - probability assumed before any evidence (default 0.5)
# @return [Numeric] (weight * assumed_prob + totals * prob) / (weight + totals)
def word_weighted_average(word, cat, opts={})
  custom_prob = opts[:func]
  prior_weight = opts[:weight] || 1.0
  prior = opts[:assumed_prob] || 0.5

  # Observed probability: the caller's function if given, else word_prob.
  observed =
    if custom_prob
      custom_prob.call(word, cat)
    else
      word_prob(word, cat)
    end

  # How many times this word has been counted, over every category.
  occurrences = categories.inject(0) { |acc, c| acc + word_count(word, c) }

  (prior_weight * prior + occurrences * observed) / (prior_weight + occurrences)
end