Class: StuffClassifier::Base
- Inherits:
-
Object
- Object
- StuffClassifier::Base
- Extended by:
- Storage::ActAsStorable
- Defined in:
- lib/stuff-classifier/base.rb
Class Attribute Summary collapse
Instance Attribute Summary collapse
-
#category_list ⇒ Object
readonly
Returns the value of attribute category_list.
-
#language ⇒ Object
Returns the value of attribute language.
-
#min_prob ⇒ Object
Returns the value of attribute min_prob.
-
#name ⇒ Object
readonly
Returns the value of attribute name.
-
#thresholds ⇒ Object
Returns the value of attribute thresholds.
-
#tokenizer ⇒ Object
Returns the value of attribute tokenizer.
-
#training_count ⇒ Object
readonly
Returns the value of attribute training_count.
-
#word_list ⇒ Object
readonly
Returns the value of attribute word_list.
Class Method Summary collapse
Instance Method Summary collapse
-
#cat_count(category) ⇒ Object
return the number of training documents for a category.
-
#categories ⇒ Object
return the list of categories.
-
#categories_with_word_count(word) ⇒ Object
return the number of categories in which a word appears.
-
#classify(text, default = nil) ⇒ Object
classify a text.
- #incr_cat(category) ⇒ Object
- #incr_word(word, category) ⇒ Object
-
#initialize(name, opts = {}) ⇒ Base
constructor
opts: language, stemming (true | false), weight, assumed_prob, storage, purge_state.
- #save_state ⇒ Object
-
#total_cat_count ⇒ Object
return the number of training items.
-
#total_categories ⇒ Object
return the number of categories.
-
#total_word_count(word) ⇒ Object
return the number of times the word appears in all categories.
-
#total_word_count_in_cat(cat) ⇒ Object
return the number of words in a category.
-
#train(category, text) ⇒ Object
train the classifier.
-
#word_count(word, category) ⇒ Object
return number of times the word appears in a category.
Methods included from Storage::ActAsStorable
Constructor Details
#initialize(name, opts = {}) ⇒ Base
opts: language, stemming (true | false), weight, assumed_prob, storage, purge_state
27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 |
# File 'lib/stuff-classifier/base.rb', line 27 def initialize(name, opts={}) @version = StuffClassifier::VERSION @name = name # This values are nil or are loaded from storage @word_list = {} @category_list = {} @training_count=0 # storage purge_state = opts[:purge_state] @storage = opts[:storage] || StuffClassifier::Base.storage unless purge_state @storage.load_state(self) else @storage.purge_state(self) end # This value can be set during initialization or overrided after load_state @thresholds = opts[:thresholds] || {} @min_prob = opts[:min_prob] || 0.0 @ignore_words = nil @tokenizer = StuffClassifier::Tokenizer.new(opts) end |
Class Attribute Details
.storage ⇒ Object
175 176 177 178 |
# File 'lib/stuff-classifier/base.rb', line 175 def storage @storage = StuffClassifier::InMemoryStorage.new unless defined? @storage @storage end |
Instance Attribute Details
#category_list ⇒ Object (readonly)
Returns the value of attribute category_list.
7 8 9 |
# File 'lib/stuff-classifier/base.rb', line 7 def category_list @category_list end |
#language ⇒ Object
Returns the value of attribute language.
11 12 13 |
# File 'lib/stuff-classifier/base.rb', line 11 def language @language end |
#min_prob ⇒ Object
Returns the value of attribute min_prob.
14 15 16 |
# File 'lib/stuff-classifier/base.rb', line 14 def min_prob @min_prob end |
#name ⇒ Object (readonly)
Returns the value of attribute name.
5 6 7 |
# File 'lib/stuff-classifier/base.rb', line 5 def name @name end |
#thresholds ⇒ Object
Returns the value of attribute thresholds.
13 14 15 |
# File 'lib/stuff-classifier/base.rb', line 13 def thresholds @thresholds end |
#tokenizer ⇒ Object
Returns the value of attribute tokenizer.
10 11 12 |
# File 'lib/stuff-classifier/base.rb', line 10 def tokenizer @tokenizer end |
#training_count ⇒ Object (readonly)
Returns the value of attribute training_count.
8 9 10 |
# File 'lib/stuff-classifier/base.rb', line 8 def training_count @training_count end |
#word_list ⇒ Object (readonly)
Returns the value of attribute word_list.
6 7 8 |
# File 'lib/stuff-classifier/base.rb', line 6 def word_list @word_list end |
Class Method Details
.open(name) ⇒ Object
180 181 182 183 184 185 186 187 188 |
# File 'lib/stuff-classifier/base.rb', line 180 def open(name) inst = self.new(name) if block_given? yield inst inst.save_state else inst end end |
Instance Method Details
#cat_count(category) ⇒ Object
return the number of training documents for a category
108 109 110 |
# File 'lib/stuff-classifier/base.rb', line 108 def cat_count(category) @category_list[category][:_count] ? @category_list[category][:_count].to_f : 0.0 end |
#categories ⇒ Object
return the list of categories
124 125 126 |
# File 'lib/stuff-classifier/base.rb', line 124 def categories @category_list.keys end |
#categories_with_word_count(word) ⇒ Object
return the number of categories in which a word appears
113 114 115 116 |
# File 'lib/stuff-classifier/base.rb', line 113 def categories_with_word_count(word) return 0 unless @word_list[word] && @word_list[word][:categories] @word_list[word][:categories].length end |
#classify(text, default = nil) ⇒ Object
classify a text
135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 |
# File 'lib/stuff-classifier/base.rb', line 135 def classify(text, default=nil) # Find the category with the highest probability max_prob = @min_prob best = nil scores = cat_scores(text) scores.each do |score| cat, prob = score if prob > max_prob max_prob = prob best = cat end end # Return the default category in case the threshold condition was # not met. For example, if the threshold for :spam is 1.2 # # :spam => 0.73, :ham => 0.40 (OK) # :spam => 0.80, :ham => 0.70 (Fail, :ham is too close) return default unless best threshold = @thresholds[best] || 1.0 scores.each do |score| cat, prob = score next if cat == best return default if prob * threshold > max_prob end return best end |
#incr_cat(category) ⇒ Object
74 75 76 77 78 79 80 81 82 |
# File 'lib/stuff-classifier/base.rb', line 74 def incr_cat(category) @category_list[category] ||= {} @category_list[category][:_count] ||= 0 @category_list[category][:_count] += 1 @training_count ||= 0 @training_count += 1 end |
#incr_word(word, category) ⇒ Object
56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 |
# File 'lib/stuff-classifier/base.rb', line 56 def incr_word(word, category) @word_list[word] ||= {} @word_list[word][:categories] ||= {} @word_list[word][:categories][category] ||= 0 @word_list[word][:categories][category] += 1 @word_list[word][:_total_word] ||= 0 @word_list[word][:_total_word] += 1 # words count by categroy @category_list[category] ||= {} @category_list[category][:_total_word] ||= 0 @category_list[category][:_total_word] += 1 end |
#save_state ⇒ Object
168 169 170 |
# File 'lib/stuff-classifier/base.rb', line 168 def save_state @storage.save_state(self) end |
#total_cat_count ⇒ Object
return the number of training items
103 104 105 |
# File 'lib/stuff-classifier/base.rb', line 103 def total_cat_count @training_count end |
#total_categories ⇒ Object
return the number of categories
119 120 121 |
# File 'lib/stuff-classifier/base.rb', line 119 def total_categories categories.length end |
#total_word_count(word) ⇒ Object
return the number of times the word appears in all categories
91 92 93 94 |
# File 'lib/stuff-classifier/base.rb', line 91 def total_word_count(word) return 0.0 unless @word_list[word] && @word_list[word][:_total_word] @word_list[word][:_total_word].to_f end |
#total_word_count_in_cat(cat) ⇒ Object
return the number of words in a categories
97 98 99 100 |
# File 'lib/stuff-classifier/base.rb', line 97 def total_word_count_in_cat(cat) return 0.0 unless @category_list[cat] && @category_list[cat][:_total_word] @category_list[cat][:_total_word].to_f end |
#train(category, text) ⇒ Object
train the classifier
129 130 131 132 |
# File 'lib/stuff-classifier/base.rb', line 129 def train(category, text) @tokenizer.each_word(text) {|w| incr_word(w, category) } incr_cat(category) end |
#word_count(word, category) ⇒ Object
return number of times the word appears in a category
85 86 87 88 |
# File 'lib/stuff-classifier/base.rb', line 85 def word_count(word, category) return 0.0 unless @word_list[word] && @word_list[word][:categories] && @word_list[word][:categories][category] @word_list[word][:categories][category].to_f end |