Class: NBayes::Base

Inherits:
Object
  • Object
show all
Defined in:
lib/nbayes.rb

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(options = {}) ⇒ Base

Returns a new instance of Base.



183
184
185
186
187
188
189
190
# File 'lib/nbayes.rb', line 183

# Builds a new classifier.
#
# Options (all optional):
#   :binarized      - when true, duplicate tokens in a document count once
#   :log_vocab      - forwarded to Vocab.new as :log_size
#   :k              - Laplace smoothing constant (default 1)
#   :debug          - print diagnostics during classify (default false)
#   :assume_uniform - assume equal class priors (default false)
#
# Generalization: :k, :debug and :assume_uniform were previously only
# settable through their accessors after construction; accepting them here
# is backward compatible (defaults unchanged when the keys are absent).
def initialize(options={})
  @debug = options.fetch(:debug, false)
  @k = options.fetch(:k, 1)
  @binarized = options[:binarized] || false
  @assume_uniform = options.fetch(:assume_uniform, false)
  @vocab = Vocab.new(:log_size => options[:log_vocab])
  @data = Data.new
end

Instance Attribute Details

#assume_uniformObject

Returns the value of attribute assume_uniform.



180
181
182
# File 'lib/nbayes.rb', line 180

# Generated reader for @assume_uniform: when true, calculate_probabilities
# uses an equal prior for every category instead of estimating priors from
# the per-category example counts.
def assume_uniform
  @assume_uniform
end

#binarizedObject (readonly)

Returns the value of attribute binarized.



181
182
183
# File 'lib/nbayes.rb', line 181

# Generated read-only accessor for @binarized: when true, train and
# classify de-duplicate the token list so each token counts at most once
# per document.
def binarized
  @binarized
end

#dataObject

Returns the value of attribute data.



180
181
182
# File 'lib/nbayes.rb', line 180

# Generated reader for @data: the Data instance holding per-category
# example counts and token counts.
def data
  @data
end

#debugObject

Returns the value of attribute debug.



180
181
182
# File 'lib/nbayes.rb', line 180

# Generated reader for @debug: when true, classify prints its input tokens
# and the resulting probabilities.
def debug
  @debug
end

#kObject

Returns the value of attribute k.



180
181
182
# File 'lib/nbayes.rb', line 180

# Generated reader for @k: the Laplace smoothing constant used in
# calculate_probabilities (added to every token count and, times the vocab
# size, to every denominator).
def k
  @k
end

#vocabObject

Returns the value of attribute vocab.



180
181
182
# File 'lib/nbayes.rb', line 180

# Generated reader for @vocab: the Vocab instance tracking every token
# seen during training.
def vocab
  @vocab
end

Class Method Details

.from(yml_file) ⇒ Object

Loads class instance from a data file (e.g., yaml)



321
322
323
324
325
# File 'lib/nbayes.rb', line 321

# Loads a classifier instance from a serialized data file (e.g., yaml)
# on disk, delegating the actual deserialization to from_yml.
def self.from(yml_file)
  contents = File.open(yml_file, "rb") { |file| file.read }
  from_yml(contents)
end

.from_yml(yml_data) ⇒ Object



314
315
316
317
318
# File 'lib/nbayes.rb', line 314

# Deserializes a classifier from a YAML string, then repairs the Hash
# defaults that YAML serialization does not preserve.
#
# NOTE(review): YAML.load can instantiate arbitrary Ruby objects — only
# feed this trusted data (model files you produced yourself); safe_load
# would reject the custom classes this model contains.
def self.from_yml(yml_data)
  nbayes = YAML.load(yml_data)
  nbayes.reset_after_import()  # yaml does not properly set the defaults on the Hashes
  nbayes
end

Instance Method Details

#calculate_probabilities(tokens) ⇒ Object

Calculates the actual probability of a class given the tokens (this is the workhorse of the code)



256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
# File 'lib/nbayes.rb', line 256

# Calculates the probability of each category given the tokens — the
# work horse of the classifier.
#
# P(class|words) = P(w1,...,wn|class) * P(class) / P(w1,...,wn)
#                = argmax P(w1,...,wn|class) * P(class)
#
# Each token likelihood is Laplace-smoothed:
# P(wi|class) = (count(wi, class) + k) / (count(w, class) + kV)
def calculate_probabilities(tokens)
  vocab_size = vocab.size
  uniform_log_prior = Math.log(1 / data.categories.count.to_f)
  total_examples = data.total_examples.to_f

  scores = {}
  data.each do |category|
    # Prior: either uniform across categories or estimated from the data.
    log_prior =
      if assume_uniform
        uniform_log_prior
      else
        Math.log(data.example_count(category) / total_examples)
      end

    # Sum of smoothed log-likelihoods over all tokens for this category.
    denominator = (data.token_count(category) + @k * vocab_size).to_f
    log_likelihood = tokens.reduce(0) do |sum, token|
      sum + Math.log((data.count_of_token_in_category(category, token) + @k) / denominator)
    end

    scores[category] = log_likelihood + log_prior
  end
  normalize(scores)
end

#category_statsObject



250
251
252
# File 'lib/nbayes.rb', line 250

# Returns per-category statistics, delegated to the underlying Data object.
def category_stats
  data.category_stats
end

#classify(tokens) ⇒ Object



240
241
242
243
244
245
246
247
248
# File 'lib/nbayes.rb', line 240

# Classifies the given tokens: returns the per-category probability hash
# from calculate_probabilities, extended with the NBayes::Result module.
# Honors @binarized (de-duplicates tokens) and @debug (prints trace output).
#
# Fix: removed the dead `probs = {}` assignment that was immediately
# overwritten by the calculate_probabilities call.
def classify(tokens)
  print "classify: #{tokens.join(', ')}\n" if @debug
  tokens = tokens.uniq if binarized
  probs = calculate_probabilities(tokens)
  print "results: #{probs.to_yaml}\n" if @debug
  probs.extend(NBayes::Result)
  probs
end

#delete_category(category) ⇒ Object

Delete an entire category from the classification data



211
212
213
# File 'lib/nbayes.rb', line 211

# Deletes an entire category from the classification data, delegated to
# the underlying Data object.
def delete_category(category)
  data.delete_category(category)
end

#dump(arg) ⇒ Object

Dumps class instance to a data file (e.g., yaml) or a string



340
341
342
343
344
345
346
# File 'lib/nbayes.rb', line 340

# Dumps this classifier instance as YAML. When arg is a String it is
# treated as a file path to write to; otherwise arg is treated as an
# IO-like stream (e.g., StringIO) that the YAML is written into.
#
# Fix: the else branch previously called YAML.dump(arg), which serialized
# the *argument* instead of the classifier — contradicting the documented
# contract ("Dumps class instance"). It now writes self to the given stream.
def dump(arg)
  if arg.instance_of? String
    File.open(arg, "w") {|f| YAML.dump(self, f) }
  else
    YAML.dump(self, arg)
  end
end

#load(yml) ⇒ Object

Load class instance



328
329
330
331
332
333
334
335
336
337
# File 'lib/nbayes.rb', line 328

# Loads a classifier instance from the given argument:
# - nil                         => a fresh NBayes::Base
# - a string starting with "---" => parsed directly as a YAML document
# - anything else               => treated as a path to a YAML file
def load(yml)
  return NBayes::Base.new if yml.nil?

  if yml[0..2] == "---"
    self.class.from_yml(yml)
  else
    self.class.from(yml)
  end
end

#normalize(prob_numerator) ⇒ Object



283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
# File 'lib/nbayes.rb', line 283

# Converts the per-category log-probability numerators into a hash of
# probabilities that sum to 1.
#
# The inputs are log probabilities, so they are negative and the value
# closest to zero is actually the most probable. Dividing the total by
# each entry preserves the relative ratios while flipping the ordering
# back, and a second pass rescales the inverted values into proper
# probabilities.
# Ex: -1,-1,-2  =>  -4/-1, -4/-1, -4/-2  =>  4/10, 4/10, 2/10
def normalize(prob_numerator)
  # Sum of all numerators — the (log-space) normalizing constant.
  total = prob_numerator.values.reduce(0) { |sum, value| sum + value }

  # Invert: total / entry keeps ratios but restores the ordering.
  inverted = prob_numerator.each_with_object({}) do |(cat, numerator), acc|
    acc[cat] = total / numerator.to_f
  end

  # Rescale the inverted values so they sum to 1.
  scale = inverted.values.reduce(0) { |sum, value| sum + value }
  inverted.each_with_object({}) do |(cat, value), acc|
    acc[cat] = value / scale.to_f
  end
end

#purge_less_than(x) ⇒ Object

Allows removal of low frequency words that increase processing time and may overfit

  • tokens with a count less than x (measured by summing across all classes) are removed

Ex: nb.purge_less_than(2)

NOTE: this does not decrement the “examples” count, so purging is not always the same as if the item was never added in the first place, but usually so



198
199
200
201
202
203
204
205
206
207
208
# File 'lib/nbayes.rb', line 198

# Removes low-frequency tokens — those whose count summed across all
# categories is less than x — from both the training data and the vocab.
# Reduces processing time and may curb overfitting.
# NOTE: does not decrement the "examples" count, so purging is not always
# identical to never having added the item.
def purge_less_than(x)
  purged = []
  @vocab.each do |token|
    # data.purge_less_than removes the token's counts and reports whether
    # it fell below the threshold.
    purged << token if data.purge_less_than(token, x)
  end
  # Deletion is deferred so we never mutate @vocab while iterating it.
  purged.uniq.each { |token| @vocab.delete(token) }
end

#reset_after_importObject

called internally after yaml import to reset Hash defaults



310
311
312
# File 'lib/nbayes.rb', line 310

# Called internally after YAML import to reset the Hash defaults that
# serialization drops; delegated to the underlying Data object.
def reset_after_import
  data.reset_after_import
end

#train(tokens, category) ⇒ Object



215
216
217
218
219
220
221
222
# File 'lib/nbayes.rb', line 215

# Adds one training example: bumps the category's example count, then
# registers every token in the vocabulary and in the category's counts.
# When binarized, duplicate tokens in the document count only once.
def train(tokens, category)
  token_list = binarized ? tokens.uniq : tokens
  data.increment_examples(category)
  token_list.each do |token|
    vocab.seen_token(token)
    data.add_token_to_category(category, token)
  end
end

#untrain(tokens, category) ⇒ Object

Be careful with this function:

  • It decrements the number of examples for the category. If the category being untrained has no more examples, it is removed from the category list.

  • It untrains only tokens that were already trained; non-existing tokens are not considered.



228
229
230
231
232
233
234
235
236
237
238
# File 'lib/nbayes.rb', line 228

# Reverses a prior train() call: decrements the category's example count
# (the category is removed entirely if it reaches zero examples), and for
# each token actually trained in that category, removes it from the
# category's counts. Tokens never trained in this category are skipped.
#
# NOTE(review): vocab.delete(token) removes the token from the shared
# vocabulary even if other categories still reference it — confirm this
# is intended, as it may strip tokens still in use elsewhere.
def untrain(tokens, category)
  tokens = tokens.uniq if binarized
  data.decrement_examples(category)

  tokens.each do |token|
    if data.token_trained?(token, category)
      vocab.delete(token)
      data.remove_token_from_category(category, token)
    end
  end
end