Class: StuffClassifier::TfIdf

Inherits:
Base
  • Object
show all
Defined in:
lib/stuff-classifier/tf-idf.rb

Instance Attribute Summary

Attributes inherited from Base

#name

Attributes included from Tokenizer

#stemming

Instance Method Summary collapse

Methods inherited from Base

#cat_count, #categories, #incr_cat, #incr_word, #initialize, open, #save_state, #total_count, #train, #word_count, #word_prob, #word_weighted_average

Methods included from Tokenizer

#each_word, #ignore_words, #ignore_words=, #stemming?

Constructor Details

This class inherits a constructor from StuffClassifier::Base

Instance Method Details

#cat_scores(text) ⇒ Object


18
19
20
21
22
23
24
25
# File 'lib/stuff-classifier/tf-idf.rb', line 18

# Scores +text+ against every category returned by +categories+.
#
# Each category is scored with +text_prob+; a category listed more than once
# keeps a single entry (its last computed score) because results are collected
# in a Hash first.
#
# @param text [Object] the text handed unchanged to +text_prob+
# @return [Array<Array>] +[category, score]+ pairs ordered by score, highest
#   first
def cat_scores(text)
  scores = categories.each_with_object({}) do |cat, acc|
    acc[cat] = text_prob(text, cat)
  end

  # Compare right-hand side first so that larger scores sort to the front.
  scores.to_a.sort { |left, right| right[1] <=> left[1] }
end

#classify(text, default = nil) ⇒ Object


27
28
29
30
31
32
33
34
35
36
37
38
39
40
# File 'lib/stuff-classifier/tf-idf.rb', line 27

# Picks the category with the highest strictly positive score for +text+.
#
# Walks every pair produced by +cat_scores+ and remembers the first pair whose
# score beats the running maximum, which starts at 0.0. Scores that are zero
# or negative therefore never win.
#
# @param text [Object] the text handed unchanged to +cat_scores+
# @param default [Object, nil] value returned when no score is above zero
# @return [Object, nil] the winning category, or +default+
def classify(text, default = nil)
  top_cat = nil
  top_score = 0.0

  cat_scores(text).each do |cat, prob|
    # Strict comparison: on a tie the earlier pair keeps the lead.
    next unless prob > top_score

    top_score = prob
    top_cat = cat
  end

  return default unless top_score > 0

  top_cat
end

#text_prob(text, cat) ⇒ Object


14
15
16
# File 'lib/stuff-classifier/tf-idf.rb', line 14

# Sums the +tf_idf+ weight of every word of +text+ for category +cat+.
#
# Words come from +each_word(text)+ (called without a block); the weights are
# added with plain +, starting from the Integer 0.
#
# @param text [Object] the text handed unchanged to +each_word+
# @param cat [Object] the category handed unchanged to +tf_idf+
# @return [Numeric] the accumulated weight; 0 when there are no words
def text_prob(text, cat)
  weights = each_word(text).map { |word| tf_idf(word, cat) }
  weights.reduce(0) { |total, weight| total + weight }
end

#tf_idf(word, cat) ⇒ Object


2
3
4
5
6
7
8
9
10
11
12
# File 'lib/stuff-classifier/tf-idf.rb', line 2

# Computes a TF-IDF style weight for +word+ within category +cat+.
#
# * tf  = word_count(word, cat) / cat_count(cat), as a Float
# * idf = log10((categories.length + 2) / (@wcount[word].length + 1.0)),
#   where a word missing from +@wcount+ counts as appearing nowhere
#
# The +2 / +1.0 offsets keep the idf ratio above 1 (so the logarithm stays
# positive) as long as the word is recorded in no more entries than there are
# categories.
# NOTE(review): @wcount[word] is presumably keyed by category - confirm
# against the Base class.
#
# @param word [Object] token looked up via +word_count+ and in +@wcount+
# @param cat [Object] category handed to +word_count+ and +cat_count+
# @return [Float] tf * idf, or 0.0 when +cat_count(cat)+ is zero
def tf_idf(word, cat)
  word_cat_nr = word_count(word, cat)
  cat_nr = cat_count(cat)

  # Float division by a zero count would yield NaN (or Infinity); NaN scores
  # cannot be ordered with <=>, so report "no weight" instead.
  return 0.0 if cat_nr.zero?

  tf = 1.0 * word_cat_nr / cat_nr

  total_categories = categories.length
  categories_with_word = (@wcount[word] || []).length

  idf = Math.log((total_categories + 2) / (categories_with_word + 1.0), 10)
  tf * idf
end