Class: StuffClassifier::TfIdf

Inherits:
Base
  • Object
show all
Extended by:
Storage::ActAsStorable
Defined in:
lib/stuff-classifier/tf-idf.rb

Instance Attribute Summary

Attributes inherited from Base

#category_list, #language, #min_prob, #name, #thresholds, #tokenizer, #training_count, #word_list

Instance Method Summary collapse

Methods included from Storage::ActAsStorable

storable, to_store

Methods inherited from Base

#cat_count, #categories, #categories_with_word_count, #classify, #incr_cat, #incr_word, open, #save_state, #total_cat_count, #total_categories, #total_word_count, #total_word_count_in_cat, #train, #word_count

Constructor Details

#initialize(name, opts = {}) ⇒ TfIdf

Returns a new instance of TfIdf.



5
6
7
# File 'lib/stuff-classifier/tf-idf.rb', line 5

# Builds a TfIdf classifier by delegating construction entirely to the
# superclass; no TfIdf-specific state is set up here.
#
# @param name forwarded unchanged to the superclass initializer
# @param opts [Hash] forwarded unchanged to the superclass initializer
#   (defaults to an empty hash)
def initialize(name, opts = {})
  # Bare `super` forwards the current values of +name+ and +opts+.
  super
end

Instance Method Details

#cat_scores(text) ⇒ Object



24
25
26
27
28
29
30
31
# File 'lib/stuff-classifier/tf-idf.rb', line 24

# Scores +text+ against every category and ranks the categories.
#
# @param text the text handed to #text_prob for each category
# @return [Array<Array>] +[category, score]+ pairs ordered from the highest
#   score to the lowest
def cat_scores(text)
  score_by_category = categories.each_with_object({}) do |category, scores|
    scores[category] = text_prob(text, category)
  end

  # Compare on the score (second element), right-hand side first so the
  # result is in descending order.
  score_by_category.to_a.sort { |left, right| right[1] <=> left[1] }
end

#text_prob(text, cat) ⇒ Object



20
21
22
# File 'lib/stuff-classifier/tf-idf.rb', line 20

# Adds up the #word_prob score of every word the tokenizer extracts from
# +text+, for a single category.
#
# @param text the text passed to +@tokenizer.each_word+
# @param cat the category each word is scored against
# @return the sum of the per-word scores; the Integer 0 when the tokenizer
#   yields no words
def text_prob(text, cat)
  word_scores = @tokenizer.each_word(text).map { |token| word_prob(token, cat) }
  word_scores.reduce(0) { |total, score| total + score }
end

#word_classification_detail(word) ⇒ Object



33
34
35
36
37
38
39
40
41
42
43
# File 'lib/stuff-classifier/tf-idf.rb', line 33

# Debugging aid: prints, for every category, the #word_prob score of +word+
# and then the #text_prob score obtained by treating +word+ as a whole text.
# Each table is preceded by a label written with +p+ and dumped with +ap+.
#
# @param word the word to inspect
# @return whatever the final +ap+ call returns
def word_classification_detail(word)
  p "tf_idf"
  tf_idf_by_category = categories.each_with_object({}) do |category, table|
    table[category] = word_prob(word, category)
  end
  ap tf_idf_by_category

  p "text_prob"
  text_prob_by_category = categories.each_with_object({}) do |category, table|
    table[category] = text_prob(word, category)
  end
  ap text_prob_by_category
end

#word_prob(word, cat) ⇒ Object



10
11
12
13
14
15
16
17
18
# File 'lib/stuff-classifier/tf-idf.rb', line 10

# Computes the TF-IDF style weight of +word+ within category +cat+.
#
# The term frequency is the word's count in the category divided by the
# category's count; the inverse document frequency is
# log10((total_categories + 2) / (categories_with_word_count(word) + 1.0)).
#
# @param word the word to score
# @param cat the category to score the word against
# @return [Float] +tf * idf+, or 0.0 when the category has a zero count
def word_prob(word, cat)
  cat_nr = cat_count(cat)

  # A category with a zero count would make the division below produce NaN
  # or Infinity; NaN then poisons any sum built on this value and cannot be
  # ordered with <=>. Such a category contributes nothing instead.
  return 0.0 if cat_nr.zero?

  word_cat_nr = word_count(word, cat)

  # Multiply by 1.0 first so the division is floating point even when both
  # counts are Integers.
  tf = 1.0 * word_cat_nr / cat_nr

  # The +2 / +1.0 offsets keep the ratio finite and the logarithm defined
  # even when no category contains the word.
  idf = Math.log10((total_categories + 2) / (categories_with_word_count(word) + 1.0))
  tf * idf
end