Class: StuffClassifier::TfIdf

Inherits:
Base
  • Object
show all
Extended by:
Storage::ActAsStorable
Defined in:
lib/stuff-classifier/tf-idf.rb

Instance Attribute Summary

Attributes inherited from Base

#category_list, #language, #min_prob, #name, #thresholds, #tokenizer, #training_count, #word_list

Instance Method Summary

Methods included from Storage::ActAsStorable

storable, to_store

Methods inherited from Base

#cat_count, #categories, #categories_with_word_count, #classify, #incr_cat, #incr_word, open, #save_state, #total_cat_count, #total_categories, #total_word_count, #total_word_count_in_cat, #train, #word_count

Constructor Details

#initialize(name, opts = {}) ⇒ TfIdf

Returns a new instance of TfIdf.



4
5
6
# File 'lib/stuff-classifier/tf-idf.rb', line 4

# Creates a TfIdf classifier.
#
# The body does nothing except hand both arguments, unchanged, to the
# parent initializer.
#
# @param name [Object] classifier name, forwarded to the parent initializer
# @param opts [Hash] options, forwarded to the parent initializer
def initialize(name, opts = {})
  # Bare `super` re-sends this method's own arguments (name, opts).
  super
end

Instance Method Details

#cat_scores(text) ⇒ Object



23
24
25
26
27
28
29
30
# File 'lib/stuff-classifier/tf-idf.rb', line 23

# Scores +text+ against every entry of +categories+.
#
# @param text [Object] input passed unchanged to +text_prob+
# @return [Array<Array>] +[category, score]+ pairs ordered from the highest
#   score to the lowest
def cat_scores(text)
  # Collect one score per category, keyed by the category itself.
  scores = categories.each_with_object({}) do |category, acc|
    acc[category] = text_prob(text, category)
  end

  # Descending order: the right-hand score is compared against the left.
  scores.to_a.sort { |left, right| right.last <=> left.last }
end

#text_prob(text, cat) ⇒ Object



19
20
21
# File 'lib/stuff-classifier/tf-idf.rb', line 19

# Adds up +word_prob+ over every word the tokenizer produces for +text+.
#
# @param text [Object] input handed to +@tokenizer.each_word+
# @param cat [Object] category passed through to +word_prob+
# @return [Numeric] the sum of the per-word scores; +0+ when the tokenizer
#   produces no words
def text_prob(text, cat)
  words = @tokenizer.each_word(text)
  # Plain left-to-right addition starting from the Integer 0.
  words.inject(0) { |total, word| total + word_prob(word, cat) }
end

#word_classification_detail(word) ⇒ Object



32
33
34
35
36
37
38
39
40
41
42
# File 'lib/stuff-classifier/tf-idf.rb', line 32

# Debug helper that prints two per-category tables for +word+.
#
# First it prints the label "tf_idf" followed by a hash mapping each category
# to +word_prob(word, category)+; then the label "text_prob" followed by a
# hash mapping each category to +text_prob(word, category)+.
#
# NOTE(review): +ap+ is not defined in the visible source — presumably it is
# the awesome_print helper; confirm it is loaded wherever this is called.
#
# @param word [Object] the word to inspect; it is also passed as the text
#   argument of +text_prob+
# @return [Object] whatever the final +ap+ call returns
def word_classification_detail(word)
  p "tf_idf"
  tf_idf_by_category = categories.each_with_object({}) do |category, scores|
    scores[category] = word_prob(word, category)
  end
  ap tf_idf_by_category

  p "text_prob"
  text_prob_by_category = categories.each_with_object({}) do |category, scores|
    scores[category] = text_prob(word, category)
  end
  ap text_prob_by_category
end

#word_prob(word, cat) ⇒ Object



9
10
11
12
13
14
15
16
17
# File 'lib/stuff-classifier/tf-idf.rb', line 9

# Computes the TF-IDF style weight of +word+ within +cat+.
#
# * tf  = word_count(word, cat) / cat_count(cat)
# * idf = log10((total_categories + 2) / (categories_with_word_count(word) + 1.0))
#
# The +2 / +1.0 offsets keep the ratio strictly positive and its denominator
# non-zero, even for a word that appears in no category.
#
# @param word [Object] the word to score
# @param cat [Object] the category to score the word against
# @return [Float] tf * idf, or +0.0+ when +cat_count(cat)+ is zero
def word_prob(word, cat)
  word_cat_nr = word_count(word, cat)
  cat_nr = cat_count(cat)

  # A zero category count would make the division below yield NaN (0/0) or
  # Infinity. A NaN score cannot be ordered with <=>, so sorting scores would
  # raise. Such a category contributes nothing instead.
  return 0.0 if cat_nr.zero?

  tf = 1.0 * word_cat_nr / cat_nr
  idf = Math.log10((total_categories + 2) / (categories_with_word_count(word) + 1.0))
  tf * idf
end