Class: TeRex::Classifier::Bayes

Inherits:
Object
  • Object
show all
Defined in:
lib/te_rex/bayes.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(*categories) ⇒ Bayes

categories = [{:tag => "Thing1", :msg => "Thing1 message"}, {:tag => "Thing2", :msg => "Thing2 message"}] — for example: initialize({:tag => "Refund", :msg => "You'll get a refund"}, {:tag => "Nonrefund", :msg => "You won't get a refund"})



12
13
14
15
16
17
18
19
# File 'lib/te_rex/bayes.rb', line 12

# Sets up an untrained classifier from a list of category descriptors.
#
# Each descriptor is read with +[:tag]+ and +[:msg]+. The tag, normalised by
# TeRex::Format.category_term, keys an empty per-category word table; the raw
# tag keys the message lookup.
#
# @param categories [Array<Hash>] descriptors responding to +[:tag]+ and +[:msg]+
def initialize(*categories)
  # First pass: one empty word table per normalised category term.
  @clasif = categories.each_with_object({}) do |descriptor, tables|
    tables[TeRex::Format.category_term(descriptor[:tag])] = {}
  end

  # Second pass: message lookup, keyed by the tag exactly as it was given.
  @messages = categories.each_with_object({}) do |descriptor, lookup|
    lookup[descriptor[:tag]] = descriptor[:msg]
  end

  @total_words = 0
  # Default of 0 lets #train increment a category without seeding it.
  @category_counts = Hash.new(0)
end

Instance Attribute Details

#category_counts ⇒ Object (readonly)

Returns the value of attribute category_counts.



8
9
10
# File 'lib/te_rex/bayes.rb', line 8

# Reader for @category_counts, the hash that #train increments by one per
# call for the category being trained.
#
# @return [Hash] training-call counts keyed by category term
def category_counts
  @category_counts
end

#messages ⇒ Object (readonly)

Returns the value of attribute messages.



8
9
10
# File 'lib/te_rex/bayes.rb', line 8

# Reader for @messages, the hash built in #initialize that maps each
# category descriptor's :tag to its :msg.
#
# @return [Hash] messages keyed by the tag as passed to the constructor
def messages
  @messages
end

#total_words ⇒ Object (readonly)

Returns the value of attribute total_words.



8
9
10
# File 'lib/te_rex/bayes.rb', line 8

# Reader for @total_words, the running sum of the word counts that #train
# has added across all categories (starts at 0 in #initialize).
#
# @return [Integer] total of all word counts seen during training
def total_words
  @total_words
end

Instance Method Details

#categories ⇒ Object



57
58
59
# File 'lib/te_rex/bayes.rb', line 57

# Lists the names of the categories this classifier knows about.
#
# @return [Array<String>] the keys of the category word index, as strings
def categories
  # The index lives in @clasif (the spelling used by #initialize, #train and
  # #classifications). Reading @classif hit an unset ivar and raised
  # NoMethodError on nil.
  @clasif.keys.map(&:to_s)
end

#classifications(text) ⇒ Object



33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# File 'lib/te_rex/bayes.rb', line 33

# Scores +text+ against every known category.
#
# For each category the score is the sum, over the distinct words of the
# text, of log(word_count_in_category / total_words_in_category), plus
# log(times_category_was_trained / total_training_calls). Words or categories
# that were never seen contribute a pseudo-count of 0.1.
#
# @param text [Object] passed unchanged to BayesData.index_frequency
#   (presumably a String — confirm against that helper)
# @return [Hash{String => Float}] score per category name; higher is better
def classifications(text)
  # The word index depends only on the text, so build it once rather than
  # once per category.
  words = BayesData.index_frequency(text)

  # A zero denominator would turn 0.1 / 0.0 into Infinity, and log(Infinity)
  # would hand an untrained category an unbeatable score. Fall back to 1.0
  # so the pseudo-counts stay finite.
  training_count = @category_counts.values.reduce(0, :+).to_f
  training_count = 1.0 if training_count.zero?

  @clasif.each_with_object({}) do |(category, category_words), score|
    total = category_words.values.reduce(0, :+).to_f
    total = 1.0 if total.zero?

    tally = 0
    words.each do |word, _count|
      seen = category_words.key?(word) ? category_words[word] : 0.1
      tally += Math.log(seen / total)
    end

    # Prior: how often this category was trained relative to all training.
    prior = @category_counts.key?(category) ? @category_counts[category] : 0.1
    score[category.to_s] = tally + Math.log(prior / training_count)
  end
end

#classify(text) ⇒ Object



52
53
54
55
# File 'lib/te_rex/bayes.rb', line 52

# Picks the best-scoring category for +text+ and pairs it with its message.
#
# @param text [Object] forwarded unchanged to #classifications
# @return [Array] +[tag, message]+ where +tag+ is the winning category name
#   and +message+ is +@messages[tag]+; +[nil, nil]+ when there are no
#   categories to rank
def classify(text)
  # Only the top entry is needed, so take the maximum instead of sorting the
  # whole score table.
  best = classifications(text).max_by { |_category, score| score }

  # With no categories the old code called [] on nil and raised NoMethodError.
  return [nil, nil] if best.nil?

  tag = best.first
  # NOTE(review): tag is the stringified normalised category term, while
  # @messages is keyed by the raw :tag given to the constructor — this lookup
  # only hits when the two coincide; confirm against Format.category_term.
  [tag, @messages[tag]]
end

#train(ctgry, text) ⇒ Object



21
22
23
24
25
26
27
28
29
30
31
# File 'lib/te_rex/bayes.rb', line 21

# Adds the words of +text+ to the word table of category +ctgry+.
#
# Increments the category's training count by one, adds each word's count to
# the category's word table, and adds the same counts to @total_words.
#
# @param ctgry [Object] category name; normalised with
#   TeRex::Format.category_term before lookup
# @param text [Object] passed unchanged to BayesData.index_frequency
# @return [Object] the word index returned by BayesData.index_frequency
# @raise [ArgumentError] if the category was not registered in the constructor
def train(ctgry, text)
  category = TeRex::Format.category_term(ctgry)
  word_table = @clasif[category]

  # Validate before mutating anything: previously the training count was
  # bumped first and the method then crashed on nil[word], leaving a phantom
  # count that skewed every later prior.
  raise ArgumentError, "unknown category: #{ctgry.inspect}" if word_table.nil?

  @category_counts[category] += 1

  BayesData.index_frequency(text).each do |word, count|
    word_table[word] = (word_table[word] || 0) + count
    @total_words += count
  end
end

#training_description ⇒ Object



61
62
63
64
65
66
67
68
69
70
# File 'lib/te_rex/bayes.rb', line 61

# Describes, per registered category, how its training volume compares with
# the average.
#
# The threshold is total_words divided by the number of categories that have
# a training count; each category's ratio is total_words divided by its own
# training count. A category is flagged when its ratio is at or above the
# threshold.
#
# @return [Array<Array>] one +[flagged, term, {"description" => {...}}]+
#   entry per category, where the description holds "training_ratio",
#   "threshold", "category_counts" and "total_words" as strings
def training_description
  total = @total_words.to_f # divide as floats; (int / int).to_f truncated first
  trained = category_counts.keys.count

  # With nothing trained, the integer division raised ZeroDivisionError.
  max_threshold = trained.zero? ? 0.0 : total / trained

  @clasif.keys.map do |term|
    cc = category_counts.fetch(term, 0)
    # A category with no training calls used to raise ZeroDivisionError;
    # report an infinite ratio so it is always flagged.
    train_ratio = cc.zero? ? Float::INFINITY : total / cc

    [
      train_ratio >= max_threshold,
      term,
      {
        "description" => {
          "training_ratio" => train_ratio.to_s,
          "threshold" => max_threshold.to_s,
          "category_counts" => cc.to_s,
          "total_words" => @total_words.to_s
        }
      }
    ]
  end
end

#under_trained? ⇒ Boolean

Returns:

  • (Boolean)


72
73
74
# File 'lib/te_rex/bayes.rb', line 72

# Reports whether any category is flagged by #training_description.
#
# @return [Boolean] true when at least one entry's first element (the flag)
#   is set, false otherwise
def under_trained?
  # The previous implementation returned the Array from #select; an empty
  # Array is still truthy, so the predicate could never be false.
  # NOTE(review): callers that need the flagged entries themselves should
  # filter #training_description directly.
  training_description.any?(&:first)
end