Class: Dictionary::TF_IDF

Inherits:
Object
  • Object
show all
Defined in:
lib/rbbt/bow/dictionary.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(options = {}) ⇒ TF_IDF

Returns a new instance of TF_IDF.



17
18
19
20
21
22
23
24
25
26
# File 'lib/rbbt/bow/dictionary.rb', line 17

# Builds an empty dictionary.
#
# @param options [Hash] recognised key: +:limit+, the vocabulary size above
#   which #add stops accepting new terms (defaults to 500_000; a nil value
#   disables the check in #add)
def initialize(options = {})
  defaults = { :limit => 500_000 }
  @term_limit = defaults.merge(options)[:limit]

  # Both tallies default missing keys to 0 so they can be incremented directly.
  @terms, @docs = Hash.new(0), Hash.new(0)

  @num_docs = 0
  @total_terms = 0
end

Instance Attribute Details

#docs ⇒ Object (readonly)

Returns the value of attribute docs.



15
16
17
# File 'lib/rbbt/bow/dictionary.rb', line 15

# Per-term tally of how many #add calls included the term (each #add call
# bumps the entry of every term it receives by one). Missing terms read as 0.
def docs
  @docs
end

#num_docs ⇒ Object (readonly)

Returns the value of attribute num_docs.



15
16
17
# File 'lib/rbbt/bow/dictionary.rb', line 15

# Number of #add calls made so far (incremented once per call).
def num_docs
  @num_docs
end

#terms ⇒ Object (readonly)

Returns the value of attribute terms.



15
16
17
# File 'lib/rbbt/bow/dictionary.rb', line 15

# Per-term running sum of the counts passed to #add. Missing terms read as 0.
def terms
  @terms
end

#total_terms ⇒ Object (readonly)

Returns the value of attribute total_terms.



15
16
17
# File 'lib/rbbt/bow/dictionary.rb', line 15

# Sum of every count accumulated by #add, across all terms. Counts of terms
# that #add discards because of the term limit are not included.
def total_terms
  @total_terms
end

Instance Method Details

#add(terms) ⇒ Object



29
30
31
32
33
34
35
36
37
38
39
40
# File 'lib/rbbt/bow/dictionary.rb', line 29

# Registers the term counts of one document in the dictionary.
#
# Once the dictionary holds more distinct terms than the configured limit,
# terms that are not already known are ignored, so the vocabulary stops
# growing while known terms keep being counted.
#
# @param terms [#each] term => count pairs (presumably a Hash of term
#   occurrences for one document); the collection is never modified
# @return [Integer] the number of documents added so far
def add(terms)
  if @term_limit && @terms.length > @term_limit
    # Filter into a new collection: the previous delete_if call removed the
    # unknown terms from the caller's own object as a side effect.
    terms = terms.select { |term, _count| @terms.include?(term) }
  end

  terms.each do |term, count|
    @terms[term] += count
    @total_terms += count
    @docs[term]  += 1
  end
  @num_docs += 1
end

#best(options = {}) ⇒ Object



76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# File 'lib/rbbt/bow/dictionary.rb', line 76

# Scores terms and returns the best ones as a term => score Hash.
#
# Only terms whose document frequency (see #df) lies within [low, high] are
# considered. Each is scored as
#   (accumulated term count / number of docs) * log(1 / document frequency)
#
# NOTE: results are memoized per options digest and nothing in this method
# refreshes them, so calling #add afterwards does not change what an
# already-requested options set returns.
#
# @param options [Hash] +:low+ (default 0) and +:high+ (default 1) bound the
#   document frequency; +:limit+, when given, keeps only that many top-scoring
#   terms, ordered by descending score
# @return [Hash] term => score
def best(options = {})
  key = Misc.obj2digest(options)
  @best ||= {}
  @best[key] ||= begin
    settings = { :low => 0, :high => 1 }.merge(options)
    high, low, limit = settings.values_at(:high, :low, :limit)

    num_docs = @num_docs.to_f
    in_range = df.select { |_term, value| value >= low && value <= high }
    scored = in_range.collect { |term, df_value|
      [term, @terms[term].to_f / num_docs * Math.log(1.0 / df_value)]
    }

    if limit
      # first(limit) keeps exactly `limit` entries; the former
      # slice(0, limit - 1) kept one too few and returned nil for a limit of 0.
      scored = scored.sort { |a, b| b[1] <=> a[1] }.first(limit)
    end

    # Build from pairs rather than a flattened splat, so terms that are
    # themselves Arrays stay intact and no huge argument list is created.
    Hash[scored]
  end
end

#df ⇒ Object



42
43
44
45
46
47
48
# File 'lib/rbbt/bow/dictionary.rb', line 42

# Document frequency of every term: the share of added documents in which
# the term appeared.
#
# @return [Hash] term => fraction; unknown terms read as 0
def df
  @docs.each_with_object(Hash.new(0)) do |(term, doc_count), freqs|
    freqs[term] = doc_count.to_f / @num_docs
  end
end

#idf ⇒ Object



58
59
60
61
62
63
64
65
# File 'lib/rbbt/bow/dictionary.rb', line 58

# Inverse document frequency of every term:
# log(number of docs / docs containing the term).
#
# @return [Hash] term => idf; unknown terms read as 0
def idf
  doc_total = @num_docs.to_f
  @docs.each_with_object(Hash.new(0)) do |(term, doc_count), scores|
    scores[term] = Math.log(doc_total / doc_count)
  end
end

#tf ⇒ Object



50
51
52
53
54
55
56
# File 'lib/rbbt/bow/dictionary.rb', line 50

# Term frequency of every term: its accumulated count divided by the sum of
# all accumulated counts.
#
# @return [Hash] term => fraction; unknown terms read as 0
def tf
  @terms.each_with_object(Hash.new(0)) do |(term, occurrences), freqs|
    freqs[term] = occurrences.to_f / @total_terms
  end
end

#tf_idf ⇒ Object



67
68
69
70
71
72
73
74
# File 'lib/rbbt/bow/dictionary.rb', line 67

# TF-IDF score of every term seen in at least one document:
# (term count / total term count) * log(number of docs / docs with the term).
#
# @return [Hash] term => score; unknown terms read as 0
def tf_idf
  doc_total = @num_docs.to_f
  @docs.each_with_object(Hash.new(0)) do |(term, doc_count), scores|
    scores[term] = @terms[term].to_f / @total_terms * Math.log(doc_total / doc_count)
  end
end

#weights(options = {}) ⇒ Object



105
106
107
108
109
110
111
112
113
114
# File 'lib/rbbt/bow/dictionary.rb', line 105

# Inverse-document-frequency weight for each term selected by #best.
#
# @param options [Hash] passed unchanged to #best
# @return [Hash] term => log(number of docs / docs containing the term)
def weights(options = {})
  selected = best(options).keys
  doc_total = @num_docs.to_f

  selected.each_with_object({}) do |term, result|
    result[term] = Math.log(doc_total / @docs[term])
  end
end