Class: Dictionary::TF_IDF

Inherits:
Object
  • Object
show all
Defined in:
lib/rbbt/bow/dictionary.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(options = {}) ⇒ TF_IDF

Returns a new instance of TF_IDF.



17
18
19
20
21
22
23
24
25
26
# File 'lib/rbbt/bow/dictionary.rb', line 17

# Builds an empty dictionary.
#
# +options+ may carry a :limit entry, stored as @term_limit; when the key is
# absent the limit defaults to 500_000. Both counters start at zero and both
# lookup tables are hashes whose missing keys read as 0.
def initialize(options = {})
  defaults = { :limit => 500_000 }
  @term_limit = defaults.merge(options)[:limit]

  @terms, @docs = Hash.new(0), Hash.new(0)
  @num_docs = 0
  @total_terms = 0
end

Instance Attribute Details

#docs ⇒ Object (readonly)

Returns the value of attribute docs.



15
16
17
# File 'lib/rbbt/bow/dictionary.rb', line 15

# Reader for @docs, the hash that #add increments by one for every term it
# processes. Returns the hash itself, not a copy.
def docs
  @docs
end

#num_docs ⇒ Object (readonly)

Returns the value of attribute num_docs.



15
16
17
# File 'lib/rbbt/bow/dictionary.rb', line 15

# Reader for @num_docs, the counter that #add increments once per call.
def num_docs
  @num_docs
end

#terms ⇒ Object (readonly)

Returns the value of attribute terms.



15
16
17
# File 'lib/rbbt/bow/dictionary.rb', line 15

# Reader for @terms, the hash in which #add accumulates the count of each
# term. Returns the hash itself, not a copy.
def terms
  @terms
end

#total_terms ⇒ Object (readonly)

Returns the value of attribute total_terms.



15
16
17
# File 'lib/rbbt/bow/dictionary.rb', line 15

# Reader for @total_terms, the running sum of every count that #add has
# processed.
def total_terms
  @total_terms
end

Instance Method Details

#add(terms) ⇒ Object



29
30
31
32
33
34
35
36
37
38
39
40
# File 'lib/rbbt/bow/dictionary.rb', line 29

# Registers one document's term counts in the dictionary.
#
# +terms+ is an enumerable of (term, count) pairs, typically a Hash of
# term => count. For every pair the term's accumulated count, the global
# term total and the term's document count are updated; the document
# counter is incremented once per call.
#
# Once the dictionary holds more than @term_limit distinct terms, pairs
# whose term is not already known are ignored (the call still counts as a
# document). Filtering is done on a copy, so the caller's +terms+ object is
# never modified.
#
# Returns the updated number of documents.
def add(terms)
  if @term_limit && @terms.length > @term_limit
    # select, unlike delete_if, leaves the caller's collection untouched.
    terms = terms.select { |term, _count| @terms.include?(term) }
  end

  terms.each do |term, count|
    @terms[term] += count
    @total_terms += count
    @docs[term]  += 1
  end
  @num_docs += 1
end

#best(options = {}) ⇒ Object



76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
# File 'lib/rbbt/bow/dictionary.rb', line 76

# Scores the terms whose document frequency (as returned by #df) lies within
# an inclusive range and returns them as a Hash of term => score.
#
# Options:
#   :low   - minimum document frequency, default 0
#   :high  - maximum document frequency, default 1
#   :limit - when given, keep only that many highest-scoring terms
#
# The score of a term is (@terms[term] / @num_docs) * log(1 / df).
#
# The result is built with Hash[pairs] rather than Hash[*pairs.flatten]:
# flattening breaks when a term is itself an Array, and splatting a very
# large list of arguments can exhaust the stack.
def best(options = {})
  high, low, limit = {
    :low  => 0,
    :high => 1,
  }.merge(options).values_at(:high, :low, :limit)

  num_docs = @num_docs.to_f
  in_range = df.select { |_term, freq| freq >= low && freq <= high }
  scored = in_range.map do |term, freq|
    [term, @terms[term].to_f / num_docs * Math.log(1.0 / freq)]
  end

  # Highest score first, then truncate, only when a limit was requested.
  scored = scored.sort { |a, b| b[1] <=> a[1] }.slice(0, limit) if limit

  Hash[scored]
end

#df ⇒ Object



42
43
44
45
46
47
48
# File 'lib/rbbt/bow/dictionary.rb', line 42

# Document frequency of every term: the term's @docs count divided by
# @num_docs, as a Float. Unknown terms read as 0 in the returned Hash.
def df
  @docs.each_with_object(Hash.new(0)) do |(term, doc_count), frequencies|
    frequencies[term] = doc_count.to_f / @num_docs
  end
end

#idf ⇒ Object



58
59
60
61
62
63
64
65
# File 'lib/rbbt/bow/dictionary.rb', line 58

# Inverse document frequency of every term: log(@num_docs / document count).
# Unknown terms read as 0 in the returned Hash.
def idf
  total_docs = @num_docs.to_f
  @docs.each_with_object(Hash.new(0)) do |(term, doc_count), inverse|
    inverse[term] = Math.log(total_docs / doc_count)
  end
end

#tf ⇒ Object



50
51
52
53
54
55
56
# File 'lib/rbbt/bow/dictionary.rb', line 50

# Term frequency of every term: its accumulated count divided by
# @total_terms, as a Float. Unknown terms read as 0 in the returned Hash.
def tf
  frequencies = Hash.new(0)
  @terms.each_pair do |term, occurrences|
    frequencies[term] = occurrences.to_f / @total_terms
  end
  frequencies
end

#tf_idf ⇒ Object



67
68
69
70
71
72
73
74
# File 'lib/rbbt/bow/dictionary.rb', line 67

# TF-IDF weight of every term that has a document count:
# (@terms[term] / @total_terms) * log(@num_docs / document count).
# Unknown terms read as 0 in the returned Hash.
def tf_idf
  total_docs = @num_docs.to_f
  @docs.each_with_object(Hash.new(0)) do |(term, doc_count), scores|
    scores[term] = @terms[term].to_f / @total_terms * Math.log(total_docs / doc_count)
  end
end

#weights(options = {}) ⇒ Object



101
102
103
104
105
106
107
108
109
110
# File 'lib/rbbt/bow/dictionary.rb', line 101

# Returns a Hash mapping each term selected by #best (called with the same
# +options+) to log(@num_docs / @docs[term]).
def weights(options = {})
  selected = best(options).keys
  total_docs = @num_docs.to_f

  selected.each_with_object({}) do |term, result|
    result[term] = Math.log(total_docs / @docs[term])
  end
end