Class: TFIDF

Inherits:
Object
  • Object
show all
Defined in:
lib/tfidf.rb

Constant Summary collapse

@@split_pattern =

Regex pattern of delimiters for splitting text

/[\W]/

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(corpus) ⇒ TFIDF

Arguments

corpus: an array of strings, one string per document

Returns

self

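Example:

  tfidf = TFIDF.new(["This is a document…", "Far, far away…", "The quick brown fox jumps over the lazy dog"])
  tfidf.tf("quick", "2fd4e1c67a2d28fced849ee1bb76e7391b93eb12") #=> 1

(Terms of three characters or fewer, such as "fox", are dropped by TFIDF.should_be_ignored_in_TF? and therefore have a term frequency of 0.)

See examples/demo_tf.rb for more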



28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# File 'lib/tfidf.rb', line 28

# Builds the term/document statistics for a corpus.
#
# @param corpus [Array<String>, String] one string per document; a bare
#   String is treated as a corpus containing that single document
#
# After construction the instance holds:
#   @docs  doc_id  => document text (doc_id comes from TFIDF.hash_func)
#   @terms term_id => term, inserted in sorted term order
#   @dtm   doc_id  => {term => frequency}
#   @tdm   term    => {doc_id => frequency}
#   @idf   term    => TFIDF.idf(number of documents containing it, @cardinality)
def initialize(corpus)
  @idf = {}
  # Cache for the dense matrix; it stays nil until #dense fills it in.
  @dense_matrix = nil

  corpus = [corpus] if corpus.is_a?(String)
  @cardinality = corpus.length

  # NOTE(review): the {} argument presumably becomes the hash's default
  # value, as with Hash.new - confirm that this is intended.
  @docs = ActiveSupport::OrderedHash.new({})
  @dtm = {}
  @tdm = {}
  vocabulary = SortedSet.new

  corpus.each do |doc|
    doc_id = TFIDF.hash_func(doc)
    @docs[doc_id] = doc

    term_freqs = TFIDF.tf_single(doc)
    @dtm[doc_id] = term_freqs

    term_freqs.each do |term, freq|
      vocabulary.add(term)
      # Create the per-term posting hash on first sight of the term.
      (@tdm[term] ||= {})[doc_id] = freq
    end
  end

  @terms = ActiveSupport::OrderedHash.new({})
  vocabulary.each { |term| @terms[TFIDF.hash_func(term)] = term }

  @tdm.each do |term, postings|
    @idf[term] = TFIDF.idf(postings.size, @cardinality)
  end
end

Class Method Details

.from_tf(tf_in_hash) ⇒ Object

TODO: really make this work as intended



77
78
79
80
81
82
83
84
85
86
87
# File 'lib/tfidf.rb', line 77

# TODO: really make this work as intended.
#
# Registers pre-computed term-frequency hashes under freshly generated
# document IDs. Because this is a class method, @dtm, @docs and @terms are
# instance variables of the class object itself, not of any TFIDF instance,
# and the method returns the array produced by +map+ rather than a TFIDF.
#
# @param tf_in_hash [Array<Hash>] one {term => frequency} hash per document
# @return [Array] one entry per input hash (result of adding its terms)
def self.from_tf(tf_in_hash)
  @dtm = ActiveSupport::OrderedHash.new
  @docs = ActiveSupport::OrderedHash.new
  @terms = SortedSet.new
  tf_in_hash.each_with_index.map do |term_freqs, position|
    # hash_func digests a String, so a bare Time object cannot be passed.
    # The position keeps IDs distinct even when the clock has not advanced
    # between two iterations.
    id = TFIDF.hash_func("#{Time.now.to_f}-#{position}")
    @dtm[id] = term_freqs
    @docs[id] = "Place holder"
    term_freqs.keys.map { |term| @terms.add term }
  end
end

.hash_func(obj) ⇒ Object

Hash function used for generating id for documents as well as terms



14
15
16
# File 'lib/tfidf.rb', line 14

# Hash function used to generate IDs for documents as well as terms.
#
# @param obj [String] text to digest
# @return [String] hexadecimal SHA-1 digest of +obj+
def self.hash_func(obj)
  Digest::SHA1.hexdigest(obj)
end

.idf(x, cardinality) ⇒ Object

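Inverse document frequency: log2(cardinality / (x + 1)), where x is the number of documents containing the term and cardinality is the number of documents in the corpus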



227
228
229
# File 'lib/tfidf.rb', line 227

# Inverse document frequency with add-one smoothing in the denominator.
#
# @param x [Numeric] number of documents containing the term
# @param cardinality [Numeric] number of documents in the corpus
# @return [Float] log2(cardinality / (x + 1))
def self.idf(x, cardinality)
  smoothed_doc_count = (x + 1).to_f
  Math.log2(cardinality.to_f / smoothed_doc_count)
end

.mergeObject

TODO: Merge 2 TFIDF objects



222
223
224
# File 'lib/tfidf.rb', line 222

# TODO: Merge 2 TFIDF objects.
# Not implemented yet: takes no arguments and returns nil.
def self.merge
  nil
end

.should_be_ignored_in_TF?(str) ⇒ Boolean

If a string is too short or contains non-alphanumeric characters, dump it

Returns:

  • (Boolean)


232
233
234
235
236
237
238
239
# File 'lib/tfidf.rb', line 232

# Decides whether a token is left out of the term-frequency counts.
#
# @param str [String] candidate term
# @return [Boolean] true when +str+ has three characters or fewer, or
#   contains at least one non-alphanumeric character; false otherwise
def self.should_be_ignored_in_TF?(str)
  return true if str.length <= 3

  !(/[^[[:alnum:]]]/ =~ str).nil?
end

.sparse_to_dense(array_of_ordered_hashes) ⇒ Object



186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
# File 'lib/tfidf.rb', line 186

# Converts sparse {term => frequency} rows into a dense GSL::Matrix.
#
# Rows follow the order of +array_of_ordered_hashes+; columns follow the
# sorted list of all distinct terms found in those hashes.
#
# @param array_of_ordered_hashes [Array<Hash>] one {term => frequency} hash per row
# @return [GSL::Matrix] matrix with one row per hash and one column per distinct term
def self.sparse_to_dense(array_of_ordered_hashes)
  # Sorted, de-duplicated vocabulary; its order fixes the column order.
  terms = array_of_ordered_hashes.flat_map(&:keys).uniq.sort

  # Resolve every term's column once instead of scanning the term list
  # again for each cell that is written.
  column_of = {}
  terms.each_with_index { |term, col| column_of[term] = col }

  dense_matrix = GSL::Matrix.alloc(array_of_ordered_hashes.length, terms.length)
  array_of_ordered_hashes.each_with_index do |term_freqs, row|
    term_freqs.each do |term, freq|
      dense_matrix.set([row, column_of[term]], freq)
    end
  end
  dense_matrix
end

.tf_single(str) ⇒ Object

Build a TF vector out of a single document(String)

Argument:

String valued document

Returns:

A hash as in {"term" => frequency, ...}


94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# File 'lib/tfidf.rb', line 94

# Builds a term-frequency vector from a single document.
#
# The text is split on @@split_pattern; each token is stemmed and then
# lower-cased, and tokens rejected by TFIDF.should_be_ignored_in_TF? are
# skipped. When a block is given it is yielded the list of counted terms.
#
# NOTE(review): stemming runs before lower-casing - confirm that the
# stemmer in use treats upper-case input the same as lower-case input.
#
# @param str [String, nil] document text
# @return [Hash, nil] {"term" => frequency, ...}, or nil when +str+ is nil
def self.tf_single(str)
  return nil if str.nil?

  counts = {}
  str.split(@@split_pattern).each do |token|
    term = token.stem.downcase
    next if TFIDF.should_be_ignored_in_TF?(term)

    counts[term] = counts.fetch(term, 0) + 1
  end

  yield counts.keys if block_given?
  counts
end

Instance Method Details

#cardinalityObject

Cardinality, or number of documents in corpus



116
117
118
# File 'lib/tfidf.rb', line 116

# Cardinality, i.e. the number of documents in the corpus.
#
# @return [Integer]
def cardinality
  @cardinality
end

#denseObject

Access, or calculate if not present, a dense DTM as a GSL::Matrix. Each row corresponds to a document, each column to a term. Use TFIDF#terms.values and TFIDF#docs.values to find the column/row index of a specific term/document.



207
208
209
210
211
212
213
214
215
216
217
218
219
# File 'lib/tfidf.rb', line 207

# Returns the dense document-term matrix, computing and caching it on the
# first call.
#
# Row i corresponds to the i-th key of @docs; column j corresponds to the
# j-th value of @terms.
#
# @return [GSL::Matrix] matrix of size @docs.size x @terms.size
def dense
  return @dense_matrix unless @dense_matrix.nil?

  # Map each term to its column once, rather than rebuilding and scanning
  # @terms.values for every cell. ||= keeps the first position of a term,
  # which is what Array#index would have returned.
  column_of = {}
  @terms.values.each_with_index { |term, col| column_of[term] ||= col }

  matrix = GSL::Matrix.alloc(@docs.size, @terms.size)
  @docs.keys.each_with_index do |doc_id, row|
    @dtm[doc_id].each do |term, freq|
      matrix.set([row, column_of[term]], freq)
    end
  end
  @dense_matrix = matrix
end

#docsObject

Documents, in a hash, as in: {doc_id => "this is a document…", …}



121
122
123
# File 'lib/tfidf.rb', line 121

# Documents, keyed by document ID.
#
# @return [Hash] {doc_id => document text, ...}
def docs
  @docs
end

#dtmObject

Document Term Matrix, in sparse List of lists(LIL)



131
132
133
# File 'lib/tfidf.rb', line 131

# Document-term matrix in sparse list-of-lists form.
#
# @return [Hash] {doc_id => {term => frequency}, ...}
def dtm
  @dtm
end

#idf(term = nil) ⇒ Object

Arguments

(Optional)term: Term

Returns

IDF of Term


173
174
175
176
177
178
179
# File 'lib/tfidf.rb', line 173

# Inverse document frequency lookup.
#
# @param term [String, nil] term to look up
# @return [Hash, Numeric] the whole {term => idf} table when +term+ is
#   omitted; otherwise the stored IDF of +term+, or 0 when it is unknown
def idf(term = nil)
  return @idf if term.nil?

  stored = @idf[term]
  stored.nil? ? 0 : stored
end

#tdmObject

Term Document Matrix, in sparse List of lists(LIL)



136
137
138
# File 'lib/tfidf.rb', line 136

# Term-document matrix in sparse list-of-lists form.
#
# @return [Hash] {term => {doc_id => frequency}, ...}
def tdm
  @tdm
end

#termsObject

Terms, stored in a similar way as documents



126
127
128
# File 'lib/tfidf.rb', line 126

# Terms, keyed by term ID in the same way documents are keyed by document ID.
#
# @return [Hash] {term_id => term, ...}
def terms
  @terms
end

#tf(term = nil, doc = nil) ⇒ Object

Arguments

t: Term
d: Document ID

Returns

tf(t,d)

Alternatively:

Arguments

d: Document ID
t: nil(or unspecified)

Returns

Hash which contains non-zero tf of all terms

Yet another alternative:

Arguments

d: nil
t: nil

Returns

Everything


159
160
161
162
163
164
165
166
167
# File 'lib/tfidf.rb', line 159

# Term-frequency lookup.
#
# @param term [String, nil] term to look up
# @param doc [String, nil] document ID (a key of the document-term matrix)
# @return [Hash, Integer, nil]
#   * no document given -> the whole document-term matrix
#   * document only     -> hash of the non-zero term frequencies of that
#                          document (nil when the document ID is unknown)
#   * term and document -> tf(term, doc), 0 when either is unknown
def tf(term = nil, doc = nil)
  # Without a document ID there is nothing to narrow down to, so the whole
  # matrix is returned whether or not a term was supplied.
  return @dtm if doc.nil?

  # Document only: this branch used to be unreachable because the first
  # check also fired whenever the term alone was nil.
  return @dtm[doc] if term.nil?

  doc_freqs = @dtm[doc]
  # An unknown document contains no terms at all.
  return 0 if doc_freqs.nil?

  freq = doc_freqs[term]
  freq.nil? ? 0 : freq
end

#tfidf(term, doc) ⇒ Object

tf*idf(t,d)



182
183
184
# File 'lib/tfidf.rb', line 182

# tf*idf(t, d): the term's frequency in the document multiplied by the
# term's inverse document frequency.
#
# @param term [String] term
# @param doc [String] document ID
# @return [Numeric] tf(term, doc) * idf(term)
def tfidf(term, doc)
  term_frequency = tf(term, doc)
  term_frequency * idf(term)
end