Class: Company::Mapping::TFIDF

Inherits:
Object
  • Object
show all
Defined in:
lib/company/mapping/tfidf/tfidf.rb

Overview

The TFIDF class implements the term frequency–inverse document frequency (tf-idf) statistic, a numerical statistic intended to reflect how important a word is to a document in a collection or corpus.

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(corpus) ⇒ TFIDF

Returns a new instance of TFIDF.



9
10
11
# File 'lib/company/mapping/tfidf/tfidf.rb', line 9

# Stores the corpus that the tf-idf weights will be computed over.
# No weights are computed here; that happens in #calculate (or lazily in
# #similarity).
#
# @param corpus the document collection; #calculate iterates it with #each
#   and reads #id and #contents from every element
def initialize(corpus)
  @corpus = corpus
end

Instance Attribute Details

#idf ⇒ Object

Returns the value of attribute idf.



7
8
9
# File 'lib/company/mapping/tfidf/tfidf.rb', line 7

# Returns the object held in @idf.
# #calculate assigns an InverseDocumentFrequency built from the corpus when
# @idf is not already set, so this is nil before the first #calculate call
# unless it was assigned some other way.
def idf
  @idf
end

#tf ⇒ Object

Returns the value of attribute tf.



7
8
9
# File 'lib/company/mapping/tfidf/tfidf.rb', line 7

# Returns the object held in @tf.
# #calculate assigns a NormalizedTermFrequency (with a BasicTokenizer) when
# @tf is not already set, so this is nil before the first #calculate call
# unless it was assigned some other way.
def tf
  @tf
end

Instance Method Details

#calculate ⇒ Object

Calculates the tf-idf weights in the given corpus



14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
# File 'lib/company/mapping/tfidf/tfidf.rb', line 14

# Builds the tf-idf table for every document in the corpus, replacing any
# previously stored table.
#
# The idf and tf calculators are created on first use and reused afterwards;
# the idf weights themselves are recomputed on every call.
#
# @return [Hash] document id => { term => tf * idf weight }
def calculate
  @tfidf = {}

  @idf ||= InverseDocumentFrequency.new(@corpus)
  @tf ||= NormalizedTermFrequency.new(BasicTokenizer.new)
  @idf_weights = @idf.calculate

  @corpus.each do |document|
    weights = {}
    @tf.calculate(document.contents).each do |term, frequency|
      weights[term] = frequency * @idf_weights[term]
    end
    @tfidf[document.id] = weights
  end

  @tfidf
end

#calculate_tfidf_weights_of_new_document(new_doc) ⇒ Object

Calculates the tf-idf weights of a new incoming document without adding the document to the corpus or re-calculating the tf-idf weights for the entire corpus.



34
35
36
37
38
39
40
41
42
# File 'lib/company/mapping/tfidf/tfidf.rb', line 34

# Computes tf-idf weights for a document that is not part of the corpus and
# stores them under the document's id next to the corpus weights. The idf
# weights already computed for the corpus are reused, not recomputed.
#
# @param new_doc [#id, #contents] the document to weigh
# @return [Hash] the whole tf-idf table (document id => { term => weight }),
#   now including an entry for new_doc
def calculate_tfidf_weights_of_new_document(new_doc)
  # @tf, @idf, @idf_weights and @tfidf are only populated by #calculate, so
  # run it lazily (the same guard #similarity uses) instead of failing on nil
  # when this is the first method called on the instance.
  @tfidf ||= calculate

  term_frequencies = @tf.calculate(new_doc.contents)

  @tfidf[new_doc.id] = term_frequencies.each_with_object({}) do |(term, tf), tfidf_weights|
    # A term with no corpus idf weight falls back to @idf.maxIDF
    # (presumably the largest idf seen in the corpus -- confirm in
    # InverseDocumentFrequency).
    tfidf_weights[term] = tf * (@idf_weights[term] || @idf.maxIDF)
  end
  @tfidf
end

#similarity(doc1_id, doc2_id) ⇒ Object

Calculates the tf-idf similarity between two given documents, i.e. the cosine similarity of their tf*idf weight vectors.



46
47
48
49
# File 'lib/company/mapping/tfidf/tfidf.rb', line 46

# Compares two documents by handing their tf-idf weight hashes to
# CosineSimilarity. The tf-idf table is computed on first use if it does not
# exist yet.
#
# @param doc1_id id of the first document
# @param doc2_id id of the second document
# @return whatever CosineSimilarity#calculate returns for the two weight
#   hashes (an id missing from the table is passed through as nil)
def similarity(doc1_id, doc2_id)
  weights_by_doc = (@tfidf ||= calculate)
  first_vector = weights_by_doc[doc1_id]
  second_vector = weights_by_doc[doc2_id]

  CosineSimilarity.new.calculate(first_vector, second_vector)
end