Class: Treat::Workers::Extractors::Similarity::TfIdf

Inherits:
Object
  • Object
show all
Defined in:
lib/treat/workers/extractors/similarity/tf_idf.rb

Overview

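Calculates the similarity between documents in a collection using TF*IDF term weighting. Note: the worker is currently disabled — `.similarity` raises 'Not currently implemented.' as soon as it is called.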

Class Method Summary collapse

Class Method Details

.similarity(entity, options = {}) ⇒ Object



6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# File 'lib/treat/workers/extractors/similarity/tf_idf.rb', line 6

# TF*IDF similarity worker (currently disabled).
#
# The method raises immediately, so everything after the first statement
# is unreachable; it is kept as a reference implementation for when the
# worker is re-enabled.
#
# Reference implementation, as written below:
# * validates that options[:to] is present and has type :document;
# * validates that both options[:to] and +entity+ have a parent collection;
# * for every document yielded by entity.each_document, counts the
#   down-cased word values and injects those counts (and their total)
#   into a TfIdfSimilarity::Document, which is added to a
#   TfIdfSimilarity::Collection;
# * prints the collection's similarity matrix to stdout.
#
# @param entity [Object] must respond to +parent_collection+ and
#   +each_document+ (presumably a Treat collection/document -- confirm).
# @param options [Hash] :to is the document to compare against.
# @return [nil] if re-enabled as written, the matrix is only printed
#   (the last expression is +puts+), not returned.
# @raise [RuntimeError] always, with 'Not currently implemented.'
# @raise [Treat::Exception] (reference implementation only) when :to is
#   missing / not a document, or when either side has no parent collection.
def self.similarity(entity, options={})

  # Deliberately disabled: nothing below this line runs.
  raise 'Not currently implemented.'

  target = options[:to]

  unless target && target.type == :document
    raise Treat::Exception, 'Must supply ' +
    'a document to compare to using ' +
    'the option :to for this worker.'
  end

  unless target.parent_collection &&
         entity.parent_collection
    raise Treat::Exception, 'The TF*IDF ' +
    'similarity algorithm can only be applied ' +
    'to documents that are inside collections.' 
  end

  coll = TfIdfSimilarity::Collection.new

  entity.each_document do |doc|
    tdoc = TfIdfSimilarity::Document.new(doc.to_s)
    # Float default so counts are floats from the first increment.
    term_counts = Hash.new(0.0)
    doc.each_word do |word|
      term_counts[word.value.downcase] += 1.0
    end
    # Seeded with 0.0 so a document without words gets size 0.0, not nil.
    size = term_counts.values.reduce(0.0, :+)
    # Override the counts the gem computed from doc.to_s with counts
    # derived from the already-tokenized words of the document.
    tdoc.instance_eval do
      @term_counts, @size = term_counts, size
    end
    coll << tdoc
  end
  # NOTE(review): debugging output; the worker probably should return a
  # similarity value instead of printing the matrix -- confirm the contract.
  puts coll.similarity_matrix.inspect
end