Class: Treat::Workers::Extractors::TfIdf::Native

Inherits:
Object
  • Object
show all
Defined in:
lib/treat/workers/extractors/tf_idf/native.rb

Overview

Calculates the TF*IDF score of words.

Constant Summary collapse

# Option values used by tf_idf for any key the caller does not supply
# (tf_idf merges the caller's options over this hash).
DefaultOptions = {
  tf: :natural,
  idf: :logarithm,
  remove_common_words: true,
  precision: 4
}
# Weighting functions, grouped by the option (:tf or :idf) that selects them.
# :tf lambdas take the raw term frequency; :idf lambdas take the number of
# documents (n) and the document frequency (df).
Algorithms =
{
  :tf => {
    :natural => lambda { |tf| tf },
    :logarithm => lambda { |tf| Math.log(1 + tf) },
    :sqrt => lambda { |tf| Math.sqrt(tf) }
  },
  :idf => {
    # Force float division: with Integer arguments n/(1 + df) would
    # truncate before the logarithm is taken (10/4 => 2, not 2.5).
    :logarithm => lambda { |n,df| Math.log(n.to_f/(1 + df)) },
    # Constant weight; both arguments are accepted but ignored.
    :none => lambda { |n,df| 1 }
  }
}
@@n =

Optimization cache for tf idf: number of documents in the collection (n), keyed by collection id.

{}
@@df =

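Optimization cache: number of documents in a collection that contain a given value (document frequency), keyed by collection id, then by value.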

{}
@@f =

Optimization cache: number of times a value appears in a given document (term count), keyed by document id, then by value.

{}
@@wc =

Optimization cache: number of words in a given document (word count), keyed by document id.

{}
@@cw =

Optimization cache: common (stop) words to filter out, keyed by language.

{}

Class Method Summary collapse

Class Method Details

.tf_idf(entity, options = {}) ⇒ Object

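Returns the TF*IDF score of the entity's value within its parent collection. Common (stop) words and values of two characters or fewer score 0.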



26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# File 'lib/treat/workers/extractors/tf_idf/native.rb', line 26

# Computes the TF*IDF score of an entity's value within the collection
# that contains the entity's document.
#
# entity  - object providing language, value, parent_collection and
#           parent_document.
# options - Hash merged over DefaultOptions:
#           :tf   - key of Algorithms[:tf], or any object responding to
#                   call(tf).
#           :idf  - key of Algorithms[:idf], or any object responding to
#                   call(n, df).
#           :remove_common_words  - when truthy, stop words of the
#                   entity's language score 0.
#           :normalize_word_count - when truthy, the term frequency is
#                   divided by the document's word count.
#           :precision - number of digits the result is rounded to.
#
# Returns 0 for filtered stop words and for values of two characters or
# fewer; otherwise the absolute value of tf * idf, rounded.
#
# Raises Treat::Exception when a named algorithm does not exist, or when
# the entity has no parent collection or parent document.
def self.tf_idf(entity, options={})
  # Merge first so that :remove_common_words can actually be honoured.
  options = DefaultOptions.merge(options)
  if options[:remove_common_words]
    l = Treat.languages[entity.language]
    if l.respond_to?(:stop_words)
      # Memoize the stop word list per language instead of re-reading it
      # on every call.
      @@cw[entity.language] ||= l.stop_words
      # NOTE(review): this match uses the raw value, not the downcased
      # one used for counting below - confirm the casing of stop word lists.
      return 0 if @@cw[entity.language].include?(entity.value)
    end
  end
  return 0 if entity.value.length <= 2

  # Resolve the :tf and :idf options to callables. Only options that have
  # an entry in Algorithms are looked up, and a caller-supplied callable
  # is used as is.
  Algorithms.each do |opt, choices|
    choice = options[opt]
    next if choice.respond_to?(:call)
    unless choices[choice]
      raise Treat::Exception,
      "The specified algorithm '#{choice}' "+
      "to calculate #{opt} does not exist."
    end
    options[opt] = choices[choice]
  end

  collection = entity.parent_collection
  unless collection
    raise Treat::Exception, "Cannot get the TF*IDF scores " +
    "for a document that is not in a collection."
  end
  document = entity.parent_document
  unless document
    raise Treat::Exception,
    "Tf*Idf requires a collection with documents."
  end

  # Every cache below is keyed by the downcased value; reads and writes
  # must use the same key, otherwise capitalized values always score 0.
  term = entity.value.downcase
  @@n[collection.id] = collection.document_count if @@n[collection.id].nil?
  @@df[collection.id] ||= {}
  if @@df[collection.id][term].nil?
    df = 0
    collection.each_document do |doc|
      @@f[doc.id] ||= {}
      if @@f[doc.id][term].nil?
        @@f[doc.id][term] = doc.frequency_of(term)
      end
      df += 1 if @@f[doc.id][term] > 0
    end
    @@df[collection.id][term] = df
  end

  # The document frequency may have been cached before this document was
  # counted; fill in its term count if it is missing.
  @@f[document.id] ||= {}
  if @@f[document.id][term].nil?
    @@f[document.id][term] = document.frequency_of(term)
  end

  f = @@f[document.id][term].to_f
  df = @@df[collection.id][term].to_f
  tf = options[:tf].call(f).to_f
  if options[:normalize_word_count]
    @@wc[document.id] ||= document.word_count
    tf /= @@wc[document.id]
  end
  n = @@n[collection.id].to_f
  idf = options[:idf].call(n, df)
  tf_idf = tf * idf
  tf_idf.abs.round(options[:precision])
end