Class: Treat::Workers::Extractors::Keywords::TfIdf

Inherits:
Object
  • Object
show all
Defined in:
lib/treat/workers/extractors/keywords/tf_idf.rb

Overview

Extracts an arbitrary number of keywords from a document in a collection by selecting its N words with the highest TF*IDF score.

Constant Summary collapse

DefaultOptions =

Default options - retrieve 5 keywords.

{ :number => 5 }

Class Method Summary collapse

Class Method Details

.keywords(entity, options = {}) ⇒ Object

Annotate a document with an array containing the N words with the highest TF*IDF in that document.



12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# File 'lib/treat/workers/extractors/keywords/tf_idf.rb', line 12

# Annotate a document with an array containing the N words with the
# highest TF*IDF in that document.
#
# Every word yielded by +entity.each_word+ is also flagged through
# +word.set :keyword, true/false+, depending on whether its string form
# was selected as a keyword.
#
# @param entity [#each_word] the entity whose words are scored; each
#   yielded word must respond to +tf_idf+, +to_s+ and +set+.
# @param options [Hash] merged over DefaultOptions.
# @option options [Integer] :number the maximum number of keywords to
#   return.
# @return [Array<String>] at most +options[:number]+ distinct word
#   strings, ordered from highest to lowest TF*IDF score.
def self.keywords(entity, options = {})
  options = DefaultOptions.merge(options)
  limit = options[:number]

  # Collect one score per word; words whose tf_idf is nil/false are skipped.
  scores = {}
  entity.each_word do |word|
    score = word.tf_idf
    scores[word] ||= score if score
  end

  # Walk the words from highest to lowest score. Distinct word objects may
  # share the same text, so de-duplicate on the string form. Stopping as
  # soon as +limit+ keywords are collected fixes the previous behaviour of
  # returning one keyword more than requested.
  keywords = []
  scores.sort_by { |_word, score| score }.reverse_each do |word, _score|
    break if keywords.size >= limit

    text = word.to_s
    keywords << text unless keywords.include?(text)
  end

  # Flag every word (including those without a score) as keyword or not.
  entity.each_word do |word|
    word.set :keyword, keywords.include?(word.to_s)
  end

  keywords
end