Class: Tf_Idf_CSV

Inherits:
Object
  • Object
show all
Defined in:
lib/tf-idf_csv.rb

Overview

This class expects CSV input with one row per document: the first cell should be a document identifier, and each subsequent cell contains one term. TF-IDF is returned based on the number of times a term appears in each document, relative to the total number of documents it appears in.

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Tf_Idf_CSV

Returns a new instance of Tf_Idf_CSV.



10
11
12
13
14
15
# File 'lib/tf-idf_csv.rb', line 10

# Sets up empty state for a new instance.
#
# All four instance variables start out empty or zero; the two plain hashes
# return nil for unknown keys, while the per-term counter returns 0.
def initialize
  @term_freq_per_doc = {}
  # Counter hash: reading a key that has not been stored yet yields 0.
  @doc_count_per_term = Hash.new(0)
  @total_number_of_docs = 0
  @tf_idf = Hash.new
end

Instance Method Details

#add_csv(csv) ⇒ Object



17
18
19
20
21
22
23
24
# File 'lib/tf-idf_csv.rb', line 17

# Registers every row of +csv+ as a document, then recomputes TF-IDF.
#
# For each row, the first cell is handed to add_document as the document
# name and all remaining cells as its terms. calculate_tf_idf is invoked
# once after the last row, and its result is this method's return value.
#
# @param csv [#each] a collection of rows; each row must support [] with an
#   integer index and with a range
def add_csv(csv)
  csv.each { |cells| add_document(cells[0], cells[1..-1]) }
  calculate_tf_idf
end

#write(csv_file_name, options = {}) ⇒ Object

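Save the results as CSV. The header row is "Term, Doc1, Doc2, Doc3…" and each following row holds one term and its score per document, with blank cells for missing or zero scores, e.g. "Eggs, 0.04535,,0.02".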



29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# File 'lib/tf-idf_csv.rb', line 29

# Writes the TF-IDF table held in @tf_idf to a CSV file.
#
# The header row is "term" followed by the entries returned by +docs+
# (defined elsewhere in the class; presumably the document names). Each
# following row holds one term and, per document, its score formatted with
# a fixed number of decimal places. A cell is left blank when the term has
# no score for that document or when the score prints as zero at the chosen
# precision.
#
# @param csv_file_name [String] path of the CSV file to create or overwrite
# @param options [Hash] :decimal_places => digits after the decimal point
#   (defaults to 20)
def write(csv_file_name, options = {})
  decimal_places = options[:decimal_places] || 20
  number_format = "%.#{decimal_places}f"
  # Matches any formatted zero: "0.000", plain "0" (produced when
  # decimal_places is 0) and negative zero such as "-0.00".
  zero_pattern = /\A-?0(?:\.0+)?\z/
  # Look the document list up once instead of once per term.
  doc_names = docs

  CSV.open(csv_file_name, "w") do |f|
    f << ["term", doc_names].flatten
    @tf_idf.each do |term, values|
      tmp_row = [term]
      doc_names.each do |doc|
        value = values[doc] ? (number_format % values[doc]) : nil
        value = nil if value =~ zero_pattern
        tmp_row << value
      end
      f << tmp_row
    end
  end
end