Class: Basset::ClassificationEvaluator

Inherits:
Object
  • Object
show all
Defined in:
lib/basset/classification_evaluator.rb

Overview

Class for running evaluation tests on a classifier and a document representation. Takes the training_documents, which should be an array of objects that can return a vector of features (like Basset::Document). The args hash has two optional keys, shown here with their defaults: :output => true, :folding_amount => 10, where output controls whether progress and results are printed to the console and folding_amount is the amount of cross validation.

Instance Method Summary collapse

Constructor Details

#initialize(training_documents, args = {}) ⇒ ClassificationEvaluator

Returns a new instance of ClassificationEvaluator.



7
8
9
10
11
12
13
# File 'lib/basset/classification_evaluator.rb', line 7

# Sets up an evaluator over training_documents.
#
# training_documents - handed unchanged to split_documents_into_cross_validation_sets.
# args               - optional settings hash; it is only read, never modified:
#   :output         - stored as @output_to_console; defaults to true when the
#                     key is absent (an explicit false or nil is kept as given).
#   :folding_amount - cross validation folding amount; defaults to 10 when the
#                     key is absent or its value is nil/false.
def initialize(training_documents, args = {})
  # fetch supplies the default without writing it back into the caller's hash
  # (writing it back would also blow up on a frozen options hash).
  @output_to_console = args.fetch(:output, true)
  @folding_amount = args[:folding_amount] || 10
  @total_documents_trained = 0
  @document_sets = split_documents_into_cross_validation_sets(training_documents, @folding_amount)
end

Instance Method Details

#compare_against_basset_classifiers(classifiers, chi_value = 0, &block) ⇒ Object

Classifiers should be an array of basset classifier objects to run cross validation tests on. chi_value will be passed on to the feature_selector. The default value of 0 will select all features. The block will get called and passed in each training_set and test_set from the document_sets. It should run some external classifier and return the number of documents from the test_set that were correctly classified.



23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# File 'lib/basset/classification_evaluator.rb', line 23

# Runs every classifier in classifiers, plus an external classifier supplied
# as a block, over each entry of @document_sets and tallies the outcomes.
#
# classifiers - array of classifier objects; tallies are keyed by each
#               object's class.
# chi_value   - passed through to create_feature_extractor (default 0).
# block       - called with (training_set, test_set) for each document set;
#               its return value is added to the "External" :correct count.
#
# Returns a hash mapping "External" and each classifier class to a hash with
# :correct, :total and :time entries. When @output_to_console is set, progress
# and a final summary are printed along the way.
def compare_against_basset_classifiers(classifiers, chi_value = 0, &block)
  # one tally per contender; "External" is inserted first so it is reported first
  results = {"External" => {:correct => 0, :total => 0, :time => 0.0}}
  classifiers.each do |classifier|
    results[classifier.class] = {:correct => 0, :total => 0, :time => 0.0}
  end

  @document_sets.each_with_index do |document_set, iteration|
    if @output_to_console
      puts "iteration #{iteration + 1} of #{@document_sets.size}"
    end

    # the feature extractor is built once per document set and shared by all classifiers
    feature_extractor = nil
    feature_extractor_time = time_execution do
      feature_extractor = create_feature_extractor(document_set[:training_set], chi_value)
    end
    number_of_test_documents = document_set[:test_set].size

    # built-in classifiers: each one is charged its own run time plus the extraction time
    classifiers.each do |classifier|
      label = classifier.class
      correct = 0
      elapsed = time_execution do
        correct = test_run(document_set[:training_set], document_set[:test_set], feature_extractor, classifier)
      end
      elapsed += feature_extractor_time

      tally = results[label]
      tally[:time] += elapsed
      tally[:correct] += correct
      tally[:total] += number_of_test_documents
      output_results(correct, number_of_test_documents, elapsed, label) if @output_to_console
    end

    # external classifier: timed on its own, no feature extraction time added
    external_correct = 0
    external_time = time_execution do
      external_correct = block.call(document_set[:training_set], document_set[:test_set])
    end
    external_tally = results["External"]
    external_tally[:time] += external_time
    external_tally[:correct] += external_correct
    external_tally[:total] += number_of_test_documents
    output_results(external_correct, number_of_test_documents, external_time, "External") if @output_to_console
  end

  if @output_to_console
    puts "\nFinal Results\n---------------------------------------------------------------------------------------"
    puts "Trained on #{@total_documents_trained} documents on #{@folding_amount} cross validation runs."
    results.each_pair do |label, numbers|
      output_results(numbers[:correct], numbers[:total], numbers[:time], label)
    end
  end

  results
end

#test_with_basset_classifiers(classifiers) ⇒ Object

Classifiers should be an array of basset classifier objects to run cross validation tests on



16
17
# File 'lib/basset/classification_evaluator.rb', line 16

# Meant to take an array of basset classifier objects to run cross validation
# tests on.
# NOTE(review): the body is empty, so calling this currently does nothing with
# classifiers and returns nil.
def test_with_basset_classifiers(classifiers)
end

#test_with_cross_validation(training_document_names, folding_amount = 10) ⇒ Object

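Splits the documents of each class into folding_amount blocks (10 by default). It will then feature select and train on 9 of the blocks and test on the remaining one. It iterates 10 times, using each block in turn as the test set and the others as the training set, and combines the results.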



66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# File 'lib/basset/classification_evaluator.rb', line 66

# Runs folding_amount rounds of cross validation over training_document_names
# and returns [total_correct, total_documents] summed over all rounds.
#
# training_document_names - a collection with one array of document names per
#                           class; each array is randomized in place first.
# folding_amount          - number of rounds; each class contributes
#                           size / folding_amount names to a round's test set.
#
# Each round calls test_run(training_set, test_set) and expects a
# [correct, total] pair back. Progress and the final totals are printed only
# when @output_to_console is set.
def test_with_cross_validation(training_document_names, folding_amount = 10)
  # shuffle within each class so the folds are not taken in stored order
  training_document_names.each(&:randomize!)

  total_correct = 0
  total_documents = 0

  folding_amount.times do |iteration|
    puts "iteration #{iteration + 1} of #{folding_amount}" if @output_to_console

    # take the iteration-th slice from every class, so each class is
    # represented in the test set in proportion to its size
    held_out_by_class = training_document_names.map do |class_document_names|
      fold_size = (class_document_names.size / folding_amount).to_i
      class_document_names.slice(iteration * fold_size, fold_size)
    end

    # everything not held out for a class goes into the training set
    training_set = training_document_names.zip(held_out_by_class).flat_map do |class_document_names, held_out|
      class_document_names - held_out
    end
    test_set = held_out_by_class.flatten

    correct, total = test_run(training_set, test_set)
    total_correct += correct
    total_documents += total
  end

  output_results(total_correct, total_documents) if @output_to_console
  [total_correct, total_documents]
end