Class: Baobab::Dataset

Inherits:
Array
  • Object
show all
Defined in:
lib/baobab/dataset.rb

Overview

Represents a dataset or a subset of one. It is an array of hashes in which every hash contains the same keys.

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(data) ⇒ Dataset

Receives an array of hashes. All hashes must contain the same keys.



8
9
10
11
12
# File 'lib/baobab/dataset.rb', line 8

# Fills this dataset with the rows of +data+, keeping their order.
# No check of the rows' keys is performed here.
#
# @param data [#each] the rows to store; the class is documented as
#   holding hashes that all share the same keys
def initialize(data)
    data.each { |record| push(record) }
end

Class Method Details

.from_json(filename) ⇒ Object



14
15
16
17
# File 'lib/baobab/dataset.rb', line 14

# Builds a dataset from a JSON file.
#
# The whole file is read into memory, parsed as JSON, and the parsed
# value is handed to +new+.
#
# @param filename [String] path of the JSON file to load
# @return the object built by +new+ from the parsed content
def self.from_json(filename)
    rows = JSON.parse(File.read(filename))
    new(rows)
end

Instance Method Details

#attribute_names(class_var) ⇒ Object



19
20
21
# File 'lib/baobab/dataset.rb', line 19

# Lists every column name except the class variable.
#
# @param class_var the column name to leave out (compared with ==)
# @return [Array] the remaining column names, in their original order
def attribute_names(class_var)
    columns = column_names
    columns.reject { |column| column == class_var }
end

#column_names ⇒ Object

Returns an array of the attribute names in the dataset. Careful: it’s empty on an empty set.



25
26
27
# File 'lib/baobab/dataset.rb', line 25

# Returns the attribute names of the dataset, taken from the keys of its
# first row.
#
# An empty dataset has no first row to read, so an empty array is
# returned instead of failing on a missing row.
#
# @return [Array] the keys of the first row, or [] when the dataset is empty
def column_names
    return [] if empty?

    first.keys
end

#column_values(attribute) ⇒ Object

Returns an array of the values of an attribute in the dataset. Careful: it’s empty on an empty set.



31
32
33
# File 'lib/baobab/dataset.rb', line 31

# Collects the distinct values that +attribute+ takes across the rows.
# An empty dataset yields an empty array.
#
# @param attribute the key to look up in every row
# @return [Array] each value once, in order of first appearance
def column_values(attribute)
    values = map { |record| record[attribute] }
    values.uniq
end

#entropy(class_var) ⇒ Object



46
47
48
49
50
51
52
# File 'lib/baobab/dataset.rb', line 46

# Computes the entropy of +class_var+ over this dataset.
#
# The probability of each distinct value of +class_var+ is evaluated on
# this dataset, and all of them are passed to Shannon::entropy.
#
# @param class_var the column whose value distribution is measured
# @return whatever Shannon::entropy returns for those probabilities
def entropy(class_var)
    probabilities = column_values(class_var).map { |value| probability(class_var, value) }
    Shannon.entropy(*probabilities)
end

#probability(var, val) ⇒ Object

Evaluates the probability that var equals val in this dataset. Can also be used for subsets.



56
57
58
59
60
61
62
# File 'lib/baobab/dataset.rb', line 56

# Evaluates the fraction of rows in which +var+ equals +val+.
#
# @param var the key to look up in every row
# @param val the value to compare against (with ==)
# @return [Float, Integer] the matching fraction as a Float, or the
#   Integer 0 when the dataset is empty (avoids dividing by zero)
def probability(var, val)
    return 0 if empty?

    matching = count { |record| record[var] == val }
    matching.fdiv(size)
end

#subset(conditions) ⇒ Object

Gets a subset with given conditions. Keys must be of the same type as in the dataset (be careful with symbols).



37
38
39
40
41
42
43
44
# File 'lib/baobab/dataset.rb', line 37

# Builds a new Dataset holding only the rows that satisfy every condition.
#
# Keys must be of the same type as in the dataset (a symbol key does not
# match a string key). With no conditions, every row is kept.
#
# @param conditions [Hash] maps a key to the value a row must hold for it
# @return [Dataset] the matching rows, in their original order
def subset(conditions)
    matching = select do |record|
        conditions.all? { |(var, val)| record[var] == val }
    end
    Dataset.new(matching)
end

#validate ⇒ Object



64
65
66
67
68
69
70
71
72
73
74
# File 'lib/baobab/dataset.rb', line 64

# Checks that the dataset is non-empty and that every row has the same
# keys as the first row.
#
# Keys are compared as a set: two rows holding the same keys in a
# different insertion order are considered consistent.
#
# @raise [RuntimeError] 'Dataset is empty' when there are no rows
# @raise [RuntimeError] 'Dataset is inconsistent' when a row's keys
#   differ from those of the first row
# @return [nil]
def validate
    raise 'Dataset is empty' if empty?

    expected = first.keys
    each do |row|
        keys = row.keys
        # Hash keys are unique, so equal size plus an empty difference
        # means both rows hold exactly the same keys, whatever their order.
        same_keys = keys.size == expected.size && (keys - expected).empty?
        raise 'Dataset is inconsistent' unless same_keys
    end
    nil
end