Class: DecisionTree::ID3Tree

Inherits:
Object
  • Object
show all
Defined in:
lib/decisiontree/id3_tree.rb

Instance Method Summary collapse

Constructor Details

#initialize(attributes, data, default, type) ⇒ ID3Tree

Returns a new instance of ID3Tree.



41
42
43
44
# File 'lib/decisiontree/id3_tree.rb', line 41

def initialize(attributes, data, default, type)
  @used, @tree, @type = {}, {}, type
  @data, @attributes, @default = data, attributes, default
end

Instance Method Details

#build_rules(tree = @tree) ⇒ Object



141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# File 'lib/decisiontree/id3_tree.rb', line 141

def build_rules(tree=@tree)
  attr = tree.to_a.first
  cases = attr[1].to_a
  rules = []
  cases.each do |c,child|
    if child.is_a?(Hash) then
      build_rules(child).each do |r|
        r2 = r.clone
        r2.premises.unshift([attr.first, c])
        rules << r2
      end
    else
      rules << Rule.new(@attributes, [[attr.first, c]], child)
    end
  end
  rules
end

#fitness_for(attribute) ⇒ Object



59
60
61
62
63
64
# File 'lib/decisiontree/id3_tree.rb', line 59

def fitness_for(attribute)
  case type(attribute)
    when :discrete; fitness = proc{|a,b,c| id3_discrete(a,b,c)}
    when :continuous; fitness = proc{|a,b,c| id3_continuous(a,b,c)}
  end
end

#graph(filename) ⇒ Object



130
131
132
133
# File 'lib/decisiontree/id3_tree.rb', line 130

def graph(filename)
  dgp = DotGraphPrinter.new(build_tree)
  dgp.write_to_file("#{filename}.png", "png")
end

#id3_continuous(data, attributes, attribute) ⇒ Object

ID3 for binary classification of continuous variables (e.g. healthy / sick based on temperature thresholds)



98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# File 'lib/decisiontree/id3_tree.rb', line 98

def id3_continuous(data, attributes, attribute)
  values, thresholds = data.collect { |d| d[attributes.index(attribute)] }.uniq.sort, []
  return [-1, -1] if values.size == 1
  values.each_index { |i| thresholds.push((values[i]+(values[i+1].nil? ? values[i] : values[i+1])).to_f / 2) }
  thresholds.pop
  #thresholds -= used[attribute] if used.has_key? attribute

  gain = thresholds.collect { |threshold|
    sp = data.partition { |d| d[attributes.index(attribute)] >= threshold }
    pos = (sp[0].size).to_f / data.size
    neg = (sp[1].size).to_f / data.size

    [data.classification.entropy - pos*sp[0].classification.entropy - neg*sp[1].classification.entropy, threshold]
  }.max { |a,b| a[0] <=> b[0] }

  return [-1, -1] if gain.size == 0
  gain
end

#id3_discrete(data, attributes, attribute) ⇒ Object

ID3 for discrete label cases



118
119
120
121
122
123
124
# File 'lib/decisiontree/id3_tree.rb', line 118

def id3_discrete(data, attributes, attribute)
  values = data.collect { |d| d[attributes.index(attribute)] }.uniq.sort
  partitions = values.collect { |val| data.select { |d| d[attributes.index(attribute)] == val } }
  remainder = partitions.collect {|p| (p.size.to_f / data.size) * p.classification.entropy}.inject(0) {|i,s| s+=i }

  [data.classification.entropy - remainder, attributes.index(attribute)]
end

#id3_train(data, attributes, default, used = {}) ⇒ Object



66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# File 'lib/decisiontree/id3_tree.rb', line 66

def id3_train(data, attributes, default, used={})
  return default if data.empty?

  # return classification if all examples have the same classification
  return data.first.last if data.classification.uniq.size == 1

  # Choose best attribute (1. enumerate all attributes / 2. Pick best attribute)
  performance = attributes.collect { |attribute| fitness_for(attribute).call(data, attributes, attribute) }
  max = performance.max { |a,b| a[0] <=> b[0] }
  best = Node.new(attributes[performance.index(max)], max[1], max[0])
  best.threshold = nil if @type == :discrete
  @used.has_key?(best.attribute) ? @used[best.attribute] += [best.threshold] : @used[best.attribute] = [best.threshold]
  tree, l = {best => {}}, ['>=', '<']

  fitness = fitness_for(best.attribute)
  case type(best.attribute)
    when :continuous
      data.partition { |d| d[attributes.index(best.attribute)] >= best.threshold }.each_with_index  { |examples, i|
        tree[best][String.new(l[i])] = id3_train(examples, attributes, (data.classification.mode rescue 0), &fitness)
      }
    when :discrete
      values = data.collect { |d| d[attributes.index(best.attribute)] }.uniq.sort
      partitions = values.collect { |val| data.select { |d| d[attributes.index(best.attribute)] == val } }
      partitions.each_with_index  { |examples, i|
        tree[best][values[i]] = id3_train(examples, attributes-[values[i]], (data.classification.mode rescue 0), &fitness)
      }
    end

  tree
end

#predict(test) ⇒ Object



126
127
128
# File 'lib/decisiontree/id3_tree.rb', line 126

def predict(test)
  descend(@tree, test)
end

#rulesetObject



135
136
137
138
139
# File 'lib/decisiontree/id3_tree.rb', line 135

def ruleset
  rs = Ruleset.new(@attributes, @data, @default, @type)
  rs.rules = build_rules
  rs
end

#train(data = @data, attributes = @attributes, default = @default) ⇒ Object



46
47
48
49
50
51
52
53
# File 'lib/decisiontree/id3_tree.rb', line 46

def train(data=@data, attributes=@attributes, default=@default)
  initialize(attributes, data, default, @type)

  # Remove samples with same attributes leaving most common classification
  data2 = data.inject({}) {|hash, d| hash[d.slice(0..-2)] ||= Hash.new(0); hash[d.slice(0..-2)][d.last] += 1; hash }.map{|key,val| key + [val.sort_by{ |k, v| v }.last.first]}

  @tree = id3_train(data2, attributes, default)
end

#type(attribute) ⇒ Object



55
56
57
# File 'lib/decisiontree/id3_tree.rb', line 55

def type(attribute)
  @type.is_a?(Hash) ? @type[attribute.to_sym] : @type
end