Class: DecisionTree::ID3Tree

Inherits:
Object
  • Object
show all
Defined in:
lib/decisiontree/id3_tree.rb

Instance Method Summary collapse

Constructor Details

#initialize(attributes, data, default, type) ⇒ ID3Tree

Returns a new instance of ID3Tree



39
40
41
42
# File 'lib/decisiontree/id3_tree.rb', line 39

def initialize(attributes, data, default, type)
  @used, @tree, @type = {}, {}, type
  @data, @attributes, @default = data, attributes, default
end

Instance Method Details

#build_rules(tree = @tree) ⇒ Object



146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
# File 'lib/decisiontree/id3_tree.rb', line 146

def build_rules(tree=@tree)
  attr = tree.to_a.first
  cases = attr[1].to_a
  rules = []
  cases.each do |c,child|
    if child.is_a?(Hash) then
      build_rules(child).each do |r|
        r2 = r.clone
        r2.premises.unshift([attr.first, c])
        rules << r2
      end
    else
      rules << Rule.new(@attributes, [[attr.first, c]], child)
    end
  end
  rules
end

#fitness_for(attribute) ⇒ Object



58
59
60
61
62
63
# File 'lib/decisiontree/id3_tree.rb', line 58

def fitness_for(attribute)
  case type(attribute)
    when :discrete; fitness = proc{|a,b,c| id3_discrete(a,b,c)}
    when :continuous; fitness = proc{|a,b,c| id3_continuous(a,b,c)}
  end
end

#graph(filename, file_type = "png") ⇒ Object



134
135
136
137
138
# File 'lib/decisiontree/id3_tree.rb', line 134

def graph(filename, file_type = "png")
  require 'graphr'
  dgp = DotGraphPrinter.new(build_tree)
  dgp.write_to_file("#{filename}.#{file_type}", file_type)
end

#id3_continuous(data, attributes, attribute) ⇒ Object

ID3 for binary classification of continuous variables (e.g. healthy / sick based on temperature thresholds)



102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# File 'lib/decisiontree/id3_tree.rb', line 102

def id3_continuous(data, attributes, attribute)
  values, thresholds = data.collect { |d| d[attributes.index(attribute)] }.uniq.sort, []
  return [-1, -1] if values.size == 1
  values.each_index { |i| thresholds.push((values[i]+(values[i+1].nil? ? values[i] : values[i+1])).to_f / 2) }
  thresholds.pop
  #thresholds -= used[attribute] if used.has_key? attribute

  gain = thresholds.collect { |threshold|
    sp = data.partition { |d| d[attributes.index(attribute)] >= threshold }
    pos = (sp[0].size).to_f / data.size
    neg = (sp[1].size).to_f / data.size

    [data.classification.entropy - pos*sp[0].classification.entropy - neg*sp[1].classification.entropy, threshold]
  }.max { |a,b| a[0] <=> b[0] }

  return [-1, -1] if gain.size == 0
  gain
end

#id3_discrete(data, attributes, attribute) ⇒ Object

ID3 for discrete label cases



122
123
124
125
126
127
128
# File 'lib/decisiontree/id3_tree.rb', line 122

def id3_discrete(data, attributes, attribute)
  values = data.collect { |d| d[attributes.index(attribute)] }.uniq.sort
  partitions = values.collect { |val| data.select { |d| d[attributes.index(attribute)] == val } }
  remainder = partitions.collect {|p| (p.size.to_f / data.size) * p.classification.entropy}.inject(0) {|i,s| s+=i }

  [data.classification.entropy - remainder, attributes.index(attribute)]
end

#id3_train(data, attributes, default, used = {}) ⇒ Object



65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
# File 'lib/decisiontree/id3_tree.rb', line 65

def id3_train(data, attributes, default, used={})
  return default if data.empty?

  # return classification if all examples have the same classification
  return data.first.last if data.classification.uniq.size == 1

  # Choose best attribute:
  # 1. enumerate all attributes
  # 2. Pick best attribute
  # 3. If attributes all score the same, then pick a random one to avoid infinite recursion.
  performance = attributes.collect { |attribute| fitness_for(attribute).call(data, attributes, attribute) }
  max = performance.max { |a,b| a[0] <=> b[0] }
  min = performance.min { |a,b| a[0] <=> b[0] }
  max = performance.shuffle.first if max[0] == min[0]
  best = Node.new(attributes[performance.index(max)], max[1], max[0])
  best.threshold = nil if @type == :discrete
  @used.has_key?(best.attribute) ? @used[best.attribute] += [best.threshold] : @used[best.attribute] = [best.threshold]
  tree, l = {best => {}}, ['>=', '<']

  fitness = fitness_for(best.attribute)
  case type(best.attribute)
    when :continuous
      data.partition { |d| d[attributes.index(best.attribute)] >= best.threshold }.each_with_index  { |examples, i|
        tree[best][String.new(l[i])] = id3_train(examples, attributes, (data.classification.mode rescue 0), &fitness)
      }
    when :discrete
      values = data.collect { |d| d[attributes.index(best.attribute)] }.uniq.sort
      partitions = values.collect { |val| data.select { |d| d[attributes.index(best.attribute)] == val } }
      partitions.each_with_index  { |examples, i|
        tree[best][values[i]] = id3_train(examples, attributes-[values[i]], (data.classification.mode rescue 0), &fitness)
      }
    end

  tree
end

#predict(test) ⇒ Object



130
131
132
# File 'lib/decisiontree/id3_tree.rb', line 130

def predict(test)
  descend(@tree, test)
end

#rulesetObject



140
141
142
143
144
# File 'lib/decisiontree/id3_tree.rb', line 140

def ruleset
  rs = Ruleset.new(@attributes, @data, @default, @type)
  rs.rules = build_rules
  rs
end

#train(data = @data, attributes = @attributes, default = @default) ⇒ Object



44
45
46
47
48
49
50
51
52
# File 'lib/decisiontree/id3_tree.rb', line 44

def train(data=@data, attributes=@attributes, default=@default)
  attributes = attributes.map {|e| e.to_s}
  initialize(attributes, data, default, @type)

  # Remove samples with same attributes leaving most common classification
  data2 = data.inject({}) {|hash, d| hash[d.slice(0..-2)] ||= Hash.new(0); hash[d.slice(0..-2)][d.last] += 1; hash }.map{|key,val| key + [val.sort_by{ |k, v| v }.last.first]}

  @tree = id3_train(data2, attributes, default)
end

#type(attribute) ⇒ Object



54
55
56
# File 'lib/decisiontree/id3_tree.rb', line 54

def type(attribute)
  @type.is_a?(Hash) ? @type[attribute.to_sym] : @type
end