Class: DecisionTree::ID3Tree

Inherits:
Object
  • Object
show all
Defined in:
lib/SelfModifiedDecisionTree.rb

Instance Method Summary collapse

Constructor Details

#initialize(attributes, data, default, type) ⇒ ID3Tree

Returns a new instance of ID3Tree.



113
114
115
116
# File 'lib/SelfModifiedDecisionTree.rb', line 113

def initialize(attributes, data, default, type)
  @used, @tree, @type = {}, {}, type
  @data, @attributes, @default = data, attributes, default
end

Instance Method Details

#build_rules(tree = @tree) ⇒ Object



220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
# File 'lib/SelfModifiedDecisionTree.rb', line 220

def build_rules(tree=@tree)
  attr = tree.to_a.first
  cases = attr[1].to_a
  rules = []
  cases.each do |c,child|
    if child.is_a?(Hash) then
      build_rules(child).each do |r|
        r2 = r.clone
        r2.premises.unshift([attr.first, c])
        rules << r2
      end
    else
      rules << Rule.new(@attributes, [[attr.first, c]], child)
    end
  end
  rules
end

#fitness_for(attribute) ⇒ Object



132
133
134
135
136
137
# File 'lib/SelfModifiedDecisionTree.rb', line 132

def fitness_for(attribute)
  case type(attribute)
    when :discrete; fitness = proc{|a,b,c| id3_discrete(a,b,c)}
    when :continuous; fitness = proc{|a,b,c| id3_continuous(a,b,c)}
  end
end

#graph(filename, file_type = "png") ⇒ Object



208
209
210
211
212
# File 'lib/SelfModifiedDecisionTree.rb', line 208

def graph(filename, file_type = "png")
  require 'graphr'
  dgp = DotGraphPrinter.new(build_tree)
  dgp.write_to_file("#{filename}.#{file_type}", file_type)
end

#id3_continuous(data, attributes, attribute) ⇒ Object

ID3 for binary classification of continuous variables (e.g. healthy / sick based on temperature thresholds)



176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
# File 'lib/SelfModifiedDecisionTree.rb', line 176

def id3_continuous(data, attributes, attribute)
  values, thresholds = data.collect { |d| d[attributes.index(attribute)] }.uniq.sort, []
  return [-1, -1] if values.size == 1
  values.each_index { |i| thresholds.push((values[i]+(values[i+1].nil? ? values[i] : values[i+1])).to_f / 2) }
  thresholds.pop
  #thresholds -= used[attribute] if used.has_key? attribute

  gain = thresholds.collect { |threshold|
    sp = data.partition { |d| d[attributes.index(attribute)] >= threshold }
    pos = (sp[0].size).to_f / data.size
    neg = (sp[1].size).to_f / data.size

    [data.classification.entropy - pos*sp[0].classification.entropy - neg*sp[1].classification.entropy, threshold]
  }.max { |a,b| a[0] <=> b[0] }

  return [-1, -1] if gain.size == 0
  gain
end

#id3_discrete(data, attributes, attribute) ⇒ Object

ID3 for discrete label cases



196
197
198
199
200
201
202
# File 'lib/SelfModifiedDecisionTree.rb', line 196

def id3_discrete(data, attributes, attribute)
  values = data.collect { |d| d[attributes.index(attribute)] }.uniq.sort
  partitions = values.collect { |val| data.select { |d| d[attributes.index(attribute)] == val } }
  remainder = partitions.collect {|p| (p.size.to_f / data.size) * p.classification.entropy}.inject(0) {|i,s| s+=i }

  [data.classification.entropy - remainder, attributes.index(attribute)]
end

#id3_train(data, attributes, default, used = {}) ⇒ Object



139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
# File 'lib/SelfModifiedDecisionTree.rb', line 139

def id3_train(data, attributes, default, used={})
  return default if data.empty?

  # return classification if all examples have the same classification
  return data.first.last if data.classification.uniq.size == 1

  # Choose best attribute:
  # 1. enumerate all attributes
  # 2. Pick best attribute
  # 3. If attributes all score the same, then pick a random one to avoid infinite recursion.
  performance = attributes.collect { |attribute| fitness_for(attribute).call(data, attributes, attribute) }
  max = performance.max { |a,b| a[0] <=> b[0] }
  min = performance.min { |a,b| a[0] <=> b[0] }
  max = performance.shuffle.first if max[0] == min[0]
  best = Node.new(attributes[performance.index(max)], max[1], max[0])
  best.threshold = nil if @type == :discrete
  @used.has_key?(best.attribute) ? @used[best.attribute] += [best.threshold] : @used[best.attribute] = [best.threshold]
  tree, l = {best => {}}, ['>=', '<']

  fitness = fitness_for(best.attribute)
  case type(best.attribute)
    when :continuous
      data.partition { |d| d[attributes.index(best.attribute)] >= best.threshold }.each_with_index  { |examples, i|
        tree[best][String.new(l[i])] = id3_train(examples, attributes, (data.classification.mode rescue 0), &fitness)
      }
    when :discrete
      values = data.collect { |d| d[attributes.index(best.attribute)] }.uniq.sort
      partitions = values.collect { |val| data.select { |d| d[attributes.index(best.attribute)] == val } }
      partitions.each_with_index  { |examples, i|
        tree[best][values[i]] = id3_train(examples, attributes-[values[i]], (data.classification.mode rescue 0), &fitness)
      }
    end

  tree
end

#predict(test) ⇒ Object



204
205
206
# File 'lib/SelfModifiedDecisionTree.rb', line 204

def predict(test)
  descend(@tree, test)
end

#rulesetObject



214
215
216
217
218
# File 'lib/SelfModifiedDecisionTree.rb', line 214

def ruleset
  rs = Ruleset.new(@attributes, @data, @default, @type)
  rs.rules = build_rules
  rs
end

#train(data = @data, attributes = @attributes, default = @default) ⇒ Object



118
119
120
121
122
123
124
125
126
# File 'lib/SelfModifiedDecisionTree.rb', line 118

def train(data=@data, attributes=@attributes, default=@default)
  attributes = attributes.map {|e| e.to_s}
  initialize(attributes, data, default, @type)

  # Remove samples with same attributes leaving most common classification
  data2 = data.inject({}) {|hash, d| hash[d.slice(0..-2)] ||= Hash.new(0); hash[d.slice(0..-2)][d.last] += 1; hash }.map{|key,val| key + [val.sort_by{ |k, v| v }.last.first]}

  @tree = id3_train(data2, attributes, default)
end

#type(attribute) ⇒ Object



128
129
130
# File 'lib/SelfModifiedDecisionTree.rb', line 128

def type(attribute)
  @type.is_a?(Hash) ? @type[attribute.to_sym] : @type
end