Class: Rumale::Ensemble::GradientBoostingClassifier

Inherits:
Object
  • Object
show all
Includes:
Base::BaseEstimator, Base::Classifier
Defined in:
lib/rumale/ensemble/gradient_boosting_classifier.rb

Overview

GradientBoostingClassifier is a class that implements gradient tree boosting for classification. The class uses the negative binomial log-likelihood as the loss function. For multiclass classification problems, it uses the one-vs-the-rest strategy.

References

  • J H. Friedman, “Greedy Function Approximation: A Gradient Boosting Machine,” Annals of Statistics, 29 (5), pp. 1189–1232, 2001.

  • J H. Friedman, “Stochastic Gradient Boosting,” Computational Statistics and Data Analysis, 38 (4), pp. 367–378, 2002.

  • T. Chen and C. Guestrin, “XGBoost: A Scalable Tree Boosting System,” Proc. KDD’16, pp. 785–794, 2016.

Examples:

estimator =
  Rumale::Ensemble::GradientBoostingClassifier.new(
    n_estimators: 100, learning_rate: 0.3, reg_lambda: 0.001, random_seed: 1)
estimator.fit(training_samples, training_values)
results = estimator.predict(testing_samples)

Instance Attribute Summary collapse

Attributes included from Base::BaseEstimator

#params

Instance Method Summary collapse

Methods included from Base::Classifier

#score

Constructor Details

#initialize(n_estimators: 100, learning_rate: 0.1, reg_lambda: 0.0, subsample: 1.0, max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1, max_features: nil, random_seed: nil) ⇒ GradientBoostingClassifier

Create a new classifier with gradient tree boosting.



61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# File 'lib/rumale/ensemble/gradient_boosting_classifier.rb', line 61

# Create a new classifier with gradient tree boosting.
#
# @param n_estimators [Integer] number of boosting rounds (trees per class).
# @param learning_rate [Float] shrinkage applied to each tree's contribution.
# @param reg_lambda [Float] L2 regularization term on leaf weights.
# @param subsample [Float] fraction of samples drawn for each tree.
# @param max_depth [Integer/nil] maximum depth of each tree (nil = unlimited).
# @param max_leaf_nodes [Integer/nil] maximum number of leaves per tree.
# @param min_samples_leaf [Integer] minimum samples required at a leaf node.
# @param max_features [Integer/nil] number of features considered per split.
# @param random_seed [Integer/nil] seed for the internal RNG (random seed drawn via srand when nil).
def initialize(n_estimators: 100, learning_rate: 0.1, reg_lambda: 0.0, subsample: 1.0,
               max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
               max_features: nil, random_seed: nil)
  check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
                                    max_features: max_features, random_seed: random_seed)
  check_params_integer(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
  check_params_float(learning_rate: learning_rate, reg_lambda: reg_lambda, subsample: subsample)
  check_params_positive(n_estimators: n_estimators,
                        learning_rate: learning_rate, reg_lambda: reg_lambda, subsample: subsample,
                        max_depth: max_depth, max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
                        max_features: max_features)
  # Build the whole parameter hash in one literal; fall back to srand for the seed.
  @params = {
    n_estimators: n_estimators,
    learning_rate: learning_rate,
    reg_lambda: reg_lambda,
    subsample: subsample,
    max_depth: max_depth,
    max_leaf_nodes: max_leaf_nodes,
    min_samples_leaf: min_samples_leaf,
    max_features: max_features,
    random_seed: random_seed || srand
  }
  # Fitted state is populated by #fit.
  @estimators = nil
  @classes = nil
  @base_predictions = nil
  @feature_importances = nil
  @rng = Random.new(@params[:random_seed])
end

Instance Attribute Details

#classesNumo::Int32 (readonly)

Return the class labels.



36
37
38
# File 'lib/rumale/ensemble/gradient_boosting_classifier.rb', line 36

# Return the class labels (Numo::Int32, set by #fit; nil before fitting).
def classes
  @classes
end

#estimatorsArray<GradientTreeRegressor> (readonly)

Return the set of estimators.



32
33
34
# File 'lib/rumale/ensemble/gradient_boosting_classifier.rb', line 32

# Return the set of fitted estimators: a flat Array<GradientTreeRegressor> for
# binary problems, or an Array of such arrays (one per class) for multiclass.
def estimators
  @estimators
end

#feature_importancesNumo::DFloat (readonly)

Return the importance for each feature. The feature importances are calculated based on the numbers of times the feature is used for splitting.



41
42
43
# File 'lib/rumale/ensemble/gradient_boosting_classifier.rb', line 41

# Return the importance for each feature (Numo::DFloat), accumulated over all
# trees during #fit; nil before fitting.
def feature_importances
  @feature_importances
end

#rngRandom (readonly)

Return the random generator for random selection of feature index.



45
46
47
# File 'lib/rumale/ensemble/gradient_boosting_classifier.rb', line 45

# Return the random generator used for random selection of feature indices.
def rng
  @rng
end

Instance Method Details

#apply(x) ⇒ Numo::Int32

Return the index of the leaf that each sample reached.



189
190
191
192
193
194
195
196
197
198
# File 'lib/rumale/ensemble/gradient_boosting_classifier.rb', line 189

# Return the index of the leaf that each sample reaches in every tree.
#
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) samples to route.
# @return [Numo::Int32] leaf indices, one row per sample.
def apply(x)
  check_sample_array(x)
  multiclass = @classes.size > 2
  per_tree_ids =
    if multiclass
      # One sub-array of leaf-index vectors per class estimator set.
      Array.new(@classes.size) { |c| @estimators[c].map { |tree| tree.apply(x) } }
    else
      @estimators.map { |tree| tree.apply(x) }
    end
  Numo::Int32[*per_tree_ids].transpose
end

#decision_function(x) ⇒ Numo::DFloat

Calculate confidence scores for samples.



140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# File 'lib/rumale/ensemble/gradient_boosting_classifier.rb', line 140

# Calculate confidence scores for samples.
#
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) samples to score.
# @return [Numo::DFloat] raw additive scores: [n_samples, n_classes] for
#   multiclass, [n_samples] for binary.
def decision_function(x)
  check_sample_array(x)
  n_samples = x.shape[0]
  n_classes = @classes.size
  if n_classes > 2
    # Start every column at its class's base prediction, then add each tree.
    scores = Numo::DFloat.ones(n_samples, n_classes) * @base_predictions
    n_classes.times do |c|
      @estimators[c].each { |tree| scores[true, c] += tree.predict(x) }
    end
    scores
  else
    # Fold the tree outputs onto the scalar base prediction.
    @estimators.reduce(Numo::DFloat.ones(n_samples) * @base_predictions) do |acc, tree|
      acc + tree.predict(x)
    end
  end
end

#fit(x, y) ⇒ GradientBoostingClassifier

Fit the model with given training data.



95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# File 'lib/rumale/ensemble/gradient_boosting_classifier.rb', line 95

# Fit the model with given training data.
#
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) training samples.
# @param y [Numo::Int32] (shape: [n_samples]) training labels.
# @return [GradientBoostingClassifier] the fitted estimator itself.
def fit(x, y)
  check_sample_array(x)
  check_label_array(y)
  check_sample_label_size(x, y)

  # Clamp max_features into [1, n_features], defaulting to all features.
  n_features = x.shape[1]
  @params[:max_features] = n_features if @params[:max_features].nil?
  @params[:max_features] = [[1, @params[:max_features]].max, n_features].min

  # Train the estimators: one boosted ensemble per class (one-vs-rest) when
  # there are more than two classes, otherwise a single binary ensemble.
  @classes = Numo::Int32[*y.to_a.uniq.sort]
  n_classes = @classes.size
  if n_classes > 2
    @base_predictions = Numo::DFloat.zeros(n_classes)
    @estimators = Array.new(n_classes) do |c|
      # Encode class membership as +1/-1 and start from the log-odds prior.
      binarized = Numo::DFloat.cast(y.eq(@classes[c])) * 2 - 1
      avg = binarized.mean
      @base_predictions[c] = 0.5 * Numo::NMath.log((1.0 + avg) / (1.0 - avg))
      partial_fit(x, binarized, @base_predictions[c])
    end
  else
    negative_label = y.to_a.uniq.min
    binarized = Numo::DFloat.cast(y.ne(negative_label)) * 2 - 1
    avg = binarized.mean
    @base_predictions = 0.5 * Numo::NMath.log((1.0 + avg) / (1.0 - avg))
    @estimators = partial_fit(x, binarized, @base_predictions)
  end

  # Accumulate feature importances over every fitted tree.
  @feature_importances = Numo::DFloat.zeros(n_features)
  all_trees = n_classes > 2 ? @estimators.flatten : @estimators
  all_trees.each { |tree| @feature_importances += tree.feature_importances }

  self
end

#marshal_dumpHash

Dump marshal data.



202
203
204
205
206
207
208
209
# File 'lib/rumale/ensemble/gradient_boosting_classifier.rb', line 202

# Dump marshal data.
#
# @return [Hash] the estimator state keyed by instance-variable name.
def marshal_dump
  %i[params estimators classes base_predictions feature_importances rng]
    .each_with_object({}) { |key, dumped| dumped[key] = instance_variable_get(:"@#{key}") }
end

#marshal_load(obj) ⇒ nil

Load marshal data.



213
214
215
216
217
218
219
220
221
# File 'lib/rumale/ensemble/gradient_boosting_classifier.rb', line 213

# Load marshal data.
#
# @param obj [Hash] state previously produced by #marshal_dump.
# @return [nil]
def marshal_load(obj)
  %i[params estimators classes base_predictions feature_importances rng].each do |key|
    instance_variable_set(:"@#{key}", obj[key])
  end
  nil
end

#predict(x) ⇒ Numo::Int32

Predict class labels for samples.



160
161
162
163
164
165
# File 'lib/rumale/ensemble/gradient_boosting_classifier.rb', line 160

# Predict class labels for samples.
#
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) samples to classify.
# @return [Numo::Int32] predicted label for each sample.
def predict(x)
  check_sample_array(x)
  probs = predict_proba(x)
  # Pick, per row, the class whose probability is largest.
  labels = Array.new(x.shape[0]) { |i| @classes[probs[i, true].max_index] }
  Numo::Int32.asarray(labels)
end

#predict_proba(x) ⇒ Numo::DFloat

Predict probability for samples.



171
172
173
174
175
176
177
178
179
180
181
182
183
# File 'lib/rumale/ensemble/gradient_boosting_classifier.rb', line 171

# Predict probability for samples.
#
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) samples.
# @return [Numo::DFloat] (shape: [n_samples, n_classes]) class probabilities.
def predict_proba(x)
  check_sample_array(x)

  # Sigmoid of the raw confidence scores.
  sigmoid = 1.0 / (Numo::NMath.exp(-decision_function(x)) + 1.0)

  if @classes.size > 2
    # Normalize each row of per-class sigmoid scores to sum to one.
    (sigmoid.transpose / sigmoid.sum(axis: 1)).transpose
  else
    # Binary case: column 1 holds the positive-class probability.
    n_samples = x.shape[0]
    probs = Numo::DFloat.zeros(n_samples, 2)
    probs[true, 1] = sigmoid
    probs[true, 0] = 1.0 - sigmoid
    probs
  end
end