1 | + | """ |
|
2 | + | This module contains the loss classes. |
|
3 | + | ||
4 | + | Specific losses are used for regression, binary classification or multiclass |
|
5 | + | classification. |
|
6 | + | """ |
|
7 | + | # Author: Nicolas Hug |
|
8 | + | ||
9 | + | from abc import ABC, abstractmethod |
|
10 | + | ||
11 | + | import numpy as np |
|
12 | + | from scipy.special import expit |
|
13 | + | try: # logsumexp was moved from misc to special in 0.19 |
|
14 | + | from scipy.special import logsumexp |
|
15 | + | except ImportError: |
|
16 | + | from scipy.misc import logsumexp |
|
17 | + | ||
18 | + | from .types import Y_DTYPE |
|
19 | + | from .types import G_H_DTYPE |
|
20 | + | from ._loss import _update_gradients_least_squares |
|
21 | + | from ._loss import _update_gradients_hessians_binary_crossentropy |
|
22 | + | from ._loss import _update_gradients_hessians_categorical_crossentropy |
|
23 | + | ||
24 | + | ||
25 | + | class BaseLoss(ABC): |
|
26 | + | """Base class for a loss.""" |
|
27 | + | ||
28 | + | def init_gradients_and_hessians(self, n_samples, prediction_dim): |
|
29 | + | """Return initial gradients and hessians. |
|
30 | + | ||
31 | + | Unless hessians are constant, arrays are initialized with undefined |
|
32 | + | values. |
|
33 | + | ||
34 | + | Parameters |
|
35 | + | ---------- |
|
36 | + | n_samples : int |
|
37 | + | The number of samples passed to `fit()`. |
|
38 | + | prediction_dim : int |
|
39 | + | The dimension of a raw prediction, i.e. the number of trees |
|
40 | + | built at each iteration. Equals 1 for regression and binary |
|
41 | + | classification, or K where K is the number of classes for |
|
42 | + | multiclass classification. |
|
43 | + | ||
44 | + | Returns |
|
45 | + | ------- |
|
46 | + | gradients : ndarray, shape (prediction_dim, n_samples) |
|
47 | + | The initial gradients. The array is not initialized. |
|
48 | + | hessians : ndarray, shape (prediction_dim, n_samples) |
|
49 | + | If hessians are constant (e.g. for `LeastSquares` loss), the |
|
50 | + | array is initialized to ``1``. Otherwise, the array is allocated |
|
51 | + | without being initialized. |
|
52 | + | """ |
|
53 | + | shape = (prediction_dim, n_samples) |
|
54 | + | gradients = np.empty(shape=shape, dtype=G_H_DTYPE) |
|
55 | + | if self.hessians_are_constant: |
|
56 | + | # if the hessians are constant, we consider they are equal to 1. |
|
57 | + | # this is correct as long as we adjust the gradients. See e.g. LS |
|
58 | + | # loss |
|
59 | + | hessians = np.ones(shape=(1, 1), dtype=G_H_DTYPE) |
|
60 | + | else: |
|
61 | + | hessians = np.empty(shape=shape, dtype=G_H_DTYPE) |
|
62 | + | ||
63 | + | return gradients, hessians |
|
64 | + | ||
65 | + | @abstractmethod |
|
66 | + | def get_baseline_prediction(self, y_train, prediction_dim): |
|
67 | + | """Return initial predictions (before the first iteration). |
|
68 | + | ||
69 | + | Parameters |
|
70 | + | ---------- |
|
71 | + | y_train : ndarray, shape (n_samples,) |
|
72 | + | The target training values. |
|
73 | + | prediction_dim : int |
|
74 | + | The dimension of one prediction: 1 for binary classification and |
|
75 | + | regression, n_classes for multiclass classification. |
|
76 | + | ||
77 | + | Returns |
|
78 | + | ------- |
|
79 | + | baseline_prediction : float or ndarray, shape (1, prediction_dim) |
|
80 | + | The baseline prediction. |
|
81 | + | """ |
|
82 | + | ||
83 | + | @abstractmethod |
|
84 | + | def update_gradients_and_hessians(self, gradients, hessians, y_true, |
|
85 | + | raw_predictions): |
|
86 | + | """Update gradients and hessians arrays, inplace. |
|
87 | + | ||
88 | + | The gradients (resp. hessians) are the first (resp. second) order |
|
89 | + | derivatives of the loss for each sample with respect to the |
|
90 | + | predictions of the model, evaluated at iteration ``i - 1``. |
|
91 | + | ||
92 | + | Parameters |
|
93 | + | ---------- |
|
94 | + | gradients : ndarray, shape (prediction_dim, n_samples) |
|
95 | + | The gradients (treated as OUT array). |
|
96 | + | hessians : ndarray, shape (prediction_dim, n_samples) or \ |
|
97 | + | (1,) |
|
98 | + | The hessians (treated as OUT array). |
|
99 | + | y_true : ndarray, shape (n_samples,) |
|
100 | + | The true target values of each training sample. |
|
101 | + | raw_predictions : ndarray, shape (prediction_dim, n_samples) |
|
102 | + | The raw_predictions (i.e. values from the trees) of the tree |
|
103 | + | ensemble at iteration ``i - 1``. |
|
104 | + | """ |
|
105 | + | ||
106 | + | ||
107 | + | class LeastSquares(BaseLoss): |
|
108 | + | """Least squares loss, for regression. |
|
109 | + | ||
110 | + | For a given sample x_i, least squares loss is defined as:: |
|
111 | + | ||
112 | + | loss(x_i) = 0.5 * (y_true_i - raw_pred_i)**2 |
|
113 | + | ||
114 | + | This actually computes the half least squares loss to simplify |
|
115 | + | the computation of the gradients and get a unit hessian (and be consistent |
|
116 | + | with what is done in LightGBM). |
|
117 | + | """ |
|
118 | + | ||
119 | + | hessians_are_constant = True |
|
120 | + | ||
121 | + | def __call__(self, y_true, raw_predictions, average=True): |
|
122 | + | # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to |
|
123 | + | # return a view. |
|
124 | + | raw_predictions = raw_predictions.reshape(-1) |
|
125 | + | loss = 0.5 * np.power(y_true - raw_predictions, 2) |
|
126 | + | return loss.mean() if average else loss |
|
127 | + | ||
128 | + | def get_baseline_prediction(self, y_train, prediction_dim): |
|
129 | + | return np.mean(y_train) |
|
130 | + | ||
131 | + | @staticmethod |
|
132 | + | def inverse_link_function(raw_predictions): |
|
133 | + | return raw_predictions |
|
134 | + | ||
135 | + | def update_gradients_and_hessians(self, gradients, hessians, y_true, |
|
136 | + | raw_predictions): |
|
137 | + | # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to |
|
138 | + | # return a view. |
|
139 | + | raw_predictions = raw_predictions.reshape(-1) |
|
140 | + | gradients = gradients.reshape(-1) |
|
141 | + | _update_gradients_least_squares(gradients, y_true, raw_predictions) |
|
142 | + | ||
143 | + | ||
144 | + | class BinaryCrossEntropy(BaseLoss): |
|
145 | + | """Binary cross-entropy loss, for binary classification. |
|
146 | + | ||
147 | + | For a given sample x_i, the binary cross-entropy loss is defined as the |
|
148 | + | negative log-likelihood of the model which can be expressed as:: |
|
149 | + | ||
150 | + | loss(x_i) = log(1 + exp(raw_pred_i)) - y_true_i * raw_pred_i |
|
151 | + | ||
152 | + | See The Elements of Statistical Learning, by Hastie, Tibshirani, Friedman, |
|
153 | + | section 4.4.1 (about logistic regression). |
|
154 | + | """ |
|
155 | + | ||
156 | + | hessians_are_constant = False |
|
157 | + | inverse_link_function = staticmethod(expit) |
|
158 | + | ||
159 | + | def __call__(self, y_true, raw_predictions, average=True): |
|
160 | + | # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to |
|
161 | + | # return a view. |
|
162 | + | raw_predictions = raw_predictions.reshape(-1) |
|
163 | + | # logaddexp(0, x) = log(1 + exp(x)) |
|
164 | + | loss = np.logaddexp(0, raw_predictions) - y_true * raw_predictions |
|
165 | + | return loss.mean() if average else loss |
|
166 | + | ||
167 | + | def get_baseline_prediction(self, y_train, prediction_dim): |
|
168 | + | if prediction_dim > 2: |
|
169 | + | raise ValueError( |
|
170 | + | "loss='binary_crossentropy' is not defined for multiclass" |
|
171 | + | " classification with n_classes=%d, use" |
|
172 | + | " loss='categorical_crossentropy' instead" % prediction_dim) |
|
173 | + | proba_positive_class = np.mean(y_train) |
|
174 | + | eps = np.finfo(y_train.dtype).eps |
|
175 | + | proba_positive_class = np.clip(proba_positive_class, eps, 1 - eps) |
|
176 | + | # log(x / (1 - x)) is the inverse of the sigmoid, or the link function |
|
177 | + | # of the Binomial model. |
|
178 | + | return np.log(proba_positive_class / (1 - proba_positive_class)) |
|
179 | + | ||
180 | + | def update_gradients_and_hessians(self, gradients, hessians, y_true, |
|
181 | + | raw_predictions): |
|
182 | + | # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to |
|
183 | + | # return a view. |
|
184 | + | raw_predictions = raw_predictions.reshape(-1) |
|
185 | + | gradients = gradients.reshape(-1) |
|
186 | + | hessians = hessians.reshape(-1) |
|
187 | + | _update_gradients_hessians_binary_crossentropy( |
|
188 | + | gradients, hessians, y_true, raw_predictions) |
|
189 | + | ||
190 | + | def predict_proba(self, raw_predictions): |
|
191 | + | # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to |
|
192 | + | # return a view. |
|
193 | + | raw_predictions = raw_predictions.reshape(-1) |
|
194 | + | proba = np.empty((raw_predictions.shape[0], 2), dtype=Y_DTYPE) |
|
195 | + | proba[:, 1] = expit(raw_predictions) |
|
196 | + | proba[:, 0] = 1 - proba[:, 1] |
|
197 | + | return proba |
|
198 | + | ||
199 | + | ||
200 | + | class CategoricalCrossEntropy(BaseLoss): |
|
201 | + | """Categorical cross-entropy loss, for multiclass classification. |
|
202 | + | ||
203 | + | For a given sample x_i, the categorical cross-entropy loss is defined as |
|
204 | + | the negative log-likelihood of the model and generalizes the binary |
|
205 | + | cross-entropy to more than 2 classes. |
|
206 | + | """ |
|
207 | + | ||
208 | + | hessians_are_constant = False |
|
209 | + | ||
210 | + | def __call__(self, y_true, raw_predictions, average=True): |
|
211 | + | one_hot_true = np.zeros_like(raw_predictions) |
|
212 | + | prediction_dim = raw_predictions.shape[0] |
|
213 | + | for k in range(prediction_dim): |
|
214 | + | one_hot_true[k, :] = (y_true == k) |
|
215 | + | ||
216 | + | loss = (logsumexp(raw_predictions, axis=0) - |
|
217 | + | (one_hot_true * raw_predictions).sum(axis=0)) |
|
218 | + | return loss.mean() if average else loss |
|
219 | + | ||
220 | + | def get_baseline_prediction(self, y_train, prediction_dim): |
|
221 | + | init_value = np.zeros(shape=(prediction_dim, 1), dtype=Y_DTYPE) |
|
222 | + | eps = np.finfo(y_train.dtype).eps |
|
223 | + | for k in range(prediction_dim): |
|
224 | + | proba_kth_class = np.mean(y_train == k) |
|
225 | + | proba_kth_class = np.clip(proba_kth_class, eps, 1 - eps) |
|
226 | + | init_value[k, :] += np.log(proba_kth_class) |
|
227 | + | ||
228 | + | return init_value |
|
229 | + | ||
230 | + | def update_gradients_and_hessians(self, gradients, hessians, y_true, |
|
231 | + | raw_predictions): |
|
232 | + | _update_gradients_hessians_categorical_crossentropy( |
|
233 | + | gradients, hessians, y_true, raw_predictions) |
|
234 | + | ||
235 | + | def predict_proba(self, raw_predictions): |
|
236 | + | # TODO: This could be done in parallel |
|
237 | + | # compute softmax (using exp(log(softmax))) |
|
238 | + | proba = np.exp(raw_predictions - |
|
239 | + | logsumexp(raw_predictions, axis=0)[np.newaxis, :]) |
|
240 | + | return proba.T |
|
241 | + | ||
242 | + | ||
243 | + | _LOSSES = { |
|
244 | + | 'least_squares': LeastSquares, |
|
245 | + | 'binary_crossentropy': BinaryCrossEntropy, |
|
246 | + | 'categorical_crossentropy': CategoricalCrossEntropy |
|
247 | + | } |
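
For context, here is a minimal sketch of how one of the loss objects defined above could be driven for a single boosting iteration. It is illustrative only: the toy data, variable names and the surrounding loop are assumptions and not part of this diff; only the methods shown above (get_baseline_prediction, init_gradients_and_hessians, update_gradients_and_hessians, __call__) and the private module paths listed in this coverage report are taken from it.

    import numpy as np
    from sklearn.ensemble._hist_gradient_boosting.loss import _LOSSES
    from sklearn.ensemble._hist_gradient_boosting.types import Y_DTYPE

    # Toy regression target (assumed); cast to Y_DTYPE so the Cython update
    # functions receive the dtype they expect.
    rng = np.random.RandomState(0)
    y_train = rng.normal(size=100).astype(Y_DTYPE)

    loss = _LOSSES['least_squares']()
    baseline = loss.get_baseline_prediction(y_train, prediction_dim=1)
    raw_predictions = np.full((1, y_train.shape[0]), baseline, dtype=Y_DTYPE)

    # Gradients have shape (prediction_dim, n_samples); hessians collapse to a
    # single constant 1 for the least squares loss.
    gradients, hessians = loss.init_gradients_and_hessians(
        n_samples=y_train.shape[0], prediction_dim=1)
    loss.update_gradients_and_hessians(gradients, hessians, y_train,
                                       raw_predictions)

    print(loss(y_train, raw_predictions))  # mean half least-squares loss

A tree grower would then be fitted to these gradients and hessians (the Newton-Raphson step) before the raw predictions are updated.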
396 | 396 | # which is more features than we have in most cases. |
|
397 | 397 | estimator.set_params(k=1) |
|
398 | 398 | ||
399 | + | if name in ('HistGradientBoostingClassifier', |
|
400 | + | 'HistGradientBoostingRegressor'): |
|
401 | + | # The default min_samples_leaf (20) isn't appropriate for the small |
|
402 | + | # datasets used in the checks (only very shallow trees would be built). |
|
403 | + | estimator.set_params(min_samples_leaf=5) |
|
404 | + | ||
399 | 405 | ||
400 | 406 | class NotAnArray: |
|
401 | 407 | """An object that is convertible to an array |
2462 | 2468 | if hasattr(estimator, method)} |
|
2463 | 2469 | ||
2464 | 2470 | # Fit again |
|
2471 | + | set_random_state(estimator) |
|
2465 | 2472 | estimator.fit(X_train, y_train) |
|
2466 | 2473 | ||
2467 | 2474 | for method in check_methods: |
2012 | 2012 | ||
2013 | 2013 | See also |
|
2014 | 2014 | -------- |
|
2015 | + | sklearn.ensemble.HistGradientBoostingClassifier, |
|
2015 | 2016 | sklearn.tree.DecisionTreeClassifier, RandomForestClassifier |
|
2016 | 2017 | AdaBoostClassifier |
|
2017 | 2018 |
2472 | 2473 | ||
2473 | 2474 | See also |
|
2474 | 2475 | -------- |
|
2475 | - | DecisionTreeRegressor, RandomForestRegressor |
|
2476 | + | sklearn.ensemble.HistGradientBoostingRegressor, |
|
2477 | + | sklearn.tree.DecisionTreeRegressor, RandomForestRegressor |
|
2476 | 2478 | ||
2477 | 2479 | References |
|
2478 | 2480 | ---------- |
1 | + | """ |
|
2 | + | This module contains the TreeGrower class. |
|
3 | + | ||
4 | + | TreeGrower builds a regression tree fitting a Newton-Raphson step, based on |
|
5 | + | the gradients and hessians of the training data. |
|
6 | + | """ |
|
7 | + | # Author: Nicolas Hug |
|
8 | + | ||
9 | + | from heapq import heappush, heappop |
|
10 | + | import numpy as np |
|
11 | + | from timeit import default_timer as time |
|
12 | + | import numbers |
|
13 | + | ||
14 | + | from .splitting import Splitter |
|
15 | + | from .histogram import HistogramBuilder |
|
16 | + | from .predictor import TreePredictor, PREDICTOR_RECORD_DTYPE |
|
17 | + | from .utils import sum_parallel |
|
18 | + | ||
19 | + | ||
20 | + | class TreeNode: |
|
21 | + | """Tree Node class used in TreeGrower. |
|
22 | + | ||
23 | + | This isn't used for prediction purposes, only for training (see |
|
24 | + | TreePredictor). |
|
25 | + | ||
26 | + | Parameters |
|
27 | + | ---------- |
|
28 | + | depth : int |
|
29 | + | The depth of the node, i.e. its distance from the root. |
|
30 | + | sample_indices : ndarray of unsigned int, shape (n_samples_at_node,) |
|
31 | + | The indices of the samples at the node. |
|
32 | + | sum_gradients : float |
|
33 | + | The sum of the gradients of the samples at the node. |
|
34 | + | sum_hessians : float |
|
35 | + | The sum of the hessians of the samples at the node. |
|
36 | + | parent : TreeNode or None, optional (default=None) |
|
37 | + | The parent of the node. None for root. |
|
38 | + | ||
39 | + | Attributes |
|
40 | + | ---------- |
|
41 | + | depth : int |
|
42 | + | The depth of the node, i.e. its distance from the root. |
|
43 | + | sample_indices : ndarray of unsigned int, shape (n_samples_at_node,) |
|
44 | + | The indices of the samples at the node. |
|
45 | + | sum_gradients : float |
|
46 | + | The sum of the gradients of the samples at the node. |
|
47 | + | sum_hessians : float |
|
48 | + | The sum of the hessians of the samples at the node. |
|
49 | + | parent : TreeNode or None |
|
50 | + | The parent of the node. None for root. |
|
51 | + | split_info : SplitInfo or None |
|
52 | + | The result of the split evaluation. |
|
53 | + | left_child : TreeNode or None |
|
54 | + | The left child of the node. None for leaves. |
|
55 | + | right_child : TreeNode or None |
|
56 | + | The right child of the node. None for leaves. |
|
57 | + | value : float or None |
|
58 | + | The value of the leaf, as computed in finalize_leaf(). None for |
|
59 | + | non-leaf nodes. |
|
60 | + | partition_start : int |
|
61 | + | start position of the node's sample_indices in splitter.partition. |
|
62 | + | partition_stop : int |
|
63 | + | stop position of the node's sample_indices in splitter.partition. |
|
64 | + | """ |
|
65 | + | ||
66 | + | split_info = None |
|
67 | + | left_child = None |
|
68 | + | right_child = None |
|
69 | + | value = None |
|
70 | + | histograms = None |
|
71 | + | sibling = None |
|
72 | + | parent = None |
|
73 | + | ||
74 | + | # start and stop indices of the node in the splitter.partition |
|
75 | + | # array. Concretely, |
|
76 | + | # self.sample_indices = view(self.splitter.partition[start:stop]) |
|
77 | + | # Please see the comments about splitter.partition and |
|
78 | + | # splitter.split_indices for more info about this design. |
|
79 | + | # These 2 attributes are only used in _update_raw_prediction, because we |
|
80 | + | # need to iterate over the leaves and I don't know how to efficiently |
|
81 | + | # store the sample_indices views because they're all of different sizes. |
|
82 | + | partition_start = 0 |
|
83 | + | partition_stop = 0 |
|
84 | + | ||
85 | + | def __init__(self, depth, sample_indices, sum_gradients, |
|
86 | + | sum_hessians, parent=None): |
|
87 | + | self.depth = depth |
|
88 | + | self.sample_indices = sample_indices |
|
89 | + | self.n_samples = sample_indices.shape[0] |
|
90 | + | self.sum_gradients = sum_gradients |
|
91 | + | self.sum_hessians = sum_hessians |
|
92 | + | self.parent = parent |
|
93 | + | ||
94 | + | def __lt__(self, other_node): |
|
95 | + | """Comparison for priority queue. |
|
96 | + | ||
97 | + | Nodes with high gain are higher priority than nodes with low gain. |
|
98 | + | ||
99 | + | heapq.heappush only needs the '<' operator. |
|
100 | + | heapq.heappop takes the smallest item first (smaller is higher |
|
101 | + | priority). |
|
102 | + | ||
103 | + | Parameters |
|
104 | + | ---------- |
|
105 | + | other_node : TreeNode |
|
106 | + | The node to compare with. |
|
107 | + | """ |
|
108 | + | return self.split_info.gain > other_node.split_info.gain |
|
109 | + | ||
110 | + | ||
111 | + | class TreeGrower: |
|
112 | + | """Tree grower class used to build a tree. |
|
113 | + | ||
114 | + | The tree is fitted to predict the values of a Newton-Raphson step. The |
|
115 | + | splits are considered in a best-first fashion, and the quality of a |
|
116 | + | split is defined in splitting._split_gain. |
|
117 | + | ||
118 | + | Parameters |
|
119 | + | ---------- |
|
120 | + | X_binned : ndarray of int, shape (n_samples, n_features) |
|
121 | + | The binned input samples. Must be Fortran-aligned. |
|
122 | + | gradients : ndarray, shape (n_samples,) |
|
123 | + | The gradients of each training sample. Those are the gradients of the |
|
124 | + | loss w.r.t the predictions, evaluated at iteration ``i - 1``. |
|
125 | + | hessians : ndarray, shape (n_samples,) |
|
126 | + | The hessians of each training sample. Those are the hessians of the |
|
127 | + | loss w.r.t the predictions, evaluated at iteration ``i - 1``. |
|
128 | + | max_leaf_nodes : int or None, optional (default=None) |
|
129 | + | The maximum number of leaves for each tree. If None, there is no |
|
130 | + | maximum limit. |
|
131 | + | max_depth : int or None, optional (default=None) |
|
132 | + | The maximum depth of each tree. The depth of a tree is the number of |
|
133 | + | nodes to go from the root to the deepest leaf. |
|
134 | + | min_samples_leaf : int, optional (default=20) |
|
135 | + | The minimum number of samples per leaf. |
|
136 | + | min_gain_to_split : float, optional (default=0.) |
|
137 | + | The minimum gain needed to split a node. Splits with lower gain will |
|
138 | + | be ignored. |
|
139 | + | max_bins : int, optional (default=256) |
|
140 | + | The maximum number of bins. Used to define the shape of the |
|
141 | + | histograms. |
|
142 | + | actual_n_bins : ndarray of int or int, optional (default=None) |
|
143 | + | The actual number of bins needed for each feature, which is lower or |
|
144 | + | equal to ``max_bins``. If it's an int, all features are considered to |
|
145 | + | have the same number of bins. If None, all features are considered to |
|
146 | + | have ``max_bins`` bins. |
|
147 | + | l2_regularization : float, optional (default=0) |
|
148 | + | The L2 regularization parameter. |
|
149 | + | min_hessian_to_split : float, optional (default=1e-3) |
|
150 | + | The minimum sum of hessians needed in each node. Splits that result in |
|
151 | + | at least one child having a sum of hessians less than |
|
152 | + | ``min_hessian_to_split`` are discarded. |
|
153 | + | shrinkage : float, optional (default=1) |
|
154 | + | The shrinkage parameter to apply to the leaves values, also known as |
|
155 | + | learning rate. |
|
156 | + | """ |
|
157 | + | def __init__(self, X_binned, gradients, hessians, max_leaf_nodes=None, |
|
158 | + | max_depth=None, min_samples_leaf=20, min_gain_to_split=0., |
|
159 | + | max_bins=256, actual_n_bins=None, l2_regularization=0., |
|
160 | + | min_hessian_to_split=1e-3, shrinkage=1.): |
|
161 | + | ||
162 | + | self._validate_parameters(X_binned, max_leaf_nodes, max_depth, |
|
163 | + | min_samples_leaf, min_gain_to_split, |
|
164 | + | l2_regularization, min_hessian_to_split) |
|
165 | + | ||
166 | + | if actual_n_bins is None: |
|
167 | + | actual_n_bins = max_bins |
|
168 | + | ||
169 | + | if isinstance(actual_n_bins, numbers.Integral): |
|
170 | + | actual_n_bins = np.array( |
|
171 | + | [actual_n_bins] * X_binned.shape[1], |
|
172 | + | dtype=np.uint32) |
|
173 | + | else: |
|
174 | + | actual_n_bins = np.asarray(actual_n_bins, dtype=np.uint32) |
|
175 | + | ||
176 | + | hessians_are_constant = hessians.shape[0] == 1 |
|
177 | + | self.histogram_builder = HistogramBuilder( |
|
178 | + | X_binned, max_bins, gradients, hessians, hessians_are_constant) |
|
179 | + | self.splitter = Splitter( |
|
180 | + | X_binned, max_bins, actual_n_bins, l2_regularization, |
|
181 | + | min_hessian_to_split, min_samples_leaf, min_gain_to_split, |
|
182 | + | hessians_are_constant) |
|
183 | + | self.max_leaf_nodes = max_leaf_nodes |
|
184 | + | self.max_bins = max_bins |
|
185 | + | self.n_features = X_binned.shape[1] |
|
186 | + | self.max_depth = max_depth |
|
187 | + | self.min_samples_leaf = min_samples_leaf |
|
188 | + | self.X_binned = X_binned |
|
189 | + | self.min_gain_to_split = min_gain_to_split |
|
190 | + | self.shrinkage = shrinkage |
|
191 | + | self.splittable_nodes = [] |
|
192 | + | self.finalized_leaves = [] |
|
193 | + | self.total_find_split_time = 0. # time spent finding the best splits |
|
194 | + | self.total_compute_hist_time = 0. # time spent computing histograms |
|
195 | + | self.total_apply_split_time = 0. # time spent splitting nodes |
|
196 | + | self._initialize_root(gradients, hessians, hessians_are_constant) |
|
197 | + | self.n_nodes = 1 |
|
198 | + | ||
199 | + | def _validate_parameters(self, X_binned, max_leaf_nodes, max_depth, |
|
200 | + | min_samples_leaf, min_gain_to_split, |
|
201 | + | l2_regularization, min_hessian_to_split): |
|
202 | + | """Validate parameters passed to __init__. |
|
203 | + | ||
204 | + | Also validate parameters passed to splitter. |
|
205 | + | """ |
|
206 | + | if X_binned.dtype != np.uint8: |
|
207 | + | raise NotImplementedError( |
|
208 | + | "X_binned must be of type uint8.") |
|
209 | + | if not X_binned.flags.f_contiguous: |
|
210 | + | raise ValueError( |
|
211 | + | "X_binned should be passed as Fortran contiguous " |
|
212 | + | "array for maximum efficiency.") |
|
213 | + | if max_leaf_nodes is not None and max_leaf_nodes <= 1: |
|
214 | + | raise ValueError('max_leaf_nodes={} should not be' |
|
215 | + | ' smaller than 2'.format(max_leaf_nodes)) |
|
216 | + | if max_depth is not None and max_depth <= 1: |
|
217 | + | raise ValueError('max_depth={} should not be' |
|
218 | + | ' smaller than 2'.format(max_depth)) |
|
219 | + | if min_samples_leaf < 1: |
|
220 | + | raise ValueError('min_samples_leaf={} should ' |
|
221 | + | 'not be smaller than 1'.format(min_samples_leaf)) |
|
222 | + | if min_gain_to_split < 0: |
|
223 | + | raise ValueError('min_gain_to_split={} ' |
|
224 | + | 'must be positive.'.format(min_gain_to_split)) |
|
225 | + | if l2_regularization < 0: |
|
226 | + | raise ValueError('l2_regularization={} must be ' |
|
227 | + | 'positive.'.format(l2_regularization)) |
|
228 | + | if min_hessian_to_split < 0: |
|
229 | + | raise ValueError('min_hessian_to_split={} ' |
|
230 | + | 'must be positive.'.format(min_hessian_to_split)) |
|
231 | + | ||
232 | + | def grow(self): |
|
233 | + | """Grow the tree, from root to leaves.""" |
|
234 | + | while self.splittable_nodes: |
|
235 | + | self.split_next() |
|
236 | + | ||
237 | + | def _initialize_root(self, gradients, hessians, hessians_are_constant): |
|
238 | + | """Initialize root node and finalize it if needed.""" |
|
239 | + | n_samples = self.X_binned.shape[0] |
|
240 | + | depth = 0 |
|
241 | + | sum_gradients = sum_parallel(gradients) |
|
242 | + | if self.histogram_builder.hessians_are_constant: |
|
243 | + | sum_hessians = hessians[0] * n_samples |
|
244 | + | else: |
|
245 | + | sum_hessians = sum_parallel(hessians) |
|
246 | + | self.root = TreeNode( |
|
247 | + | depth=depth, |
|
248 | + | sample_indices=self.splitter.partition, |
|
249 | + | sum_gradients=sum_gradients, |
|
250 | + | sum_hessians=sum_hessians |
|
251 | + | ) |
|
252 | + | ||
253 | + | self.root.partition_start = 0 |
|
254 | + | self.root.partition_stop = n_samples |
|
255 | + | ||
256 | + | if self.root.n_samples < 2 * self.min_samples_leaf: |
|
257 | + | # Do not even bother computing any splitting statistics. |
|
258 | + | self._finalize_leaf(self.root) |
|
259 | + | return |
|
260 | + | if sum_hessians < self.splitter.min_hessian_to_split: |
|
261 | + | self._finalize_leaf(self.root) |
|
262 | + | return |
|
263 | + | ||
264 | + | self.root.histograms = self.histogram_builder.compute_histograms_brute( |
|
265 | + | self.root.sample_indices) |
|
266 | + | self._compute_best_split_and_push(self.root) |
|
267 | + | ||
268 | + | def _compute_best_split_and_push(self, node): |
|
269 | + | """Compute the best possible split (SplitInfo) of a given node. |
|
270 | + | ||
271 | + | Also push it in the heap of splittable nodes if gain isn't zero. |
|
272 | + | The gain of a node is 0 if either all the leaves are pure |
|
273 | + | (best gain = 0), or if no split would satisfy the constraints |
|
274 | + | (min_hessian_to_split, min_gain_to_split, min_samples_leaf). |
|
275 | + | """ |
|
276 | + | ||
277 | + | node.split_info = self.splitter.find_node_split( |
|
278 | + | node.sample_indices, node.histograms, node.sum_gradients, |
|
279 | + | node.sum_hessians) |
|
280 | + | ||
281 | + | if node.split_info.gain <= 0: # no valid split |
|
282 | + | self._finalize_leaf(node) |
|
283 | + | else: |
|
284 | + | heappush(self.splittable_nodes, node) |
|
285 | + | ||
286 | + | def split_next(self): |
|
287 | + | """Split the node with highest potential gain. |
|
288 | + | ||
289 | + | Returns |
|
290 | + | ------- |
|
291 | + | left : TreeNode |
|
292 | + | The resulting left child. |
|
293 | + | right : TreeNode |
|
294 | + | The resulting right child. |
|
295 | + | """ |
|
296 | + | # Consider the node with the highest loss reduction (a.k.a. gain) |
|
297 | + | node = heappop(self.splittable_nodes) |
|
298 | + | ||
299 | + | tic = time() |
|
300 | + | (sample_indices_left, |
|
301 | + | sample_indices_right, |
|
302 | + | right_child_pos) = self.splitter.split_indices(node.split_info, |
|
303 | + | node.sample_indices) |
|
304 | + | self.total_apply_split_time += time() - tic |
|
305 | + | ||
306 | + | depth = node.depth + 1 |
|
307 | + | n_leaf_nodes = len(self.finalized_leaves) + len(self.splittable_nodes) |
|
308 | + | n_leaf_nodes += 2 |
|
309 | + | ||
310 | + | left_child_node = TreeNode(depth, |
|
311 | + | sample_indices_left, |
|
312 | + | node.split_info.sum_gradient_left, |
|
313 | + | node.split_info.sum_hessian_left, |
|
314 | + | parent=node) |
|
315 | + | right_child_node = TreeNode(depth, |
|
316 | + | sample_indices_right, |
|
317 | + | node.split_info.sum_gradient_right, |
|
318 | + | node.split_info.sum_hessian_right, |
|
319 | + | parent=node) |
|
320 | + | left_child_node.sibling = right_child_node |
|
321 | + | right_child_node.sibling = left_child_node |
|
322 | + | node.right_child = right_child_node |
|
323 | + | node.left_child = left_child_node |
|
324 | + | ||
325 | + | # set start and stop indices |
|
326 | + | left_child_node.partition_start = node.partition_start |
|
327 | + | left_child_node.partition_stop = node.partition_start + right_child_pos |
|
328 | + | right_child_node.partition_start = left_child_node.partition_stop |
|
329 | + | right_child_node.partition_stop = node.partition_stop |
|
330 | + | ||
331 | + | self.n_nodes += 2 |
|
332 | + | ||
333 | + | if self.max_depth is not None and depth == self.max_depth: |
|
334 | + | self._finalize_leaf(left_child_node) |
|
335 | + | self._finalize_leaf(right_child_node) |
|
336 | + | return left_child_node, right_child_node |
|
337 | + | ||
338 | + | if (self.max_leaf_nodes is not None |
|
339 | + | and n_leaf_nodes == self.max_leaf_nodes): |
|
340 | + | self._finalize_leaf(left_child_node) |
|
341 | + | self._finalize_leaf(right_child_node) |
|
342 | + | self._finalize_splittable_nodes() |
|
343 | + | return left_child_node, right_child_node |
|
344 | + | ||
345 | + | if left_child_node.n_samples < self.min_samples_leaf * 2: |
|
346 | + | self._finalize_leaf(left_child_node) |
|
347 | + | if right_child_node.n_samples < self.min_samples_leaf * 2: |
|
348 | + | self._finalize_leaf(right_child_node) |
|
349 | + | ||
350 | + | # Compute histograms of children, and compute their best possible split |
|
351 | + | # (if needed) |
|
352 | + | should_split_left = left_child_node.value is None # node isn't a leaf |
|
353 | + | should_split_right = right_child_node.value is None |
|
354 | + | if should_split_left or should_split_right: |
|
355 | + | ||
356 | + | # We will compute the histograms of both nodes even if one of them |
|
357 | + | # is a leaf, since computing the second histogram is very cheap |
|
358 | + | # (using histogram subtraction). |
|
359 | + | n_samples_left = left_child_node.sample_indices.shape[0] |
|
360 | + | n_samples_right = right_child_node.sample_indices.shape[0] |
|
361 | + | if n_samples_left < n_samples_right: |
|
362 | + | smallest_child = left_child_node |
|
363 | + | largest_child = right_child_node |
|
364 | + | else: |
|
365 | + | smallest_child = right_child_node |
|
366 | + | largest_child = left_child_node |
|
367 | + | ||
368 | + | # We use the brute O(n_samples) method on the child that has the |
|
369 | + | # smallest number of samples, and the subtraction trick O(n_bins) |
|
370 | + | # on the other one. |
|
371 | + | tic = time() |
|
372 | + | smallest_child.histograms = \ |
|
373 | + | self.histogram_builder.compute_histograms_brute( |
|
374 | + | smallest_child.sample_indices) |
|
375 | + | largest_child.histograms = \ |
|
376 | + | self.histogram_builder.compute_histograms_subtraction( |
|
377 | + | node.histograms, smallest_child.histograms) |
|
378 | + | self.total_compute_hist_time += time() - tic |
|
379 | + | ||
380 | + | tic = time() |
|
381 | + | if should_split_left: |
|
382 | + | self._compute_best_split_and_push(left_child_node) |
|
383 | + | if should_split_right: |
|
384 | + | self._compute_best_split_and_push(right_child_node) |
|
385 | + | self.total_find_split_time += time() - tic |
|
386 | + | ||
387 | + | return left_child_node, right_child_node |
|
388 | + | ||
389 | + | def _finalize_leaf(self, node): |
|
390 | + | """Compute the prediction value that minimizes the objective function. |
|
391 | + | ||
392 | + | This sets the node.value attribute (node is a leaf iff node.value is |
|
393 | + | not None). |
|
394 | + | ||
395 | + | See Equation 5 of: |
|
396 | + | XGBoost: A Scalable Tree Boosting System, T. Chen, C. Guestrin, 2016 |
|
397 | + | https://arxiv.org/abs/1603.02754 |
|
398 | + | """ |
|
399 | + | node.value = -self.shrinkage * node.sum_gradients / ( |
|
400 | + | node.sum_hessians + self.splitter.l2_regularization) |
|
401 | + | self.finalized_leaves.append(node) |
|
402 | + | ||
403 | + | def _finalize_splittable_nodes(self): |
|
404 | + | """Transform all splittable nodes into leaves. |
|
405 | + | ||
406 | + | Used when some constraint is met e.g. maximum number of leaves or |
|
407 | + | maximum depth.""" |
|
408 | + | while len(self.splittable_nodes) > 0: |
|
409 | + | node = self.splittable_nodes.pop() |
|
410 | + | self._finalize_leaf(node) |
|
411 | + | ||
412 | + | def make_predictor(self, bin_thresholds=None): |
|
413 | + | """Make a TreePredictor object out of the current tree. |
|
414 | + | ||
415 | + | Parameters |
|
416 | + | ---------- |
|
417 | + | bin_thresholds : array-like of floats, optional (default=None) |
|
418 | + | The actual threshold values of each bin. |
|
419 | + | ||
420 | + | Returns |
|
421 | + | ------- |
|
422 | + | A TreePredictor object. |
|
423 | + | """ |
|
424 | + | predictor_nodes = np.zeros(self.n_nodes, dtype=PREDICTOR_RECORD_DTYPE) |
|
425 | + | _fill_predictor_node_array(predictor_nodes, self.root, |
|
426 | + | bin_thresholds=bin_thresholds) |
|
427 | + | return TreePredictor(predictor_nodes) |
|
428 | + | ||
429 | + | ||
430 | + | def _fill_predictor_node_array(predictor_nodes, grower_node, |
|
431 | + | bin_thresholds, next_free_idx=0): |
|
432 | + | """Helper used in make_predictor to set the TreePredictor fields.""" |
|
433 | + | node = predictor_nodes[next_free_idx] |
|
434 | + | node['count'] = grower_node.n_samples |
|
435 | + | node['depth'] = grower_node.depth |
|
436 | + | if grower_node.split_info is not None: |
|
437 | + | node['gain'] = grower_node.split_info.gain |
|
438 | + | else: |
|
439 | + | node['gain'] = -1 |
|
440 | + | ||
441 | + | if grower_node.value is not None: |
|
442 | + | # Leaf node |
|
443 | + | node['is_leaf'] = True |
|
444 | + | node['value'] = grower_node.value |
|
445 | + | return next_free_idx + 1 |
|
446 | + | else: |
|
447 | + | # Decision node |
|
448 | + | split_info = grower_node.split_info |
|
449 | + | feature_idx, bin_idx = split_info.feature_idx, split_info.bin_idx |
|
450 | + | node['feature_idx'] = feature_idx |
|
451 | + | node['bin_threshold'] = bin_idx |
|
452 | + | if bin_thresholds is not None: |
|
453 | + | threshold = bin_thresholds[feature_idx][bin_idx] |
|
454 | + | node['threshold'] = threshold |
|
455 | + | next_free_idx += 1 |
|
456 | + | ||
457 | + | node['left'] = next_free_idx |
|
458 | + | next_free_idx = _fill_predictor_node_array( |
|
459 | + | predictor_nodes, grower_node.left_child, |
|
460 | + | bin_thresholds=bin_thresholds, next_free_idx=next_free_idx) |
|
461 | + | ||
462 | + | node['right'] = next_free_idx |
|
463 | + | return _fill_predictor_node_array( |
|
464 | + | predictor_nodes, grower_node.right_child, |
|
465 | + | bin_thresholds=bin_thresholds, next_free_idx=next_free_idx) |
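
A similarly hedged sketch of how the grower above might be exercised on its own, outside the public estimators. The toy binned data and parameter values are assumptions; the constructor arguments, grow() and make_predictor() are the ones defined in this file, and X_binned must already be bin-encoded as uint8 and Fortran-aligned, as _validate_parameters enforces.

    import numpy as np
    from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower
    from sklearn.ensemble._hist_gradient_boosting.types import G_H_DTYPE

    # Assumed toy inputs: random bin indices standing in for the output of the
    # binning step, and gradients/hessians of a constant-hessian loss.
    rng = np.random.RandomState(0)
    n_samples, n_features = 1000, 3
    X_binned = np.asfortranarray(
        rng.randint(0, 256, size=(n_samples, n_features)).astype(np.uint8))
    gradients = rng.normal(size=n_samples).astype(G_H_DTYPE)
    hessians = np.ones(1, dtype=G_H_DTYPE)  # shape (1,) => constant hessians

    grower = TreeGrower(X_binned, gradients, hessians,
                        max_leaf_nodes=31, min_samples_leaf=20)
    grower.grow()  # best-first splitting until no splittable node remains
    predictor = grower.make_predictor()  # TreePredictor built from the tree
    print(grower.n_nodes, len(grower.finalized_leaves))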
4 | 4 | ||
5 | 5 | def configuration(parent_package="", top_path=None): |
|
6 | 6 | config = Configuration("ensemble", parent_package, top_path) |
|
7 | + | ||
7 | 8 | config.add_extension("_gradient_boosting", |
|
8 | 9 | sources=["_gradient_boosting.pyx"], |
|
9 | 10 | include_dirs=[numpy.get_include()]) |
|
10 | 11 | ||
11 | 12 | config.add_subpackage("tests") |
|
12 | 13 | ||
14 | + | # Histogram-based gradient boosting files |
|
15 | + | config.add_extension( |
|
16 | + | "_hist_gradient_boosting._gradient_boosting", |
|
17 | + | sources=["_hist_gradient_boosting/_gradient_boosting.pyx"], |
|
18 | + | include_dirs=[numpy.get_include()]) |
|
19 | + | ||
20 | + | config.add_extension("_hist_gradient_boosting.histogram", |
|
21 | + | sources=["_hist_gradient_boosting/histogram.pyx"], |
|
22 | + | include_dirs=[numpy.get_include()]) |
|
23 | + | ||
24 | + | config.add_extension("_hist_gradient_boosting.splitting", |
|
25 | + | sources=["_hist_gradient_boosting/splitting.pyx"], |
|
26 | + | include_dirs=[numpy.get_include()]) |
|
27 | + | ||
28 | + | config.add_extension("_hist_gradient_boosting._binning", |
|
29 | + | sources=["_hist_gradient_boosting/_binning.pyx"], |
|
30 | + | include_dirs=[numpy.get_include()]) |
|
31 | + | ||
32 | + | config.add_extension("_hist_gradient_boosting._predictor", |
|
33 | + | sources=["_hist_gradient_boosting/_predictor.pyx"], |
|
34 | + | include_dirs=[numpy.get_include()]) |
|
35 | + | ||
36 | + | config.add_extension("_hist_gradient_boosting._loss", |
|
37 | + | sources=["_hist_gradient_boosting/_loss.pyx"], |
|
38 | + | include_dirs=[numpy.get_include()]) |
|
39 | + | ||
40 | + | config.add_extension("_hist_gradient_boosting.types", |
|
41 | + | sources=["_hist_gradient_boosting/types.pyx"], |
|
42 | + | include_dirs=[numpy.get_include()]) |
|
43 | + | ||
44 | + | config.add_extension("_hist_gradient_boosting.utils", |
|
45 | + | sources=["_hist_gradient_boosting/utils.pyx"], |
|
46 | + | include_dirs=[numpy.get_include()]) |
|
47 | + | ||
48 | + | config.add_subpackage("_hist_gradient_boosting.tests") |
|
49 | + | ||
13 | 50 | return config |
|
14 | 51 | ||
15 | 52 | if __name__ == "__main__": |
Showing 61 files with coverage changes found:
sklearn/neighbors/setup.py
sklearn/preprocessing/setup.py
sklearn/feature_extraction/setup.py
sklearn/utils/setup.py
sklearn/tree/setup.py
sklearn/ensemble/partial_dependence.py
sklearn/manifold/setup.py
sklearn/datasets/setup.py
sklearn/metrics/setup.py
sklearn/tests/test_docstring_parameters.py
sklearn/decomposition/setup.py
sklearn/linear_model/setup.py
sklearn/metrics/cluster/setup.py
sklearn/inspection/partial_dependence.py
sklearn/__check_build/setup.py
sklearn/inspection/tests/test_partial_dependence.py
sklearn/ensemble/tests/test_partial_dependence.py
sklearn/_build_utils/__init__.py
sklearn/utils/fixes.py
sklearn/compose/tests/test_column_transformer.py
sklearn/tree/export.py
sklearn/utils/testing.py
sklearn/manifold/spectral_embedding_.py
sklearn/utils/tests/test_testing.py
sklearn/_build_utils/openmp_helpers.py
sklearn/compose/_column_transformer.py
sklearn/preprocessing/tests/test_encoders.py
sklearn/utils/tests/test_utils.py
sklearn/tree/tests/test_export.py
sklearn/utils/tests/test_validation.py
sklearn/preprocessing/tests/test_function_transformer.py
sklearn/manifold/tests/test_spectral_embedding.py
sklearn/cluster/tests/test_spectral.py
sklearn/tree/_reingold_tilford.py
sklearn/tests/test_impute.py
sklearn/utils/tests/test_multiclass.py
sklearn/utils/deprecation.py
sklearn/svm/classes.py
sklearn/utils/__init__.py
sklearn/metrics/pairwise.py
sklearn/linear_model/ridge.py
sklearn/neighbors/tests/test_dist_metrics.py
sklearn/utils/validation.py
sklearn/utils/multiclass.py
sklearn/gaussian_process/gpr.py
sklearn/metrics/cluster/supervised.py
sklearn/preprocessing/_encoders.py
sklearn/model_selection/tests/test_validation.py
sklearn/impute.py
sklearn/utils/estimator_checks.py
sklearn/model_selection/tests/test_split.py
sklearn/model_selection/tests/test_search.py
sklearn/metrics/tests/test_classification.py
sklearn/ensemble/_hist_gradient_boosting/predictor.py
sklearn/ensemble/_hist_gradient_boosting/loss.py
sklearn/experimental/enable_hist_gradient_boosting.py
sklearn/ensemble/_hist_gradient_boosting/grower.py
sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
sklearn/experimental/tests/test_enable_hist_gradient_boosting.py
sklearn/ensemble/_hist_gradient_boosting/binning.py