scikit-learn / scikit-learn

# Compare `b8ef9a9 ... +249 ... 280c487`

@@ -0,0 +1,247 @@ (new file: `sklearn/ensemble/_hist_gradient_boosting/loss.py`)

```python
"""
This module contains the loss classes.

Specific losses are used for regression, binary classification or multiclass
classification.
"""
# Author: Nicolas Hug

from abc import ABC, abstractmethod

import numpy as np
from scipy.special import expit
try:  # logsumexp was moved from misc to special in 0.19
    from scipy.special import logsumexp
except ImportError:
    from scipy.misc import logsumexp

from .types import Y_DTYPE
from .types import G_H_DTYPE
from ._loss import _update_gradients_least_squares
from ._loss import _update_gradients_hessians_binary_crossentropy
from ._loss import _update_gradients_hessians_categorical_crossentropy


class BaseLoss(ABC):
    """Base class for a loss."""

    def init_gradients_and_hessians(self, n_samples, prediction_dim):
        """Return initial gradients and hessians.

        Unless hessians are constant, arrays are initialized with undefined
        values.

        Parameters
        ----------
        n_samples : int
            The number of samples passed to `fit()`.
        prediction_dim : int
            The dimension of a raw prediction, i.e. the number of trees
            built at each iteration. Equals 1 for regression and binary
            classification, or K where K is the number of classes for
            multiclass classification.

        Returns
        -------
        gradients : ndarray, shape (prediction_dim, n_samples)
            The initial gradients. The array is not initialized.
        hessians : ndarray, shape (prediction_dim, n_samples)
            If hessians are constant (e.g. for the `LeastSquares` loss), the
            array is initialized to ``1``. Otherwise, the array is allocated
            without being initialized.
        """
        shape = (prediction_dim, n_samples)
        gradients = np.empty(shape=shape, dtype=G_H_DTYPE)
        if self.hessians_are_constant:
            # If the hessians are constant, we consider they are equal to 1.
            # This is correct as long as we adjust the gradients. See e.g.
            # the LeastSquares loss.
            hessians = np.ones(shape=(1, 1), dtype=G_H_DTYPE)
        else:
            hessians = np.empty(shape=shape, dtype=G_H_DTYPE)

        return gradients, hessians

    @abstractmethod
    def get_baseline_prediction(self, y_train, prediction_dim):
        """Return initial predictions (before the first iteration).

        Parameters
        ----------
        y_train : ndarray, shape (n_samples,)
            The target training values.
        prediction_dim : int
            The dimension of one prediction: 1 for binary classification and
            regression, n_classes for multiclass classification.

        Returns
        -------
        baseline_prediction : float or ndarray, shape (1, prediction_dim)
            The baseline prediction.
        """

    @abstractmethod
    def update_gradients_and_hessians(self, gradients, hessians, y_true,
                                      raw_predictions):
        """Update gradients and hessians arrays, in-place.

        The gradients (resp. hessians) are the first (resp. second) order
        derivatives of the loss for each sample with respect to the
        predictions of the model, evaluated at iteration ``i - 1``.

        Parameters
        ----------
        gradients : ndarray, shape (prediction_dim, n_samples)
            The gradients (treated as OUT array).
        hessians : ndarray, shape (prediction_dim, n_samples) or (1,)
            The hessians (treated as OUT array).
        y_true : ndarray, shape (n_samples,)
            The true target values of each training sample.
        raw_predictions : ndarray, shape (prediction_dim, n_samples)
            The raw_predictions (i.e. values from the trees) of the tree
            ensemble at iteration ``i - 1``.
        """


class LeastSquares(BaseLoss):
    """Least squares loss, for regression.

    For a given sample x_i, least squares loss is defined as::

        loss(x_i) = 0.5 * (y_true_i - raw_pred_i)**2

    This actually computes the half least squares loss to simplify the
    computation of the gradients and get a unit hessian (and to be consistent
    with what is done in LightGBM).
    """

    hessians_are_constant = True

    def __call__(self, y_true, raw_predictions, average=True):
        # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely
        # to return a view.
        raw_predictions = raw_predictions.reshape(-1)
        loss = 0.5 * np.power(y_true - raw_predictions, 2)
        return loss.mean() if average else loss

    def get_baseline_prediction(self, y_train, prediction_dim):
        return np.mean(y_train)

    @staticmethod
    def inverse_link_function(raw_predictions):
        return raw_predictions

    def update_gradients_and_hessians(self, gradients, hessians, y_true,
                                      raw_predictions):
        # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely
        # to return a view.
        raw_predictions = raw_predictions.reshape(-1)
        gradients = gradients.reshape(-1)
        _update_gradients_least_squares(gradients, y_true, raw_predictions)


class BinaryCrossEntropy(BaseLoss):
    """Binary cross-entropy loss, for binary classification.

    For a given sample x_i, the binary cross-entropy loss is defined as the
    negative log-likelihood of the model which can be expressed as::

        loss(x_i) = log(1 + exp(raw_pred_i)) - y_true_i * raw_pred_i

    See The Elements of Statistical Learning, by Hastie, Tibshirani, Friedman,
    section 4.4.1 (about logistic regression).
    """

    hessians_are_constant = False
    inverse_link_function = staticmethod(expit)

    def __call__(self, y_true, raw_predictions, average=True):
        # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely
        # to return a view.
        raw_predictions = raw_predictions.reshape(-1)
        # logaddexp(0, x) = log(1 + exp(x))
        loss = np.logaddexp(0, raw_predictions) - y_true * raw_predictions
        return loss.mean() if average else loss

    def get_baseline_prediction(self, y_train, prediction_dim):
        if prediction_dim > 2:
            raise ValueError(
                "loss='binary_crossentropy' is not defined for multiclass"
                " classification with n_classes=%d, use"
                " loss='categorical_crossentropy' instead" % prediction_dim)
        proba_positive_class = np.mean(y_train)
        eps = np.finfo(y_train.dtype).eps
        proba_positive_class = np.clip(proba_positive_class, eps, 1 - eps)
        # log(x / (1 - x)) is the inverse of the sigmoid (expit) function,
        # i.e. the link function of the Binomial model.
        return np.log(proba_positive_class / (1 - proba_positive_class))

    def update_gradients_and_hessians(self, gradients, hessians, y_true,
                                      raw_predictions):
        # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely
        # to return a view.
        raw_predictions = raw_predictions.reshape(-1)
        gradients = gradients.reshape(-1)
        hessians = hessians.reshape(-1)
        _update_gradients_hessians_binary_crossentropy(
            gradients, hessians, y_true, raw_predictions)

    def predict_proba(self, raw_predictions):
        # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely
        # to return a view.
        raw_predictions = raw_predictions.reshape(-1)
        proba = np.empty((raw_predictions.shape[0], 2), dtype=Y_DTYPE)
        proba[:, 1] = expit(raw_predictions)
        proba[:, 0] = 1 - proba[:, 1]
        return proba


class CategoricalCrossEntropy(BaseLoss):
    """Categorical cross-entropy loss, for multiclass classification.

    For a given sample x_i, the categorical cross-entropy loss is defined as
    the negative log-likelihood of the model and generalizes the binary
    cross-entropy to more than 2 classes.
    """

    hessians_are_constant = False

    def __call__(self, y_true, raw_predictions, average=True):
        one_hot_true = np.zeros_like(raw_predictions)
        prediction_dim = raw_predictions.shape[0]
        for k in range(prediction_dim):
            one_hot_true[k, :] = (y_true == k)

        loss = (logsumexp(raw_predictions, axis=0) -
                (one_hot_true * raw_predictions).sum(axis=0))
        return loss.mean() if average else loss

    def get_baseline_prediction(self, y_train, prediction_dim):
        init_value = np.zeros(shape=(prediction_dim, 1), dtype=Y_DTYPE)
        eps = np.finfo(y_train.dtype).eps
        for k in range(prediction_dim):
            proba_kth_class = np.mean(y_train == k)
            proba_kth_class = np.clip(proba_kth_class, eps, 1 - eps)
            init_value[k, :] += np.log(proba_kth_class)

        return init_value

    def update_gradients_and_hessians(self, gradients, hessians, y_true,
                                      raw_predictions):
        _update_gradients_hessians_categorical_crossentropy(
            gradients, hessians, y_true, raw_predictions)

    def predict_proba(self, raw_predictions):
        # TODO: This could be done in parallel
        # compute softmax (using exp(log(softmax)))
        proba = np.exp(raw_predictions -
                       logsumexp(raw_predictions, axis=0)[np.newaxis, :])
        return proba.T


_LOSSES = {
    'least_squares': LeastSquares,
    'binary_crossentropy': BinaryCrossEntropy,
    'categorical_crossentropy': CategoricalCrossEntropy
}
```
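The Cython helpers imported from `._loss` do the actual per-sample work. As a sanity check on the convention above (gradients and hessians are derivatives of the loss w.r.t. the raw predictions), here is a minimal pure-NumPy sketch of the binary cross-entropy case; `bce_gradients_hessians` is a hypothetical stand-in for illustration, not this module's API, and the gradient is verified against a finite difference:

```python
import numpy as np
from scipy.special import expit

def bce_gradients_hessians(y_true, raw_predictions):
    """Hypothetical pure-NumPy equivalent of what the Cython helper
    _update_gradients_hessians_binary_crossentropy computes in-place.

    For loss(x_i) = log(1 + exp(raw_pred_i)) - y_true_i * raw_pred_i:
        gradient = expit(raw_pred) - y_true
        hessian  = expit(raw_pred) * (1 - expit(raw_pred))
    """
    p = expit(raw_predictions)
    return p - y_true, p * (1 - p)

rng = np.random.RandomState(0)
y = rng.randint(0, 2, size=5).astype(float)
raw = rng.normal(size=5)
gradients, hessians = bce_gradients_hessians(y, raw)

# Verify the gradient with a central finite difference of the loss
eps = 1e-6
def loss(r):
    return np.logaddexp(0, r) - y * r
numeric = (loss(raw + eps) - loss(raw - eps)) / (2 * eps)
assert np.allclose(gradients, numeric, atol=1e-5)
```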

```diff
@@ -396,6 +396,12 @@
         # which is more features than we have in most cases.
         estimator.set_params(k=1)

+    if name in ('HistGradientBoostingClassifier',
+                'HistGradientBoostingRegressor'):
+        # The default min_samples_leaf (20) isn't appropriate for the small
+        # datasets that the checks use: only very shallow trees would be
+        # built.
+        estimator.set_params(min_samples_leaf=5)
+

 class NotAnArray:
     """An object that is convertible to an array
```
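Why 20 is too large here: as the grower below shows (`_initialize_root`), a node is finalized as a leaf outright whenever `n_samples < 2 * min_samples_leaf`, since no split could leave both children with at least `min_samples_leaf` samples. A tiny sketch of that arithmetic on the small toy datasets the common checks use (the helper name is made up for illustration):

```python
# Hypothetical helper, for illustration only: mirrors the grower's
# "do not even bother splitting" condition on the root node.
def root_is_trivially_a_leaf(n_samples, min_samples_leaf):
    return n_samples < 2 * min_samples_leaf

# With the default min_samples_leaf=20, a 30-sample check dataset yields a
# stump (the root is immediately finalized); with 5, splitting can proceed.
assert root_is_trivially_a_leaf(n_samples=30, min_samples_leaf=20)
assert not root_is_trivially_a_leaf(n_samples=30, min_samples_leaf=5)
```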
```diff
@@ -2462,6 +2468,7 @@
                      if hasattr(estimator, method)}

     # Fit again
+    set_random_state(estimator)
     estimator.fit(X_train, y_train)

     for method in check_methods:
```
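Re-seeding before the second fit keeps the check deterministic for stochastic estimators. A hedged usage sketch, assuming `set_random_state` from `sklearn.utils.testing` (it pins any `random_state` parameter to a value drawn from a fixed seed):

```python
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.testing import set_random_state  # sklearn.utils._testing in later versions

rng = np.random.RandomState(0)
X = rng.rand(50, 3)
y = (X[:, 0] > 0.5).astype(int)

est = RandomForestClassifier(n_estimators=5)
set_random_state(est)   # pins est.random_state from a fixed seed
first = est.fit(X, y).predict_proba(X)

set_random_state(est)   # re-seed, as the check now does before refitting
second = est.fit(X, y).predict_proba(X)
assert np.allclose(first, second)  # the two fits are identical
```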

```diff
@@ -2012,6 +2012,7 @@

     See also
     --------
+    sklearn.ensemble.HistGradientBoostingClassifier,
     sklearn.tree.DecisionTreeClassifier, RandomForestClassifier
     AdaBoostClassifier

```
```diff
@@ -2472,7 +2473,8 @@

     See also
     --------
-    DecisionTreeRegressor, RandomForestRegressor
+    sklearn.ensemble.HistGradientBoostingRegressor,
+    sklearn.tree.DecisionTreeRegressor, RandomForestRegressor

     References
     ----------
```

@@ -0,0 +1,465 @@ (new file: `sklearn/ensemble/_hist_gradient_boosting/grower.py`)

```python
"""
This module contains the TreeGrower class.

TreeGrower builds a regression tree fitting a Newton-Raphson step, based on
the gradients and hessians of the training data.
"""
# Author: Nicolas Hug

from heapq import heappush, heappop
import numpy as np
from timeit import default_timer as time
import numbers

from .splitting import Splitter
from .histogram import HistogramBuilder
from .predictor import TreePredictor, PREDICTOR_RECORD_DTYPE
from .utils import sum_parallel


class TreeNode:
    """Tree Node class used in TreeGrower.

    This isn't used for prediction purposes, only for training (see
    TreePredictor).

    Parameters
    ----------
    depth : int
        The depth of the node, i.e. its distance from the root.
    sample_indices : ndarray of unsigned int, shape (n_samples_at_node,)
        The indices of the samples at the node.
    sum_gradients : float
        The sum of the gradients of the samples at the node.
    sum_hessians : float
        The sum of the hessians of the samples at the node.
    parent : TreeNode or None, optional (default=None)
        The parent of the node. None for root.

    Attributes
    ----------
    depth : int
        The depth of the node, i.e. its distance from the root.
    sample_indices : ndarray of unsigned int, shape (n_samples_at_node,)
        The indices of the samples at the node.
    sum_gradients : float
        The sum of the gradients of the samples at the node.
    sum_hessians : float
        The sum of the hessians of the samples at the node.
    parent : TreeNode or None
        The parent of the node. None for root.
    split_info : SplitInfo or None
        The result of the split evaluation.
    left_child : TreeNode or None
        The left child of the node. None for leaves.
    right_child : TreeNode or None
        The right child of the node. None for leaves.
    value : float or None
        The value of the leaf, as computed in finalize_leaf(). None for
        non-leaf nodes.
    partition_start : int
        start position of the node's sample_indices in splitter.partition.
    partition_stop : int
        stop position of the node's sample_indices in splitter.partition.
    """

    split_info = None
    left_child = None
    right_child = None
    value = None
    histograms = None
    sibling = None
    parent = None

    # start and stop indices of the node in the splitter.partition
    # array. Concretely,
    # self.sample_indices = view(self.splitter.partition[start:stop])
    # Please see the comments about splitter.partition and
    # splitter.split_indices for more info about this design.
    # These 2 attributes are only used in _update_raw_prediction, because we
    # need to iterate over the leaves and I don't know how to efficiently
    # store the sample_indices views because they're all of different sizes.
    partition_start = 0
    partition_stop = 0

    def __init__(self, depth, sample_indices, sum_gradients,
                 sum_hessians, parent=None):
        self.depth = depth
        self.sample_indices = sample_indices
        self.n_samples = sample_indices.shape[0]
        self.sum_gradients = sum_gradients
        self.sum_hessians = sum_hessians
        self.parent = parent

    def __lt__(self, other_node):
        """Comparison for priority queue.

        Nodes with high gain are higher priority than nodes with low gain.

        heapq.heappush only needs the '<' operator.
        heapq.heappop takes the smallest item first (smaller is higher
        priority).

        Parameters
        ----------
        other_node : TreeNode
            The node to compare with.
        """
        return self.split_info.gain > other_node.split_info.gain


class TreeGrower:
    """Tree grower class used to build a tree.

    The tree is fitted to predict the values of a Newton-Raphson step. The
    splits are considered in a best-first fashion, and the quality of a
    split is defined in splitting._split_gain.

    Parameters
    ----------
    X_binned : ndarray of int, shape (n_samples, n_features)
        The binned input samples. Must be Fortran-aligned.
    gradients : ndarray, shape (n_samples,)
        The gradients of each training sample. Those are the gradients of the
        loss w.r.t the predictions, evaluated at iteration ``i - 1``.
    hessians : ndarray, shape (n_samples,)
        The hessians of each training sample. Those are the hessians of the
        loss w.r.t the predictions, evaluated at iteration ``i - 1``.
    max_leaf_nodes : int or None, optional (default=None)
        The maximum number of leaves for each tree. If None, there is no
        maximum limit.
    max_depth : int or None, optional (default=None)
        The maximum depth of each tree. The depth of a tree is the number of
        nodes to go from the root to the deepest leaf.
    min_samples_leaf : int, optional (default=20)
        The minimum number of samples per leaf.
    min_gain_to_split : float, optional (default=0.)
        The minimum gain needed to split a node. Splits with lower gain will
        be ignored.
    max_bins : int, optional (default=256)
        The maximum number of bins. Used to define the shape of the
        histograms.
    actual_n_bins : ndarray of int or int, optional (default=None)
        The actual number of bins needed for each feature, which is lower or
        equal to ``max_bins``. If it's an int, all features are considered to
        have the same number of bins. If None, all features are considered to
        have ``max_bins`` bins.
    l2_regularization : float, optional (default=0)
        The L2 regularization parameter.
    min_hessian_to_split : float, optional (default=1e-3)
        The minimum sum of hessians needed in each node. Splits that result
        in at least one child having a sum of hessians less than
        ``min_hessian_to_split`` are discarded.
    shrinkage : float, optional (default=1)
        The shrinkage parameter to apply to the leaves values, also known as
        learning rate.
    """
    def __init__(self, X_binned, gradients, hessians, max_leaf_nodes=None,
                 max_depth=None, min_samples_leaf=20, min_gain_to_split=0.,
                 max_bins=256, actual_n_bins=None, l2_regularization=0.,
                 min_hessian_to_split=1e-3, shrinkage=1.):

        self._validate_parameters(X_binned, max_leaf_nodes, max_depth,
                                  min_samples_leaf, min_gain_to_split,
                                  l2_regularization, min_hessian_to_split)

        if actual_n_bins is None:
            actual_n_bins = max_bins

        if isinstance(actual_n_bins, numbers.Integral):
            actual_n_bins = np.array(
                [actual_n_bins] * X_binned.shape[1],
                dtype=np.uint32)
        else:
            actual_n_bins = np.asarray(actual_n_bins, dtype=np.uint32)

        hessians_are_constant = hessians.shape[0] == 1
        self.histogram_builder = HistogramBuilder(
            X_binned, max_bins, gradients, hessians, hessians_are_constant)
        self.splitter = Splitter(
            X_binned, max_bins, actual_n_bins, l2_regularization,
            min_hessian_to_split, min_samples_leaf, min_gain_to_split,
            hessians_are_constant)
        self.max_leaf_nodes = max_leaf_nodes
        self.max_bins = max_bins
        self.n_features = X_binned.shape[1]
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.X_binned = X_binned
        self.min_gain_to_split = min_gain_to_split
        self.shrinkage = shrinkage
        self.splittable_nodes = []
        self.finalized_leaves = []
        self.total_find_split_time = 0.  # time spent finding the best splits
        self.total_compute_hist_time = 0.  # time spent computing histograms
        self.total_apply_split_time = 0.  # time spent splitting nodes
        self._initialize_root(gradients, hessians, hessians_are_constant)
        self.n_nodes = 1

    def _validate_parameters(self, X_binned, max_leaf_nodes, max_depth,
                             min_samples_leaf, min_gain_to_split,
                             l2_regularization, min_hessian_to_split):
        """Validate parameters passed to __init__.

        Also validate parameters passed to splitter.
        """
        if X_binned.dtype != np.uint8:
            raise NotImplementedError(
                "X_binned must be of type uint8.")
        if not X_binned.flags.f_contiguous:
            raise ValueError(
                "X_binned should be passed as Fortran contiguous "
                "array for maximum efficiency.")
        if max_leaf_nodes is not None and max_leaf_nodes <= 1:
            raise ValueError('max_leaf_nodes={} should not be'
                             ' smaller than 2'.format(max_leaf_nodes))
        if max_depth is not None and max_depth <= 1:
            raise ValueError('max_depth={} should not be'
                             ' smaller than 2'.format(max_depth))
        if min_samples_leaf < 1:
            raise ValueError('min_samples_leaf={} should '
                             'not be smaller than 1'.format(min_samples_leaf))
        if min_gain_to_split < 0:
            raise ValueError('min_gain_to_split={} '
                             'must be positive.'.format(min_gain_to_split))
        if l2_regularization < 0:
            raise ValueError('l2_regularization={} must be '
                             'positive.'.format(l2_regularization))
        if min_hessian_to_split < 0:
            raise ValueError('min_hessian_to_split={} '
                             'must be positive.'.format(min_hessian_to_split))

    def grow(self):
        """Grow the tree, from root to leaves."""
        while self.splittable_nodes:
            self.split_next()

    def _initialize_root(self, gradients, hessians, hessians_are_constant):
        """Initialize root node and finalize it if needed."""
        n_samples = self.X_binned.shape[0]
        depth = 0
        sum_gradients = sum_parallel(gradients)
        if self.histogram_builder.hessians_are_constant:
            sum_hessians = hessians[0] * n_samples
        else:
            sum_hessians = sum_parallel(hessians)
        self.root = TreeNode(
            depth=depth,
            sample_indices=self.splitter.partition,
            sum_gradients=sum_gradients,
            sum_hessians=sum_hessians
        )

        self.root.partition_start = 0
        self.root.partition_stop = n_samples

        if self.root.n_samples < 2 * self.min_samples_leaf:
            # Do not even bother computing any splitting statistics.
            self._finalize_leaf(self.root)
            return
        if sum_hessians < self.splitter.min_hessian_to_split:
            self._finalize_leaf(self.root)
            return

        self.root.histograms = self.histogram_builder.compute_histograms_brute(
            self.root.sample_indices)
        self._compute_best_split_and_push(self.root)

    def _compute_best_split_and_push(self, node):
        """Compute the best possible split (SplitInfo) of a given node.

        Also push it in the heap of splittable nodes if gain isn't zero.
        The gain of a node is 0 if either all the leaves are pure
        (best gain = 0), or if no split would satisfy the constraints
        (min_hessians_to_split, min_gain_to_split, min_samples_leaf).
        """

        node.split_info = self.splitter.find_node_split(
            node.sample_indices, node.histograms, node.sum_gradients,
            node.sum_hessians)

        if node.split_info.gain <= 0:  # no valid split
            self._finalize_leaf(node)
        else:
            heappush(self.splittable_nodes, node)

    def split_next(self):
        """Split the node with highest potential gain.

        Returns
        -------
        left : TreeNode
            The resulting left child.
        right : TreeNode
            The resulting right child.
        """
        # Consider the node with the highest loss reduction (a.k.a. gain)
        node = heappop(self.splittable_nodes)

        tic = time()
        (sample_indices_left,
         sample_indices_right,
         right_child_pos) = self.splitter.split_indices(node.split_info,
                                                        node.sample_indices)
        self.total_apply_split_time += time() - tic

        depth = node.depth + 1
        n_leaf_nodes = len(self.finalized_leaves) + len(self.splittable_nodes)
        n_leaf_nodes += 2

        left_child_node = TreeNode(depth,
                                   sample_indices_left,
                                   node.split_info.sum_gradient_left,
                                   node.split_info.sum_hessian_left,
                                   parent=node)
        right_child_node = TreeNode(depth,
                                    sample_indices_right,
                                    node.split_info.sum_gradient_right,
                                    node.split_info.sum_hessian_right,
                                    parent=node)
        left_child_node.sibling = right_child_node
        right_child_node.sibling = left_child_node
        node.right_child = right_child_node
        node.left_child = left_child_node

        # set start and stop indices
        left_child_node.partition_start = node.partition_start
        left_child_node.partition_stop = node.partition_start + right_child_pos
        right_child_node.partition_start = left_child_node.partition_stop
        right_child_node.partition_stop = node.partition_stop

        self.n_nodes += 2

        if self.max_depth is not None and depth == self.max_depth:
            self._finalize_leaf(left_child_node)
            self._finalize_leaf(right_child_node)
            return left_child_node, right_child_node

        if (self.max_leaf_nodes is not None
                and n_leaf_nodes == self.max_leaf_nodes):
            self._finalize_leaf(left_child_node)
            self._finalize_leaf(right_child_node)
            self._finalize_splittable_nodes()
            return left_child_node, right_child_node

        if left_child_node.n_samples < self.min_samples_leaf * 2:
            self._finalize_leaf(left_child_node)
        if right_child_node.n_samples < self.min_samples_leaf * 2:
            self._finalize_leaf(right_child_node)

        # Compute histograms of children, and compute their best possible
        # split (if needed)
        should_split_left = left_child_node.value is None  # node isn't a leaf
        should_split_right = right_child_node.value is None
        if should_split_left or should_split_right:

            # We will compute the histograms of both nodes even if one of them
            # is a leaf, since computing the second histogram is very cheap
            # (using histogram subtraction).
            n_samples_left = left_child_node.sample_indices.shape[0]
            n_samples_right = right_child_node.sample_indices.shape[0]
            if n_samples_left < n_samples_right:
                smallest_child = left_child_node
                largest_child = right_child_node
            else:
                smallest_child = right_child_node
                largest_child = left_child_node

            # We use the brute O(n_samples) method on the child that has the
            # smallest number of samples, and the subtraction trick O(n_bins)
            # on the other one.
            tic = time()
            smallest_child.histograms = \
                self.histogram_builder.compute_histograms_brute(
                    smallest_child.sample_indices)
            largest_child.histograms = \
                self.histogram_builder.compute_histograms_subtraction(
                    node.histograms, smallest_child.histograms)
            self.total_compute_hist_time += time() - tic

            tic = time()
            if should_split_left:
                self._compute_best_split_and_push(left_child_node)
            if should_split_right:
                self._compute_best_split_and_push(right_child_node)
            self.total_find_split_time += time() - tic

        return left_child_node, right_child_node

    def _finalize_leaf(self, node):
        """Compute the prediction value that minimizes the objective function.

        This sets the node.value attribute (node is a leaf iff node.value is
        not None).

        See Equation 5 of:
        XGBoost: A Scalable Tree Boosting System, T. Chen, C. Guestrin, 2016
        https://arxiv.org/abs/1603.02754
        """
        node.value = -self.shrinkage * node.sum_gradients / (
            node.sum_hessians + self.splitter.l2_regularization)
        self.finalized_leaves.append(node)

    def _finalize_splittable_nodes(self):
        """Transform all splittable nodes into leaves.

        Used when some constraint is met, e.g. maximum number of leaves or
        maximum depth."""
        while len(self.splittable_nodes) > 0:
            node = self.splittable_nodes.pop()
            self._finalize_leaf(node)

    def make_predictor(self, bin_thresholds=None):
        """Make a TreePredictor object out of the current tree.

        Parameters
        ----------
        bin_thresholds : array-like of floats, optional (default=None)
            The actual threshold values of each bin.

        Returns
        -------
        A TreePredictor object.
        """
        predictor_nodes = np.zeros(self.n_nodes, dtype=PREDICTOR_RECORD_DTYPE)
        _fill_predictor_node_array(predictor_nodes, self.root,
                                   bin_thresholds=bin_thresholds)
        return TreePredictor(predictor_nodes)


def _fill_predictor_node_array(predictor_nodes, grower_node,
                               bin_thresholds, next_free_idx=0):
    """Helper used in make_predictor to set the TreePredictor fields."""
    node = predictor_nodes[next_free_idx]
    node['count'] = grower_node.n_samples
    node['depth'] = grower_node.depth
    if grower_node.split_info is not None:
        node['gain'] = grower_node.split_info.gain
    else:
        node['gain'] = -1

    if grower_node.value is not None:
        # Leaf node
        node['is_leaf'] = True
        node['value'] = grower_node.value
        return next_free_idx + 1
    else:
        # Decision node
        split_info = grower_node.split_info
        feature_idx, bin_idx = split_info.feature_idx, split_info.bin_idx
        node['feature_idx'] = feature_idx
        node['bin_threshold'] = bin_idx
        if bin_thresholds is not None:
            threshold = bin_thresholds[feature_idx][bin_idx]
            node['threshold'] = threshold
        next_free_idx += 1

        node['left'] = next_free_idx
        next_free_idx = _fill_predictor_node_array(
            predictor_nodes, grower_node.left_child,
            bin_thresholds=bin_thresholds, next_free_idx=next_free_idx)

        node['right'] = next_free_idx
        return _fill_predictor_node_array(
            predictor_nodes, grower_node.right_child,
            bin_thresholds=bin_thresholds, next_free_idx=next_free_idx)
```
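The subtraction trick in `split_next` deserves a standalone illustration: a parent's histogram is the sum of its children's, so the larger child's histogram can be derived in O(n_bins) instead of O(n_samples). A minimal NumPy sketch with made-up names (the real `HistogramBuilder` works per feature on structured arrays holding gradient sums, hessian sums and counts):

```python
import numpy as np

rng = np.random.RandomState(0)
n_bins = 8
binned_feature = rng.randint(0, n_bins, size=100)  # one binned column
gradients = rng.normal(size=100)

def histogram_brute(sample_indices):
    # O(n_samples): sum the gradients falling into each bin
    return np.bincount(binned_feature[sample_indices],
                       weights=gradients[sample_indices],
                       minlength=n_bins)

parent = np.arange(100)
left = parent[:30]    # smallest child: computed brute force
right = parent[30:]

hist_parent = histogram_brute(parent)
hist_left = histogram_brute(left)
# O(n_bins) subtraction trick: parent = left + right, hence
hist_right = hist_parent - hist_left
assert np.allclose(hist_right, histogram_brute(right))
```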

```diff
@@ -4,12 +4,49 @@

 def configuration(parent_package="", top_path=None):
     config = Configuration("ensemble", parent_package, top_path)
+
     config.add_extension("_gradient_boosting",
                          sources=["_gradient_boosting.pyx"],
                          include_dirs=[numpy.get_include()])

     config.add_subpackage("tests")

+    # Histogram-based gradient boosting files
+    config.add_extension(
+        "_hist_gradient_boosting._gradient_boosting",
+        sources=["_hist_gradient_boosting/_gradient_boosting.pyx"],
+        include_dirs=[numpy.get_include()])
+
+    config.add_extension("_hist_gradient_boosting.histogram",
+                         sources=["_hist_gradient_boosting/histogram.pyx"],
+                         include_dirs=[numpy.get_include()])
+
+    config.add_extension("_hist_gradient_boosting.splitting",
+                         sources=["_hist_gradient_boosting/splitting.pyx"],
+                         include_dirs=[numpy.get_include()])
+
+    config.add_extension("_hist_gradient_boosting._binning",
+                         sources=["_hist_gradient_boosting/_binning.pyx"],
+                         include_dirs=[numpy.get_include()])
+
+    config.add_extension("_hist_gradient_boosting._predictor",
+                         sources=["_hist_gradient_boosting/_predictor.pyx"],
+                         include_dirs=[numpy.get_include()])
+
+    config.add_extension("_hist_gradient_boosting._loss",
+                         sources=["_hist_gradient_boosting/_loss.pyx"],
+                         include_dirs=[numpy.get_include()])
+
+    config.add_extension("_hist_gradient_boosting.types",
+                         sources=["_hist_gradient_boosting/types.pyx"],
+                         include_dirs=[numpy.get_include()])
+
+    config.add_extension("_hist_gradient_boosting.utils",
+                         sources=["_hist_gradient_boosting/utils.pyx"],
+                         include_dirs=[numpy.get_include()])
+
+    config.add_subpackage("_hist_gradient_boosting.tests")
+
     return config

 if __name__ == "__main__":
```
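Each `add_extension` call above registers a Cython module with numpy.distutils; scikit-learn's build machinery (see `sklearn/_build_utils` in the coverage list below) cythonizes the `.pyx` sources to C before compilation. A minimal, hypothetical sketch of the same pattern in isolation (the `demo`/`fast` names are for illustration only):

```python
# Minimal numpy.distutils configuration, assuming a package `demo` with a
# single Cython source demo/fast.pyx (cythonized to C before building).
from numpy.distutils.misc_util import Configuration

def configuration(parent_package="", top_path=None):
    config = Configuration("demo", parent_package, top_path)
    # registers the compiled module demo.fast
    config.add_extension("fast", sources=["fast.pyx"])
    return config

if __name__ == "__main__":
    from numpy.distutils.core import setup
    setup(**configuration(top_path="").todict())
```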


Showing 61 files with coverage changes.

Changes in `sklearn/neighbors/setup.py`: -13, +13
Changes in `sklearn/preprocessing/setup.py`: -9, +9
Changes in `sklearn/feature_extraction/setup.py`: -10, +10
Changes in `sklearn/utils/setup.py`: -25, +25
Changes in `sklearn/tree/setup.py`: -14, +14
Changes in `sklearn/ensemble/partial_dependence.py`: -99, +99
Changes in `sklearn/manifold/setup.py`: -9, +9
Changes in `sklearn/datasets/setup.py`: -10, +10
Changes in `sklearn/metrics/setup.py`: -8, +8
Changes in `sklearn/tests/test_docstring_parameters.py`: -46, +46
Changes in `sklearn/decomposition/setup.py`: -8, +8
Changes in `sklearn/linear_model/setup.py`: -13, +13
Changes in `sklearn/metrics/cluster/setup.py`: -7, +7
Changes in `sklearn/inspection/partial_dependence.py`: -107, +107
Changes in `sklearn/__check_build/setup.py`: -4, +4
Changes in `sklearn/inspection/tests/test_partial_dependence.py`: -73, +73
Changes in `sklearn/ensemble/tests/test_partial_dependence.py`: -41, +41
Changes in `sklearn/_build_utils/__init__.py`: -15, +15
Changes in `sklearn/utils/fixes.py`: -31, +31
Changes in `sklearn/compose/tests/test_column_transformer.py`: -96, +96
Changes in `sklearn/tree/export.py`: -58, +58
Changes in `sklearn/utils/testing.py`: -47, +47
Changes in `sklearn/manifold/spectral_embedding_.py`: -15, +15
Changes in `sklearn/utils/tests/test_testing.py`: -19, +19
Changes in `sklearn/_build_utils/openmp_helpers.py`: -4, +4
Changes in `sklearn/compose/_column_transformer.py`: -16, +16
Changes in `sklearn/preprocessing/tests/test_encoders.py`: -29, +29
Changes in `sklearn/utils/tests/test_utils.py`: -14, +14
Changes in `sklearn/tree/tests/test_export.py`: -8, +8
Changes in `sklearn/utils/tests/test_validation.py`: -25, +25
Changes in `sklearn/preprocessing/tests/test_function_transformer.py`: -4, +4
Changes in `sklearn/manifold/tests/test_spectral_embedding.py`: -5, +5
Changes in `sklearn/cluster/tests/test_spectral.py`: -3, +3
Changes in `sklearn/tree/_reingold_tilford.py`: -3, +3
Changes in `sklearn/tests/test_impute.py`: -12, +12
Changes in `sklearn/utils/tests/test_multiclass.py`: -3, +3
Changes in `sklearn/utils/deprecation.py`: -1, +1
Changes in `sklearn/svm/classes.py`: -1, +1
Changes in `sklearn/utils/__init__.py`: -2, +2
Changes in `sklearn/metrics/pairwise.py`: -3, +3
Changes in `sklearn/linear_model/ridge.py`: -4, +4
Changes in `sklearn/neighbors/tests/test_dist_metrics.py`: -1, +1
Changes in `sklearn/utils/validation.py`: -2, +2
Changes in `sklearn/utils/multiclass.py`: -1, +1
Changes in `sklearn/gaussian_process/gpr.py`: -1, +1
Changes in `sklearn/metrics/cluster/supervised.py`: -1, +1
Changes in `sklearn/preprocessing/_encoders.py`: -2, +2
Changes in `sklearn/model_selection/tests/test_validation.py`: -5, +5
Changes in `sklearn/impute.py`: -1, +1
Changes in `sklearn/utils/estimator_checks.py`: -2, +1, +1
Changes in `sklearn/model_selection/tests/test_split.py`: -1, +1
Changes in `sklearn/model_selection/tests/test_search.py`: -1, +1
Changes in `sklearn/metrics/tests/test_classification.py`: -1, +1
setup.py
New file `sklearn/ensemble/_hist_gradient_boosting/predictor.py`
New file `sklearn/ensemble/_hist_gradient_boosting/loss.py`
New file `sklearn/experimental/enable_hist_gradient_boosting.py`
New file `sklearn/ensemble/_hist_gradient_boosting/grower.py`
New file `sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py`
New file `sklearn/experimental/tests/test_enable_hist_gradient_boosting.py`
New file `sklearn/ensemble/_hist_gradient_boosting/binning.py`
