diff --git a/pyproject.toml b/pyproject.toml
index 752e8f5..527c3c4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,6 +12,7 @@ dependencies = [
     "pydantic>=1.10.8",
     "rich>=13.3.5",
     "scikit-learn>=1.2.2",
+    "scipy>=1.15.3",
     "seaborn>=0.12.2",
 ]
 
diff --git a/src/random_tree_models/models/gradientboostedtrees.py b/src/random_tree_models/models/gradientboostedtrees.py
index 46d7d40..9161ddf 100644
--- a/src/random_tree_models/models/gradientboostedtrees.py
+++ b/src/random_tree_models/models/gradientboostedtrees.py
@@ -4,6 +4,7 @@
 import numpy as np
 import sklearn.base as base
 from rich.progress import track
+from scipy.optimize import minimize_scalar
 from sklearn.utils.multiclass import (
     check_classification_targets,
     type_of_target,
@@ -17,7 +18,7 @@
     DecisionTreeRegressor,
 )
 from random_tree_models.params import MetricNames, is_greater_zero
-from random_tree_models.utils import bool_to_float
+from random_tree_models.utils import vectorize_bool_to_float
 
 
 class GradientBoostedTreesTemplate(base.BaseEstimator):
@@ -53,6 +54,70 @@ def predict(self, X: np.ndarray) -> np.ndarray:
         raise NotImplementedError()
 
 
+def get_pseudo_residual_mse(y: np.ndarray, current_estimates: np.ndarray) -> np.ndarray:
+    """
+    mse loss = 1/2 * sum_i (y_i - estimate_i)^2
+    d mse loss(y, estimate) / d estimate_i = -(y_i - estimate_i)
+    since we want to apply the negative gradient for steepest descent we flip the sign
+    """
+    return y - current_estimates
+
+
+def get_pseudo_residual_log_odds(
+    y: np.ndarray, current_estimates: np.ndarray
+) -> np.ndarray:
+    """
+    -dloss/dyhat for loss = log(1 + exp(-2*y*yhat)) with y in {-1, 1}, i.e. the sign-flipped g of the xgboost paper
+    """
+    return 2 * y / (1 + np.exp(2 * y * current_estimates))
+
+
+def get_start_estimate_mse(y: np.ndarray) -> float:
+    return float(np.mean(y))
+
+
+def get_start_estimate_log_odds(y: np.ndarray) -> float:
+    """
+    1/2 * log((1 + ym) / (1 - ym)) because y is in {-1, 1}
+    with p = (1 + ym) / 2 this equals half the log odds log(p / (1 - p))
+    """
+    ym = np.mean(y)
+    if ym == 1:
+        return math.inf
+    elif ym == -1:
+        return -math.inf
+    start_estimate = 0.5 * math.log((1 + ym) / (1 - ym))
+    return start_estimate
+
+
+def find_step_size(
+    y: np.ndarray, current_estimates: np.ndarray, h: np.ndarray
+) -> float:
+    """
+    Finds the optimal step size gamma for the update rule:
+    new_estimate = current_estimates + gamma * h
+
+    This is done by minimizing the MSE loss function with respect to gamma.
+    loss(gamma) = sum((y - (current_estimates + gamma * h))^2)
+    """
+
+    def loss(gamma: float) -> float:
+        return float(np.sum((y - (current_estimates + gamma * h)) ** 2))
+
+    res = minimize_scalar(loss)
+    if res.success:
+        return float(res.x)
+    else:
+        # fall back to an unscaled update if the line search fails
+        return 1.0
+
+
+def get_probabilities_from_mapped_bools(h: np.ndarray) -> np.ndarray:
+    proba = 1 / (1 + np.exp(-2.0 * h))
+    proba = np.array([1 - proba, proba]).T
+    return proba
+
+
 class GradientBoostedTreesRegressor(
     base.RegressorMixin,
     GradientBoostedTreesTemplate,
@@ -60,25 +125,14 @@ class GradientBoostedTreesRegressor(
     """OG Gradient boosted trees regressor
 
     Friedman 2001, Greedy Function Approximation: A Gradient Boosting Machine
-    https://www.jstor.org/stable/2699986
-
-    Algorithm 2 (LS_Boost)
-
-    y = our continuous target
-    M = number of boosts
-
-    start_estimate = mean(y)
-    for m = 1 to M do:
-        dy = y - prev_estimate
-        new_rho, new_estimator = arg min(rho, estimator) mse(dy, rho*estimator(x))
-        new_estimate = prev_estimate + new_rho * new_estimator(x)
-        prev_estimate = new_estimate
+    https://www.jstor.org/stable/2699986 -> Algorithm 2 (LS_Boost)
+    https://en.wikipedia.org/wiki/Gradient_boosting#Algorithm
+    https://maelfabien.github.io/machinelearning/GradientBoost/#implement-a-high-level-gradient-boosting-in-python
     """
 
     def __init__(
         self,
         measure_name: MetricNames = MetricNames.variance,
-        factor: float = 1.0,
         n_trees: int = 3,
         max_depth: int = 2,
         min_improvement: float = 0.0,
@@ -91,47 +145,58 @@ def __init__(
             min_improvement=min_improvement,
             ensure_all_finite=ensure_all_finite,
         )
-        self.factor = factor
 
     def fit(self, X: np.ndarray, y: np.ndarray) -> "GradientBoostedTreesRegressor":
-        X, y = validate_data(self, X, y, ensure_all_finite=False)
+        X, y = validate_data(self, X, y, ensure_all_finite=self.ensure_all_finite)
 
         self.trees_: list[DecisionTreeRegressor] = []
 
-        self.start_estimate_ = np.mean(y)
-
-        # initial differences to predict
-        _y = y - self.start_estimate_
+        self.start_estimate_ = get_start_estimate_mse(y)
+        current_estimates = self.start_estimate_ * np.ones_like(y)
+        self.step_sizes_: list[float] = []
 
         for _ in track(range(self.n_trees), total=self.n_trees, description="tree"):
+            r = get_pseudo_residual_mse(y, current_estimates)
+
             # train decision tree to predict differences
             new_tree = DecisionTreeRegressor(
                 measure_name=self.measure_name,
                 max_depth=self.max_depth,
                 min_improvement=self.min_improvement,
             )
-            new_tree.fit(X, _y)
+            new_tree.fit(X, r)
             self.trees_.append(new_tree)
 
+            h = new_tree.predict(X)  # estimate of pseudo residual
+
+            # find one optimal step size to rule them all
+            gamma = find_step_size(y, current_estimates, h)
+            self.step_sizes_.append(gamma)
+
             # update differences to predict
-            dy = self.factor * new_tree.predict(X)
-            _y = _y - dy
+            current_estimates = current_estimates + gamma * h
 
         return self
 
     def predict(self, X: np.ndarray) -> np.ndarray:
-        check_is_fitted(self, ("trees_", "n_features_in_", "start_estimate_"))
+        check_is_fitted(
+            self, ("trees_", "n_features_in_", "start_estimate_", "step_sizes_")
+        )
 
-        X = validate_data(self, X, reset=False, ensure_all_finite=False)
+        X = validate_data(
+            self, X, reset=False, ensure_all_finite=self.ensure_all_finite
+        )
 
         # baseline estimate
         y = np.ones(X.shape[0]) * self.start_estimate_
 
         # improve on baseline
-        for tree in track(
-            self.trees_, description="tree", total=len(self.trees_)
+        for tree, step_size in track(
+            zip(self.trees_, self.step_sizes_),
+            description="tree",
+            total=len(self.trees_),
         ):  # loop boosts
-            dy = self.factor * tree.predict(X)
+            dy = step_size * tree.predict(X)
             y += dy
 
         return y
@@ -209,10 +274,6 @@ def __init__(
             ensure_all_finite=ensure_all_finite,
         )
 
-    def _bool_to_float(self, y: np.ndarray) -> np.ndarray:
-        f = np.vectorize(bool_to_float)
-        return f(y)
-
     def _more_tags(self) -> T.Dict[str, bool]:
         """Describes to scikit-learn parametrize_with_checks the scope of this class
 
@@ -227,7 +288,7 @@ def __sklearn_tags__(self):
         return tags
 
     def fit(self, X: np.ndarray, y: np.ndarray) -> "GradientBoostedTreesClassifier":
-        X, y = validate_data(self, X, y, ensure_all_finite=False)
+        X, y = validate_data(self, X, y, ensure_all_finite=self.ensure_all_finite)
 
         check_classification_targets(y)
 
@@ -243,52 +304,58 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> "GradientBoostedTreesClassifier":
         self.classes_, y = np.unique(y, return_inverse=True)
 
         self.trees_: list[DecisionTreeRegressor] = []
-        self.gammas_ = []
-
-        y = self._bool_to_float(y)
-        ym = np.mean(y)
-        self.start_estimate_ = 0.5 * math.log((1 + ym) / (1 - ym))
-
-        yhat = np.ones_like(y) * self.start_estimate_
+        y = vectorize_bool_to_float(y)  # True -> 1, False -> -1
+        self.start_estimate_ = get_start_estimate_log_odds(y)
+        current_estimates = self.start_estimate_ * np.ones_like(y)
+        self.step_sizes_: list[float] = []
 
         for _ in track(range(self.n_trees), description="tree", total=self.n_trees):
-            g = (
-                2 * y / (1 + np.exp(2 * y * yhat))
-            )  # dloss/dyhat, g in the xgboost paper
+            r = get_pseudo_residual_log_odds(y, current_estimates)
 
             new_tree = DecisionTreeRegressor(
                 measure_name=self.measure_name,
                 max_depth=self.max_depth,
                 min_improvement=self.min_improvement,
             )
-            new_tree.fit(X, y, g=g)
+            new_tree.fit(X, y, g=r)
             self.trees_.append(new_tree)
 
-            # update _y
-            g_pred = new_tree.predict(X)
-            yhat += g_pred
+            h = new_tree.predict(X)  # estimate of pseudo residual
+
+            # find one optimal step size to rule them all
+            gamma = find_step_size(y, current_estimates, h)
+            self.step_sizes_.append(gamma)
+
+            # update differences to predict
+            current_estimates = current_estimates + gamma * h
 
         return self
 
     def predict_proba(self, X: np.ndarray) -> np.ndarray:
-        check_is_fitted(self, ("trees_", "classes_", "gammas_", "n_features_in_"))
-        X = validate_data(self, X, reset=False, ensure_all_finite=False)
+        check_is_fitted(
+            self,
+            ("trees_", "classes_", "n_features_in_", "start_estimate_", "step_sizes_"),
+        )
+        X = validate_data(
+            self, X, reset=False, ensure_all_finite=self.ensure_all_finite
+        )
 
-        g = np.ones(X.shape[0]) * self.start_estimate_
+        h = np.ones(X.shape[0]) * self.start_estimate_
 
-        for _, tree in track(
-            enumerate(self.trees_), description="tree", total=len(self.trees_)
+        for tree, step_size in track(
+            zip(self.trees_, self.step_sizes_),
+            description="tree",
+            total=len(self.trees_),
         ):  # loop boosts
-            g += tree.predict(X)
+            h += step_size * tree.predict(X)
 
-        proba = 1 / (1 + np.exp(-2.0 * g))
-        proba = np.array([1 - proba, proba]).T
-        return proba
+        p = get_probabilities_from_mapped_bools(h)
+        return p
 
     def predict(self, X: np.ndarray) -> np.ndarray:
-        proba = self.predict_proba(X)
+        p = self.predict_proba(X)
 
-        ix = np.argmax(proba, axis=1)
+        ix = np.argmax(p, axis=1)
         y = self.classes_[ix]
 
         return y
diff --git a/tests/models/test_gradientboostedtrees.py b/tests/models/test_gradientboostedtrees.py
index 3c3f761..f9d74dd 100644
--- a/tests/models/test_gradientboostedtrees.py
+++ b/tests/models/test_gradientboostedtrees.py
@@ -1,3 +1,5 @@
+import math
+
 import numpy as np
 import pytest
 from sklearn.utils.estimator_checks import parametrize_with_checks
@@ -86,3 +88,85 @@ def test_gbt_estimators_with_sklearn_checks(estimator, check):
     Reference: https://scikit-learn.org/stable/modules/generated/sklearn.utils.estimator_checks.parametrize_with_checks.html#sklearn.utils.estimator_checks.parametrize_with_checks
     """
     check(estimator)
+
+
+def test_get_pseudo_residual_mse():
+    y = np.array([1.0, 2.0, 3.0])
+    current_estimates = np.array([0.5, 1.0, 2.0])
+    expected_residuals = np.array([0.5, 1.0, 1.0])
+    actual_residuals = gbt.get_pseudo_residual_mse(y, current_estimates)
+    assert np.allclose(actual_residuals, expected_residuals)
+
+    # Test with negative values
+    y = np.array([-1.0, -2.0, -3.0])
+    current_estimates = np.array([-0.5, -1.0, -2.0])
+    expected_residuals = np.array([-0.5, -1.0, -1.0])
+    actual_residuals = gbt.get_pseudo_residual_mse(y, current_estimates)
+    assert np.allclose(actual_residuals, expected_residuals)
+
+    # Test with zero values
+    y = np.array([0.0, 0.0, 0.0])
+    current_estimates = np.array([0.0, 0.0, 0.0])
+    expected_residuals = np.array([0.0, 0.0, 0.0])
+    actual_residuals = gbt.get_pseudo_residual_mse(y, current_estimates)
+    assert np.allclose(actual_residuals, expected_residuals)
+
+
+def test_get_pseudo_residual_log_odds():
+    # Test case 1: Basic test with positive and negative values
+    y = np.array([1, -1, 1, -1])
+    current_estimates = np.array([0.1, 0.2, -0.1, -0.2])
+    expected_residuals = 2 * y / (1 + np.exp(2 * y * current_estimates))
+    actual_residuals = gbt.get_pseudo_residual_log_odds(y, current_estimates)
+    assert np.allclose(actual_residuals, expected_residuals)
+
+    # Test case 2: y close to zero
+    y = np.array([0.001, -0.001])
+    current_estimates = np.array([0.5, 0.5])
+    expected_residuals = 2 * y / (1 + np.exp(2 * y * current_estimates))
+    actual_residuals = gbt.get_pseudo_residual_log_odds(y, current_estimates)
+    assert np.allclose(actual_residuals, expected_residuals)
+
+    # Test case 3: current_estimates close to zero
+    y = np.array([1, -1])
+    current_estimates = np.array([0.001, -0.001])
+    expected_residuals = 2 * y / (1 + np.exp(2 * y * current_estimates))
+    actual_residuals = gbt.get_pseudo_residual_log_odds(y, current_estimates)
+    assert np.allclose(actual_residuals, expected_residuals)
+
+    # Test case 4: Larger current_estimates
+    y = np.array([1, -1])
+    current_estimates = np.array([2, -2])
+    expected_residuals = 2 * y / (1 + np.exp(2 * y * current_estimates))
+    actual_residuals = gbt.get_pseudo_residual_log_odds(y, current_estimates)
+    assert np.allclose(actual_residuals, expected_residuals)
+
+
+def test_get_start_estimate_log_odds():
+    # Test case 1: Balanced classes (mean close to 0)
+    y = np.array([1, -1, 1, -1])
+    actual_start_estimate = gbt.get_start_estimate_log_odds(y)
+    assert np.isclose(actual_start_estimate, 0.0)
+
+    # Test case 2: All positive class
+    y = np.array([1, 1, 1, 1])
+    actual_start_estimate = gbt.get_start_estimate_log_odds(y)
+    assert math.isinf(actual_start_estimate)
+
+    # Test case 3: All negative class
+    y = np.array([-1, -1, -1, -1])
+    actual_start_estimate = gbt.get_start_estimate_log_odds(y)
+    assert math.isinf(actual_start_estimate)
+
+    # Test case 4: Unbalanced classes
+    y = np.array([1, 1, 1, -1])
+    ym = np.mean(y)
+    actual_start_estimate = gbt.get_start_estimate_log_odds(y)
+    v = 0.5493061443340549
+    assert np.isclose(actual_start_estimate, v)
+
+    # Test case 5: Another set of unbalanced classes
+    y = np.array([1, -1, -1, -1])
+    ym = np.mean(y)
+    actual_start_estimate = gbt.get_start_estimate_log_odds(y)
+    assert np.isclose(actual_start_estimate, -v)
diff --git a/uv.lock b/uv.lock
index 0dd0f49..884fb13 100644
--- a/uv.lock
+++ b/uv.lock
@@ -2737,6 +2737,8 @@ dependencies = [
     { name = "pydantic" },
     { name = "rich" },
     { name = "scikit-learn" },
+    { name = "scipy", version = "1.15.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
+    { name = "scipy", version = "1.16.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
     { name = "seaborn" },
 ]
 
@@ -2770,6 +2772,7 @@ requires-dist = [
     { name = "pydantic", specifier = ">=1.10.8" },
     { name = "rich", specifier = ">=13.3.5" },
     { name = "scikit-learn", specifier = ">=1.2.2" },
+    { name = "scipy", specifier = ">=1.15.3" },
     { name = "seaborn", specifier = ">=0.12.2" },
 ]
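
Editorial note (illustration only, not part of the patch): find_step_size minimizes loss(gamma) = sum((y - (current_estimates + gamma * h))^2) numerically with scipy.optimize.minimize_scalar, but this squared-error line search is a parabola in gamma, so with r = y - current_estimates the minimizer also has the closed form gamma* = sum(r * h) / sum(h * h). The sketch below assumes only numpy and scipy (the dependency added above) and uses made-up data; it is a quick sanity check that the numeric search recovers the closed-form value, not part of the repository's test suite.

import numpy as np
from scipy.optimize import minimize_scalar

# synthetic stand-ins for the quantities used by find_step_size
rng = np.random.default_rng(0)
y = rng.normal(size=200)              # regression targets
current_estimates = np.zeros_like(y)  # e.g. the constant start estimate
h = rng.normal(size=200)              # a tree's estimate of the pseudo residual


def loss(gamma: float) -> float:
    # same quadratic line-search objective as in find_step_size
    return float(np.sum((y - (current_estimates + gamma * h)) ** 2))


r = y - current_estimates
gamma_closed_form = float(np.sum(r * h) / np.sum(h * h))  # argmin of the parabola
gamma_numeric = float(minimize_scalar(loss).x)

assert np.isclose(gamma_numeric, gamma_closed_form)
print(gamma_numeric, gamma_closed_form)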