From afad757fad4fbb3a06c51e21503ae31e40714919 Mon Sep 17 00:00:00 2001
From: eschmidt42 <11818904+eschmidt42@users.noreply.github.com>
Date: Mon, 18 Aug 2025 15:32:11 +0200
Subject: [PATCH] feat: polished xgboost.py a bit to use similar naming as gradientboostedtrees.py, also got rid of some redundant code that both modules should share, re-located re-used functions to gradient.py and transform.py, added tests

---
 src/random_tree_models/gradient.py        |  72 ++++++++
 .../models/gradientboostedtrees.py         |  60 ++-----
 src/random_tree_models/models/xgboost.py   | 104 +++++-------
 src/random_tree_models/transform.py        |  21 +++
 src/random_tree_models/utils.py            |  15 --
 tests/models/test_gradientboostedtrees.py  |  84 ----------
 tests/models/test_xgboost.py               | 156 ++++++++----------
 tests/test_gradient.py                     | 147 +++++++++++++++++
 tests/test_transform.py                    |  63 +++++++
 tests/test_utils.py                        |  43 -----
 10 files changed, 426 insertions(+), 339 deletions(-)
 create mode 100644 src/random_tree_models/gradient.py
 create mode 100644 src/random_tree_models/transform.py
 create mode 100644 tests/test_gradient.py
 create mode 100644 tests/test_transform.py

diff --git a/src/random_tree_models/gradient.py b/src/random_tree_models/gradient.py
new file mode 100644
index 0000000..e7846c7
--- /dev/null
+++ b/src/random_tree_models/gradient.py
@@ -0,0 +1,72 @@
+import math
+
+import numpy as np
+
+
+def check_y_float(y_float: np.ndarray):
+    # expects y_float to consist only of the values -1 and 1
+    unexpected_values = np.abs(y_float) != 1
+    if np.sum(unexpected_values) > 0:
+        raise ValueError(
+            f"expected y_float to contain only -1 and 1, got {y_float[unexpected_values]}"
+        )
+
+
+def get_pseudo_residual_mse(
+    y: np.ndarray, current_estimates: np.ndarray, second_order: bool
+) -> tuple[np.ndarray, np.ndarray | None]:
+    """
+    mse loss = 0.5 * sum_i (y_i - estimate_i)^2
+    d mse loss(y,estimate) / d estimate_i = -(y_i - estimate_i)
+    since we want the negative gradient for steepest descent, we flip the sign: y_i - estimate_i
+    """
+    first_derivative = y - current_estimates
+
+    second_derivative = None
+    if second_order:
+        second_derivative = -1 * np.ones_like(first_derivative)
+
+    return first_derivative, second_derivative
+
+
+def get_pseudo_residual_log_odds(
+    y: np.ndarray, current_estimates: np.ndarray, second_order: bool
+) -> tuple[np.ndarray, np.ndarray | None]:
+    """
+    first derivative: -d loss / d current_estimates, the negative of g in the xgboost paper
+    second derivative: -d^2 loss / d current_estimates^2, the negative of h in the xgboost paper
+    (the sign flip gives the pseudo residual, consistent with get_pseudo_residual_mse)
+    """
+    check_y_float(y)
+
+    a = np.exp(2 * y * current_estimates)
+    first_derivative = 2 * y / (1 + a)
+
+    second_derivative = None
+    if second_order:
+        second_derivative = -(4 * y**2 * a / (1 + a) ** 2)
+
+    return first_derivative, second_derivative
+
+
+def get_start_estimate_mse(y: np.ndarray) -> float:
+    return float(np.mean(y))
+
+
+def get_start_estimate_log_odds(y: np.ndarray) -> float:
+    """
+    0.5 * log((1+ym)/(1-ym)) because ym is in [-1, 1]
+    equivalent to log(ym/(1-ym)) if ym were in [0, 1]
+    """
+    check_y_float(y)
+
+    ym = np.mean(y)
+
+    if ym == 1:
+        return math.inf
+    elif ym == -1:
+        return -math.inf
+
+    start_estimate = 0.5 * math.log((1 + ym) / (1 - ym))
+
+    return start_estimate
diff --git a/src/random_tree_models/models/gradientboostedtrees.py b/src/random_tree_models/models/gradientboostedtrees.py
index 9161ddf..0e60725 100644
--- a/src/random_tree_models/models/gradientboostedtrees.py
+++ b/src/random_tree_models/models/gradientboostedtrees.py
@@ -1,4 +1,3 @@
-import math
 import typing as T
 
 import numpy as np
@@ -14,11 +13,20 @@
     validate_data,  # type: ignore
 )
 
+from random_tree_models.gradient import (
+    get_pseudo_residual_log_odds,
+    get_pseudo_residual_mse,
+    get_start_estimate_log_odds,
+    get_start_estimate_mse,
+)
 from random_tree_models.models.decisiontree import (
     DecisionTreeRegressor,
 )
 from random_tree_models.params import MetricNames, is_greater_zero
-from random_tree_models.utils import vectorize_bool_to_float
+from random_tree_models.transform import (
+    get_probabilities_from_mapped_bools,
+    vectorize_bool_to_float,
+)
 
 
 class GradientBoostedTreesTemplate(base.BaseEstimator):
@@ -54,42 +62,6 @@ def predict(self, X: np.ndarray) -> np.ndarray:
         raise NotImplementedError()
 
 
-def get_pseudo_residual_mse(y: np.ndarray, current_estimates: np.ndarray) -> np.ndarray:
-    """
-    mse loss = sum_i (y_i - estimate_i)^2
-    pseudo residual_i = d mse loss(y,estimate) / d estimate_i = - (y_i - estimate_i)
-    since we want to apply it as the negative gradient for steepest descent we flip the sign
-    """
-    return y - current_estimates
-
-
-def get_pseudo_residual_log_odds(
-    y: np.ndarray, current_estimates: np.ndarray
-) -> np.ndarray:
-    """
-    # dloss/dyhat, g in the xgboost paper
-    """
-    return 2 * y / (1 + np.exp(2 * y * current_estimates))
-
-
-def get_start_estimate_mse(y: np.ndarray) -> float:
-    return float(np.mean(y))
-
-
-def get_start_estimate_log_odds(y: np.ndarray) -> float:
-    """
-    1/2 log(1+ym)/(1-ym) because ym is in [-1, 1]
-    equivalent to log(ym)/(1-ym) if ym were in [0, 1]
-    """
-    ym = np.mean(y)
-    if ym == 1:
-        return math.inf
-    elif ym == -1:
-        return -math.inf
-    start_estimate = 0.5 * math.log((1 + ym) / (1 - ym))
-    return start_estimate
-
-
 def find_step_size(
     y: np.ndarray, current_estimates: np.ndarray, h: np.ndarray
 ) -> float:
@@ -112,12 +84,6 @@ def loss(gamma: float) -> float:
     return 1.0
 
 
-def get_probabilities_from_mapped_bools(h: np.ndarray) -> np.ndarray:
-    proba = 1 / (1 + np.exp(-2.0 * h))
-    proba = np.array([1 - proba, proba]).T
-    return proba
-
-
 class GradientBoostedTreesRegressor(
     base.RegressorMixin,
     GradientBoostedTreesTemplate,
@@ -156,7 +122,7 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> "GradientBoostedTreesRegressor":
         self.step_sizes_: list[float] = []
 
         for _ in track(range(self.n_trees), total=self.n_trees, description="tree"):
-            r = get_pseudo_residual_mse(y, current_estimates)
+            r, _ = get_pseudo_residual_mse(y, current_estimates, second_order=False)
 
             # train decision tree to predict differences
             new_tree = DecisionTreeRegressor(
@@ -311,7 +277,9 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> "GradientBoostedTreesClassifier":
         self.step_sizes_: list[float] = []
 
         for _ in track(range(self.n_trees), description="tree", total=self.n_trees):
-            r = get_pseudo_residual_log_odds(y, current_estimates)
+            r, _ = get_pseudo_residual_log_odds(
+                y, current_estimates, second_order=False
+            )
 
             new_tree = DecisionTreeRegressor(
                 measure_name=self.measure_name,
diff --git a/src/random_tree_models/models/xgboost.py b/src/random_tree_models/models/xgboost.py
index 2d42d32..5e55fef 100644
--- a/src/random_tree_models/models/xgboost.py
+++ b/src/random_tree_models/models/xgboost.py
@@ -10,7 +10,6 @@
 * sparsity-aware split finding / "default direction" for missing values
 """
 
-import math
 import typing as T
 
 import numpy as np
@@ -26,9 +25,18 @@
     validate_data,  # type: ignore
 )
 
+from random_tree_models.gradient import (
+    get_pseudo_residual_log_odds,
+    get_pseudo_residual_mse,
+    get_start_estimate_log_odds,
+    get_start_estimate_mse,
+)
 from random_tree_models.models.decisiontree import DecisionTreeRegressor
 from random_tree_models.params import MetricNames, is_greater_zero
-from random_tree_models.utils import vectorize_bool_to_float
+from random_tree_models.transform import (
+    get_probabilities_from_mapped_bools,
+    vectorize_bool_to_float,
+)
 
 
 class XGBoostTemplate(base.BaseEstimator):
@@ -73,15 +81,6 @@ def predict(self, X: np.ndarray) -> np.ndarray:
         raise NotImplementedError()
 
 
-def compute_derivatives_negative_least_squares(
-    y: np.ndarray, start_estimate: float
-) -> T.Tuple[np.ndarray, np.ndarray]:
-    "loss = - mean |y-yhat|^2"
-    g = y - start_estimate  # 1st order derivative
-    h = -1 * np.ones_like(g)  # 2nd order derivative
-    return g, h
-
-
 # TODO: add tests:
 # * X_hist is integer based
 # * X_hist has the same shape as X
@@ -144,14 +143,19 @@ class XGBoostRegressor(base.RegressorMixin, XGBoostTemplate):
     """
 
     def fit(self, X: np.ndarray, y: np.ndarray) -> "XGBoostRegressor":
-        X, y = validate_data(self, X, y, ensure_all_finite=False)
+        X, y = validate_data(self, X, y, ensure_all_finite=self.ensure_all_finite)
 
         self.trees_: list[DecisionTreeRegressor] = []
 
-        self.start_estimate_: float = float(np.mean(y))
+        self.start_estimate_ = get_start_estimate_mse(y)
 
         # initial differences to predict using negative squared error loss
-        g, h = compute_derivatives_negative_least_squares(y, self.start_estimate_)
+        current_estimates = self.start_estimate_ * np.ones_like(y)
+        g, h = get_pseudo_residual_mse(y, current_estimates, second_order=True)
+
+        if h is None:
+            raise ValueError("h cannot be None beyond this stage.")
+
         if self.use_hist:
             X_hist, all_x_bin_edges = xgboost_histogrammify_with_h(
                 X, h, n_bins=self.n_bins
@@ -179,7 +183,9 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> "XGBoostRegressor":
     def predict(self, X: np.ndarray) -> np.ndarray:
         check_is_fitted(self, ("trees_", "n_features_in_", "start_estimate_"))
 
-        X = validate_data(self, X, reset=False, ensure_all_finite=False)
+        X = validate_data(
+            self, X, reset=False, ensure_all_finite=self.ensure_all_finite
+        )
 
         # baseline estimate
         y = np.ones(X.shape[0]) * self.start_estimate_
@@ -197,41 +203,6 @@ def predict(self, X: np.ndarray) -> np.ndarray:
         return y
 
 
-def check_y_float(y_float: np.ndarray):
-    # expects y_float to consist only of the values -1 and 1
-    unexpected_values = np.abs(y_float) != 1
-    if np.sum(unexpected_values) > 0:
-        raise ValueError(
-            f"expected y_float to contain only -1 and 1, got {y_float[unexpected_values]}"
-        )
-
-
-def compute_start_estimate_binomial_loglikelihood(y_float: np.ndarray) -> float:
-    check_y_float(y_float)
-
-    ym = np.mean(y_float)
-    start_estimate = 0.5 * math.log((1 + ym) / (1 - ym))
-
-    return start_estimate
-
-
-def compute_derivatives_binomial_loglikelihood(
-    y_float: np.ndarray, yhat: np.ndarray
-) -> T.Tuple[np.ndarray, np.ndarray]:
-    "loss = - sum log(1+exp(2*y*yhat))"
-
-    check_y_float(y_float)
-
-    # differences to predict using binomial log-likelihood (yes, the negative of the negative :P)
-    exp_y_yhat = np.exp(2 * y_float * yhat)
-    g = 2 * y_float / (1 + exp_y_yhat)  # dloss/dyhat, g in the xgboost paper
-
-    # d^2loss/dyhat^2, h in the xgboost paper
-    h = -(4 * y_float**2 * exp_y_yhat / (1 + exp_y_yhat) ** 2)
-
-    return g, h
-
-
 class XGBoostClassifier(base.ClassifierMixin, XGBoostTemplate):
     """XGBoost classifier
 
@@ -246,7 +217,7 @@ def __sklearn_tags__(self):
         return tags
 
     def fit(self, X: np.ndarray, y: np.ndarray) -> "XGBoostClassifier":
-        X, y = validate_data(self, X, y, ensure_all_finite=False)
+        X, y = validate_data(self, X, y, ensure_all_finite=self.ensure_all_finite)
 
         check_classification_targets(y)
 
@@ -269,11 +240,16 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> "XGBoostClassifier":
         y = vectorize_bool_to_float(y)
 
         # initial estimate
-        self.start_estimate_ = compute_start_estimate_binomial_loglikelihood(y)
-        yhat = np.ones_like(y) * self.start_estimate_
+        self.start_estimate_ = get_start_estimate_log_odds(y)
+        current_estimates = np.ones_like(y) * self.start_estimate_
 
         for _ in track(range(self.n_trees), description="tree", total=self.n_trees):
-            g, h = compute_derivatives_binomial_loglikelihood(y, yhat)
+            g, h = get_pseudo_residual_log_odds(
+                y, current_estimates, second_order=True
+            )
+
+            if h is None:
+                raise ValueError("h cannot be None beyond this stage.")
 
             if self.use_hist:
                 _X, all_x_bin_edges = xgboost_histogrammify_with_h(
@@ -293,16 +269,17 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> "XGBoostClassifier":
             new_tree.fit(_X, y, g=g, h=h)
             self.trees_.append(new_tree)
 
-            # update _y
-            yhat += new_tree.predict(X)
+            current_estimates += new_tree.predict(X)
 
         return self
 
     def predict_proba(self, X: np.ndarray) -> np.ndarray:
         check_is_fitted(self, ("trees_", "classes_", "gammas_", "n_features_in_"))
-        X = validate_data(self, X, reset=False, ensure_all_finite=False)
+        X = validate_data(
+            self, X, reset=False, ensure_all_finite=self.ensure_all_finite
+        )
 
-        g = np.ones(X.shape[0]) * self.start_estimate_
+        h = np.ones(X.shape[0]) * self.start_estimate_
 
         for boost, tree in track(
             enumerate(self.trees_), description="tree", total=len(self.trees_)
@@ -314,16 +291,15 @@ def predict_proba(self, X: np.ndarray) -> np.ndarray:
             else:
                 _X = X
 
-            g += tree.predict(_X)
+            h += tree.predict(_X)
 
-        proba = 1 / (1 + np.exp(-2.0 * g))
-        proba = np.array([1 - proba, proba]).T
-        return proba
+        p = get_probabilities_from_mapped_bools(h)
+        return p
 
     def predict(self, X: np.ndarray) -> np.ndarray:
-        proba = self.predict_proba(X)
+        p = self.predict_proba(X)
 
-        ix = np.argmax(proba, axis=1)
+        ix = np.argmax(p, axis=1)
         y = self.classes_[ix]
 
         return y
diff --git a/src/random_tree_models/transform.py b/src/random_tree_models/transform.py
new file mode 100644
index 0000000..f3681a1
--- /dev/null
+++ b/src/random_tree_models/transform.py
@@ -0,0 +1,21 @@
+import numpy as np
+
+
+def bool_to_float(x: bool) -> float:
+    if x == True:  # equality on purpose (not `is`) so the ints 0 and 1 also map
+        return 1.0
+    elif x == False:
+        return -1.0
+    else:
+        raise ValueError(f"{x=}, expected bool")
+
+
+def vectorize_bool_to_float(y: np.ndarray) -> np.ndarray:
+    f = np.vectorize(bool_to_float)
+    return f(y)
+
+
+def get_probabilities_from_mapped_bools(h: np.ndarray) -> np.ndarray:
+    p = 1 / (1 + np.exp(-2.0 * h))
+    p = np.array([1 - p, p]).T
+    return p
diff --git a/src/random_tree_models/utils.py b/src/random_tree_models/utils.py
index 390eec5..313a9a3 100644
--- a/src/random_tree_models/utils.py
+++ b/src/random_tree_models/utils.py
@@ -1,6 +1,5 @@
 import logging
 
-import numpy as np
 from rich.logging import RichHandler
 
 
@@ -20,17 +19,3 @@ def _get_logger(level=logging.INFO):
 
 
 logger = _get_logger()
-
-
-def bool_to_float(x: bool) -> float:
-    if x == True:
-        return 1.0
-    elif x == False:
-        return -1.0
-    else:
-        raise ValueError(f"{x=}, expected bool")
-
-
-def vectorize_bool_to_float(y: np.ndarray) -> np.ndarray:
-    f = np.vectorize(bool_to_float)
-    return f(y)
diff --git a/tests/models/test_gradientboostedtrees.py b/tests/models/test_gradientboostedtrees.py
index f9d74dd..3c3f761 100644
--- a/tests/models/test_gradientboostedtrees.py
+++ b/tests/models/test_gradientboostedtrees.py
@@ -1,5 +1,3 @@
-import math
-
 import numpy as np
 import pytest
 from sklearn.utils.estimator_checks import parametrize_with_checks
@@ -88,85 +86,3 @@ def test_gbt_estimators_with_sklearn_checks(estimator, check):
     Reference: https://scikit-learn.org/stable/modules/generated/sklearn.utils.estimator_checks.parametrize_with_checks.html#sklearn.utils.estimator_checks.parametrize_with_checks
     """
     check(estimator)
-
-
-def test_get_pseudo_residual_mse():
-    y = np.array([1.0, 2.0, 3.0])
-    current_estimates = np.array([0.5, 1.0, 2.0])
-    expected_residuals = np.array([0.5, 1.0, 1.0])
-    actual_residuals = gbt.get_pseudo_residual_mse(y, current_estimates)
-    assert np.allclose(actual_residuals, expected_residuals)
-
-    # Test with negative values
-    y = np.array([-1.0, -2.0, -3.0])
-    current_estimates = np.array([-0.5, -1.0, -2.0])
-    expected_residuals = np.array([-0.5, -1.0, -1.0])
-    actual_residuals = gbt.get_pseudo_residual_mse(y, current_estimates)
-    assert np.allclose(actual_residuals, expected_residuals)
-
-    # Test with zero values
-    y = np.array([0.0, 0.0, 0.0])
-    current_estimates = np.array([0.0, 0.0, 0.0])
-    expected_residuals = np.array([0.0, 0.0, 0.0])
-    actual_residuals = gbt.get_pseudo_residual_mse(y, current_estimates)
-    assert np.allclose(actual_residuals, expected_residuals)
-
-
-def test_get_pseudo_residual_log_odds():
-    # Test case 1: Basic test with positive and negative values
-    y = np.array([1, -1, 1, -1])
-    current_estimates = np.array([0.1, 0.2, -0.1, -0.2])
-    expected_residuals = 2 * y / (1 + np.exp(2 * y * current_estimates))
-    actual_residuals = gbt.get_pseudo_residual_log_odds(y, current_estimates)
-    assert np.allclose(actual_residuals, expected_residuals)
-
-    # Test case 2: y close to zero
-    y = np.array([0.001, -0.001])
-    current_estimates = np.array([0.5, 0.5])
-    expected_residuals = 2 * y / (1 + np.exp(2 * y * current_estimates))
-    actual_residuals = gbt.get_pseudo_residual_log_odds(y, current_estimates)
-    assert np.allclose(actual_residuals, expected_residuals)
-
-    # Test case 3: current_estimates close to zero
-    y = np.array([1, -1])
-    current_estimates = np.array([0.001, -0.001])
-    expected_residuals = 2 * y / (1 + np.exp(2 * y * current_estimates))
-    actual_residuals = gbt.get_pseudo_residual_log_odds(y, current_estimates)
-    assert np.allclose(actual_residuals, expected_residuals)
-
-    # Test case 4: Larger current_estimates
-    y = np.array([1, -1])
-    current_estimates = np.array([2, -2])
-    expected_residuals = 2 * y / (1 + np.exp(2 * y * current_estimates))
-    actual_residuals = gbt.get_pseudo_residual_log_odds(y, current_estimates)
-    assert np.allclose(actual_residuals, expected_residuals)
-
-
-def test_get_start_estimate_log_odds():
-    # Test case 1: Balanced classes (mean close to 0)
-    y = np.array([1, -1, 1, -1])
-    actual_start_estimate = gbt.get_start_estimate_log_odds(y)
-    assert np.isclose(actual_start_estimate, 0.0)
-
-    # Test case 2: All positive class
-    y = np.array([1, 1, 1, 1])
-    actual_start_estimate = gbt.get_start_estimate_log_odds(y)
-    assert math.isinf(actual_start_estimate)
-
-    # Test case 3: All negative class
-    y = np.array([-1, -1, -1, -1])
-    actual_start_estimate = gbt.get_start_estimate_log_odds(y)
-    assert math.isinf(actual_start_estimate)
-
-    # Test case 4: Unbalanced classes
-    y = np.array([1, 1, 1, -1])
-    ym = np.mean(y)
-    actual_start_estimate = gbt.get_start_estimate_log_odds(y)
-    v = 0.5493061443340549
-    assert np.isclose(actual_start_estimate, v)
-
-    # Test case 5: Another set of unbalanced classes
-    y = np.array([1, -1, -1, -1])
-    ym = np.mean(y)
-    actual_start_estimate = gbt.get_start_estimate_log_odds(y)
-    assert np.isclose(actual_start_estimate, -v)
diff --git a/tests/models/test_xgboost.py b/tests/models/test_xgboost.py
index 83a84c1..a8aed26 100644
--- a/tests/models/test_xgboost.py
+++ b/tests/models/test_xgboost.py
@@ -86,90 +86,72 @@ def test_xgboost_estimators_with_sklearn_checks(estimator, check):
     check(estimator)
-
-
-@pytest.mark.parametrize(
-    "y_float, start_estimate_exp",
-    [
-        (np.array([-1.0, 1.0]), 0),
-        (np.array([-1.0, 1.0, 1.0, 1.0]), 0.5493061443340549),
-        (np.array([-1.0, -1.0, -1.0, 1.0]), -0.5493061443340549),
-        (np.array([True, True, False, False]), None),
-        (np.array([-2.0, -2.0, 2.0, 2.0]), None),
-    ],
-)
-def test_compute_start_estimate_binomial_loglikelihood(
-    y_float: np.ndarray, start_estimate_exp: float
-):
-    try:
-        # line to test
-        start_estimate = xgboost.compute_start_estimate_binomial_loglikelihood(y_float)
-    except ValueError as ex:
-        if start_estimate_exp is None:
-            pass  # expectedly failed for non -1 and 1 values
-        else:
-            raise ex
-    else:
-        if start_estimate_exp is None:
-            pytest.fail(f"unexpectedly passed for non -1 and 1 values")
-        assert np.isclose(start_estimate, start_estimate_exp)
-
-
-@pytest.mark.parametrize(
-    "y,start_estimate,g_exp",
-    [
-        (np.array([1.0]), 0.5, np.array([0.5])),
-        (np.array([1.0, 1.0]), 0.5, np.array([0.5, 0.5])),
-    ],
-)
-def test_compute_derivatives_negative_least_squares(
-    y: np.ndarray, start_estimate: float, g_exp: np.ndarray
-):
-    # line to test
-    g, h = xgboost.compute_derivatives_negative_least_squares(y, start_estimate)
-
-    assert g.shape == h.shape
-    assert np.allclose(g, g_exp)
-    assert np.allclose(h, -1)
-
-
-@pytest.mark.parametrize(
-    "y_float,start_estimate,g_exp,h_exp",
-    [
-        (
-            np.array([-1.0, 1.0]),
-            0.0,
-            np.array([-1.0, 1.0]),
-            np.array([-1.0, -1.0]),
-        ),
-        (
-            np.array([-1.0, -1.0, 1.0, 1.0]),
-            0.0,
-            np.array([-1.0, -1.0, 1.0, 1.0]),
-            np.array([-1.0, -1.0, -1.0, -1.0]),
-        ),
-        # failure cases
-        (np.array([False, True]), 0.0, None, None),
-        (np.array([-2.0, 2.0]), 0.0, None, None),
-    ],
-)
-def test_compute_derivatives_binomial_loglikelihood(
-    y_float: np.ndarray,
-    start_estimate: float,
-    g_exp: np.ndarray,
-    h_exp: np.ndarray,
-):
-    yhat = np.ones_like(y_float) * start_estimate
-    is_bad = g_exp is None and h_exp is None
-    try:
-        # line to test
-        g, h = xgboost.compute_derivatives_binomial_loglikelihood(y_float, yhat)
-    except ValueError as ex:
-        if is_bad:
-            pass  # Expectedly failed for incorrect y_float values"
-        else:
-            raise ex
-    else:
-        if is_bad:
-            pytest.fail("Unexpectedly passed for incorrect y_float values")
-        assert g.shape == h.shape
-        assert np.allclose(g, g_exp)
-        assert np.allclose(h, h_exp)
diff --git a/tests/test_gradient.py b/tests/test_gradient.py
new file mode 100644
index 0000000..c45c359
--- /dev/null
+++ b/tests/test_gradient.py
@@ -0,0 +1,147 @@
+import math
+
+import numpy as np
+import pytest
+
+from random_tree_models.gradient import (
+    check_y_float,
+    get_pseudo_residual_log_odds,
+    get_pseudo_residual_mse,
+    get_start_estimate_log_odds,
+)
+
+
+def test_check_y_float():
+    # Test case 1: Valid input with only -1 and 1
+    y_float = np.array([-1, 1, -1, 1])
+    check_y_float(y_float)  # Should not raise an error
+
+    # Test case 2: Valid input with only 1
+    y_float = np.array([1, 1, 1, 1])
+    check_y_float(y_float)  # Should not raise an error
+
+    # Test case 3: Valid input with only -1
+    y_float = np.array([-1, -1, -1, -1])
+    check_y_float(y_float)  # Should not raise an error
+
+    # Test case 4: Invalid input with 0
+    y_float = np.array([-1, 1, 0, 1])
+    with pytest.raises(ValueError):
+        check_y_float(y_float)
+
+    # Test case 5: Invalid input with values other than -1 and 1
+    y_float = np.array([-1, 1, -2, 2])
+    with pytest.raises(ValueError):
+        check_y_float(y_float)
+
+    # Test case 6: Invalid input with mixed values
+    y_float = np.array([-1, 1, 0.5, -0.5])
+    with pytest.raises(ValueError):
+        check_y_float(y_float)
+
+
+def test_get_start_estimate_log_odds():
+    # Test case 1: Balanced classes (mean close to 0)
+    y = np.array([1, -1, 1, -1])
+    actual_start_estimate = get_start_estimate_log_odds(y)
+    assert np.isclose(actual_start_estimate, 0.0)
+
+    # Test case 2: All positive class
+    y = np.array([1, 1, 1, 1])
+    actual_start_estimate = get_start_estimate_log_odds(y)
+    assert math.isinf(actual_start_estimate)
+
+    # Test case 3: All negative class
+    y = np.array([-1, -1, -1, -1])
+    actual_start_estimate = get_start_estimate_log_odds(y)
+    assert math.isinf(actual_start_estimate)
+
+    # Test case 4: Unbalanced classes
+    y = np.array([1, 1, 1, -1])
+    actual_start_estimate = get_start_estimate_log_odds(y)
+    v = 0.5493061443340549
+    assert np.isclose(actual_start_estimate, v)
+
+    # Test case 5: Another set of unbalanced classes
+    y = np.array([1, -1, -1, -1])
+    actual_start_estimate = get_start_estimate_log_odds(y)
+    assert np.isclose(actual_start_estimate, -v)
+
+
+def test_get_pseudo_residual_log_odds():
+    # Test case 1: Basic test with positive and negative values
+    y = np.array([1, -1, 1, -1])
+    current_estimates = np.array([0.1, 0.2, -0.1, -0.2])
+    expected_residuals_1st = np.array(
+        [0.90033201, -1.19737532, 1.09966799, -0.80262468]
+    )
+    expected_residuals_2nd = np.array(
+        [-0.99006629, -0.96104298, -0.99006629, -0.96104298]
+    )
+    actual_residuals_1st, actual_residuals_2nd = get_pseudo_residual_log_odds(
+        y, current_estimates, True
+    )
+    assert actual_residuals_2nd is not None
+    assert np.allclose(actual_residuals_1st, expected_residuals_1st)
+    assert np.allclose(actual_residuals_2nd, expected_residuals_2nd)
+
+    # Test case 2: current_estimates close to zero
+    y = np.array([1, -1])
+    current_estimates = np.array([0.001, -0.001])
+    expected_residuals_1st = np.array([0.999, -0.999])
+    expected_residuals_2nd = np.array([-0.999999, -0.999999])
+    actual_residuals_1st, actual_residuals_2nd = get_pseudo_residual_log_odds(
+        y, current_estimates, True
+    )
+    assert actual_residuals_2nd is not None
+    assert np.allclose(actual_residuals_1st, expected_residuals_1st)
+    assert np.allclose(actual_residuals_2nd, expected_residuals_2nd)
+
+    # Test case 3: Larger current_estimates
+    y = np.array([1, -1])
+    current_estimates = np.array([2, -2])
+    expected_residuals_1st = np.array([0.03597242, -0.03597242])
+    expected_residuals_2nd = np.array([-0.07065082, -0.07065082])
+    actual_residuals_1st, actual_residuals_2nd = get_pseudo_residual_log_odds(
+        y, current_estimates, True
+    )
+    assert actual_residuals_2nd is not None
+    assert np.allclose(actual_residuals_1st, expected_residuals_1st)
+    assert np.allclose(actual_residuals_2nd, expected_residuals_2nd)
+
+
+def test_get_pseudo_residual_mse():
+    y = np.array([1.0, 2.0, 3.0])
+    current_estimates = np.array([0.5, 1.0, 2.0])
+    expected_residuals_1st = np.array([0.5, 1.0, 1.0])
+    expected_residuals_2nd = np.array([-1.0, -1.0, -1.0])
+    actual_residuals_1st, actual_residuals_2nd = get_pseudo_residual_mse(
+        y, current_estimates, True
+    )
+    assert actual_residuals_2nd is not None
+    assert np.allclose(actual_residuals_1st, expected_residuals_1st)
+    assert np.allclose(actual_residuals_2nd, expected_residuals_2nd)
+
+    # Test with negative values
+    y = np.array([-1.0, -2.0, -3.0])
+    current_estimates = np.array([-0.5, -1.0, -2.0])
+    expected_residuals_1st = np.array([-0.5, -1.0, -1.0])
+    expected_residuals_2nd = np.array([-1.0, -1.0, -1.0])
+    actual_residuals_1st, actual_residuals_2nd = get_pseudo_residual_mse(
+        y, current_estimates, True
+    )
+    assert actual_residuals_2nd is not None
+    assert np.allclose(actual_residuals_1st, expected_residuals_1st)
+    assert np.allclose(actual_residuals_2nd, expected_residuals_2nd)
+
+    # Test with zero values
+    y = np.array([0.0, 0.0, 0.0])
+    current_estimates = np.array([0.0, 0.0, 0.0])
+    expected_residuals_1st = np.array([0.0, 0.0, 0.0])
+    expected_residuals_2nd = np.array([-1.0, -1.0, -1.0])
+    actual_residuals_1st, actual_residuals_2nd = get_pseudo_residual_mse(
+        y, current_estimates, True
+    )
+    assert actual_residuals_2nd is not None
+    assert np.allclose(actual_residuals_1st, expected_residuals_1st)
+    assert np.allclose(actual_residuals_2nd, expected_residuals_2nd)
diff --git a/tests/test_transform.py b/tests/test_transform.py
new file mode 100644
index 0000000..42cefd2
--- /dev/null
+++ b/tests/test_transform.py
@@ -0,0 +1,65 @@
+import numpy as np
+import pytest
+
+from random_tree_models.transform import (
+    bool_to_float,
+    get_probabilities_from_mapped_bools,
+    vectorize_bool_to_float,
+)
+
+
+def test_vectorize_bool_to_float():
+    y = np.array([True, False, True, False])
+    res = vectorize_bool_to_float(y)
+    assert np.all(res == np.array([1.0, -1.0, 1.0, -1.0]))
+
+    y = np.array([True, False, True, True])
+    res = vectorize_bool_to_float(y)
+    assert np.all(res == np.array([1.0, -1.0, 1.0, 1.0]))
+
+    y = np.array([False, False, True, False])
+    res = vectorize_bool_to_float(y)
+    assert np.all(res == np.array([-1.0, -1.0, 1.0, -1.0]))
+
+
+@pytest.mark.parametrize(
+    "x,exp,is_bad",
+    [
+        (True, 1, False),
+        (False, -1, False),
+        ("a", None, True),
+        (1, 1, False),
+        (0, -1, False),
+        (-1, None, True),
+        (None, None, True),
+    ],
+)
+def test_bool_to_float(x, exp, is_bad: bool):
+    try:
+        # line to test
+        res = bool_to_float(x)
+    except ValueError as ex:
+        if is_bad:
+            pass  # Failed expectedly to convert non-bool values
+        else:
+            raise ex
+    else:
+        if is_bad:
+            pytest.fail(f"Passed unexpectedly for non-bool value {x} returning {res}")
+        assert res == exp
+
+
+def test_get_probabilities_from_mapped_bools():
+    h = np.array([0.0, 1.0, -1.0])
+    actual = get_probabilities_from_mapped_bools(h)
+    expected = np.array(
+        [[0.5, 0.5], [0.11920292, 0.88079708], [0.88079708, 0.11920292]]
+    )
+    assert np.allclose(actual, expected)
+
+    h = np.array([0.5, -0.5, 0.2])
+    actual = get_probabilities_from_mapped_bools(h)
+    expected = np.array(
+        [[0.26894142, 0.73105858], [0.73105858, 0.26894142], [0.40131234, 0.59868766]]
+    )
+    assert np.allclose(actual, expected)
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 2a11693..02c7e22 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,9 +1,5 @@
 import logging
 
-import numpy as np
-import pytest
-
-import random_tree_models.utils
 import random_tree_models.utils as utils
 
 
@@ -12,42 +8,3 @@ def test_get_logger():
     assert isinstance(logger, logging.Logger)
     assert logger.name == "rich"
     assert logger.level == logging.INFO
-
-
-@pytest.mark.parametrize(
-    "x,exp,is_bad",
-    [
-        (True, 1, False),
-        (False, -1, False),
-        ("a", None, True),
-        (1, 1, False),
-        (0, -1, False),
-        (-1, None, True),
-        (None, None, True),
-    ],
-)
-def test_bool_to_float(x, exp, is_bad: bool):
-    try:
-        # line to test
-        res = random_tree_models.utils.bool_to_float(x)
-    except ValueError as ex:
-        if is_bad:
-            pass  # Failed expectedly to convert non-bool values
-    else:
-        if is_bad:
-            pytest.fail(f"Passed unexpectedly for non-bool value {x} returning {res}")
-        assert res == exp
-
-
-def test_vectorize_bool_to_float():
-    y = np.array([True, False, True, False])
-    res = utils.vectorize_bool_to_float(y)
-    assert np.all(res == np.array([1.0, -1.0, 1.0, -1.0]))
-
-    y = np.array([True, False, True, True])
-    res = utils.vectorize_bool_to_float(y)
-    assert np.all(res == np.array([1.0, -1.0, 1.0, 1.0]))
-
-    y = np.array([False, False, True, False])
-    res = utils.vectorize_bool_to_float(y)
-    assert np.all(res == np.array([-1.0, -1.0, 1.0, -1.0]))
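---

Usage sketch: a minimal, illustrative example of how the relocated helpers fit together, assuming the module layout introduced by this patch; the toy arrays and variable names below are made up for illustration and are not part of the patch.

import numpy as np

from random_tree_models.gradient import (
    get_pseudo_residual_log_odds,
    get_pseudo_residual_mse,
    get_start_estimate_log_odds,
    get_start_estimate_mse,
)
from random_tree_models.transform import (
    get_probabilities_from_mapped_bools,
    vectorize_bool_to_float,
)

# regression: start from the mean of y, then fit each tree to the pseudo residuals
y_reg = np.array([1.0, 2.0, 3.0])
estimates = get_start_estimate_mse(y_reg) * np.ones_like(y_reg)
# g is the pseudo residual; h is None unless second_order=True
# (the xgboost estimators need h, the plain gradient boosted trees do not)
g, h = get_pseudo_residual_mse(y_reg, estimates, second_order=True)

# classification: map bools to {-1, 1}, start from half the log odds of the mean
y_clf = vectorize_bool_to_float(np.array([True, True, False]))
estimates = get_start_estimate_log_odds(y_clf) * np.ones_like(y_clf)
g, h = get_pseudo_residual_log_odds(y_clf, estimates, second_order=True)

# after boosting, the accumulated raw scores are mapped back to class probabilities,
# column 0 for the negative class and column 1 for the positive class
proba = get_probabilities_from_mapped_bools(estimates)  # shape (n_samples, 2)

This mirrors how the GradientBoostedTrees* estimators call the helpers with second_order=False, while the XGBoost estimators request both derivatives and raise if h is None.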