72 changes: 72 additions & 0 deletions src/random_tree_models/gradient.py
@@ -0,0 +1,72 @@
import math

import numpy as np


def check_y_float(y_float: np.ndarray):
    # expects y_float to consist only of the values -1 and 1
    unexpected_values = np.abs(y_float) != 1
    if np.sum(unexpected_values) > 0:
        raise ValueError(
            f"expected y_float to contain only -1 and 1, got {y_float[unexpected_values]}"
        )


def get_pseudo_residual_mse(
    y: np.ndarray, current_estimates: np.ndarray, second_order: bool
) -> tuple[np.ndarray, np.ndarray | None]:
    """
    mse loss = 0.5 * sum_i (y_i - estimate_i)^2 (the 0.5 absorbs the constant 2 in the derivative)
    d mse loss / d estimate_i = -(y_i - estimate_i)
    since we want to apply it as the negative gradient for steepest descent we flip the sign:
    the returned first derivative is the pseudo residual y_i - estimate_i, and the returned
    second derivative is d(pseudo residual) / d estimate_i = -1
    """
    first_derivative = y - current_estimates

    second_derivative = None
    if second_order:
        second_derivative = -1 * np.ones_like(first_derivative)

    return first_derivative, second_derivative


def get_pseudo_residual_log_odds(
    y: np.ndarray, current_estimates: np.ndarray, second_order: bool
) -> tuple[np.ndarray, np.ndarray | None]:
    """
    Pseudo residuals of the binomial deviance loss sum_i log(1 + exp(-2 * y_i * estimate_i))
    for labels y in {-1, 1}.

    first derivative: negative gradient of the loss w.r.t. current_estimates,
        i.e. 2 * y / (1 + exp(2 * y * estimate)); the sign-flipped g of the xgboost paper
    second derivative: derivative of the first derivative w.r.t. current_estimates;
        the sign-flipped h of the xgboost paper
    """
    check_y_float(y)

    a = np.exp(2 * y * current_estimates)
    first_derivative = 2 * y / (1 + a)

    second_derivative = None
    if second_order:
        second_derivative = -(4 * y**2 * a / (1 + a) ** 2)

    return first_derivative, second_derivative


def get_start_estimate_mse(y: np.ndarray) -> float:
    return float(np.mean(y))


def get_start_estimate_log_odds(y: np.ndarray) -> float:
    """
    Returns 0.5 * log((1 + ym) / (1 - ym)) with ym = mean(y) in [-1, 1].
    This is the half log-odds 0.5 * log(p / (1 - p)) of p = (1 + ym) / 2 in [0, 1],
    matching the margin parametrisation p = 1 / (1 + exp(-2 * estimate)) used elsewhere.
    """
    check_y_float(y)

    ym = np.mean(y)

    if ym == 1:
        return math.inf
    elif ym == -1:
        return -math.inf

    start_estimate = 0.5 * math.log((1 + ym) / (1 - ym))

    return start_estimate
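
To double-check the derivative formulas documented above, a finite-difference comparison against the helpers can look like the following. This is a minimal sketch: only the module path random_tree_models.gradient is taken from this diff, the test data and tolerances are illustrative.

import numpy as np

from random_tree_models.gradient import (
    get_pseudo_residual_log_odds,
    get_pseudo_residual_mse,
)

rng = np.random.default_rng(0)
y = rng.choice([-1.0, 1.0], size=50)
estimates = rng.normal(size=50)
eps = 1e-6


def deviance(est: np.ndarray) -> np.ndarray:
    # per-sample binomial deviance log(1 + exp(-2 * y * estimate))
    return np.log(1.0 + np.exp(-2.0 * y * est))


g, h = get_pseudo_residual_log_odds(y, estimates, second_order=True)

# g should match the negative gradient of the deviance ...
g_fd = -(deviance(estimates + eps) - deviance(estimates - eps)) / (2 * eps)
assert np.allclose(g, g_fd, atol=1e-4)

# ... and h the derivative of g itself (the sign-flipped hessian)
g_plus, _ = get_pseudo_residual_log_odds(y, estimates + eps, second_order=False)
g_minus, _ = get_pseudo_residual_log_odds(y, estimates - eps, second_order=False)
assert np.allclose(h, (g_plus - g_minus) / (2 * eps), atol=1e-4)

# same idea for the squared error helper
g_mse, h_mse = get_pseudo_residual_mse(y, estimates, second_order=True)
assert np.allclose(g_mse, y - estimates)
assert np.allclose(h_mse, -1.0)
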
60 changes: 14 additions & 46 deletions src/random_tree_models/models/gradientboostedtrees.py
@@ -1,4 +1,3 @@
import math
import typing as T

import numpy as np
@@ -14,11 +13,20 @@
validate_data, # type: ignore
)

from random_tree_models.gradient import (
get_pseudo_residual_log_odds,
get_pseudo_residual_mse,
get_start_estimate_log_odds,
get_start_estimate_mse,
)
from random_tree_models.models.decisiontree import (
DecisionTreeRegressor,
)
from random_tree_models.params import MetricNames, is_greater_zero
from random_tree_models.utils import vectorize_bool_to_float
from random_tree_models.transform import (
get_probabilities_from_mapped_bools,
vectorize_bool_to_float,
)


class GradientBoostedTreesTemplate(base.BaseEstimator):
@@ -54,42 +62,6 @@ def predict(self, X: np.ndarray) -> np.ndarray:
raise NotImplementedError()


def get_pseudo_residual_mse(y: np.ndarray, current_estimates: np.ndarray) -> np.ndarray:
"""
mse loss = sum_i (y_i - estimate_i)^2
pseudo residual_i = d mse loss(y,estimate) / d estimate_i = - (y_i - estimate_i)
since we want to apply it as the negative gradient for steepest descent we flip the sign
"""
return y - current_estimates


def get_pseudo_residual_log_odds(
y: np.ndarray, current_estimates: np.ndarray
) -> np.ndarray:
"""
# dloss/dyhat, g in the xgboost paper
"""
return 2 * y / (1 + np.exp(2 * y * current_estimates))


def get_start_estimate_mse(y: np.ndarray) -> float:
return float(np.mean(y))


def get_start_estimate_log_odds(y: np.ndarray) -> float:
"""
1/2 log(1+ym)/(1-ym) because ym is in [-1, 1]
equivalent to log(ym)/(1-ym) if ym were in [0, 1]
"""
ym = np.mean(y)
if ym == 1:
return math.inf
elif ym == -1:
return -math.inf
start_estimate = 0.5 * math.log((1 + ym) / (1 - ym))
return start_estimate


def find_step_size(
y: np.ndarray, current_estimates: np.ndarray, h: np.ndarray
) -> float:
@@ -112,12 +84,6 @@ def loss(gamma: float) -> float:
return 1.0


def get_probabilities_from_mapped_bools(h: np.ndarray) -> np.ndarray:
proba = 1 / (1 + np.exp(-2.0 * h))
proba = np.array([1 - proba, proba]).T
return proba


class GradientBoostedTreesRegressor(
base.RegressorMixin,
GradientBoostedTreesTemplate,
@@ -156,7 +122,7 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> "GradientBoostedTreesRegressor":
self.step_sizes_: list[float] = []

for _ in track(range(self.n_trees), total=self.n_trees, description="tree"):
r = get_pseudo_residual_mse(y, current_estimates)
r, _ = get_pseudo_residual_mse(y, current_estimates, second_order=False)

# train decision tree to predict differences
new_tree = DecisionTreeRegressor(
Expand Down Expand Up @@ -311,7 +277,9 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> "GradientBoostedTreesClassifier":
self.step_sizes_: list[float] = []

for _ in track(range(self.n_trees), description="tree", total=self.n_trees):
r = get_pseudo_residual_log_odds(y, current_estimates)
r, _ = get_pseudo_residual_log_odds(
y, current_estimates, second_order=False
)

new_tree = DecisionTreeRegressor(
measure_name=self.measure_name,
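
To see the refactored helpers in context, the regressor's fit loop reduces to roughly the sketch below. sklearn's DecisionTreeRegressor stands in for the repository's own tree and a fixed learning rate replaces the find_step_size line search, so this illustrates the data flow rather than the class's actual implementation.

import numpy as np
from sklearn.tree import DecisionTreeRegressor  # stand-in for the repo's tree

from random_tree_models.gradient import (
    get_pseudo_residual_mse,
    get_start_estimate_mse,
)


def boosted_fit_sketch(X: np.ndarray, y: np.ndarray, n_trees: int = 10, lr: float = 0.1):
    trees = []
    start = get_start_estimate_mse(y)
    current_estimates = start * np.ones_like(y, dtype=float)

    for _ in range(n_trees):
        # pseudo residuals are what the next tree is trained to predict
        r, _ = get_pseudo_residual_mse(y, current_estimates, second_order=False)
        tree = DecisionTreeRegressor(max_depth=2).fit(X, r)
        trees.append(tree)
        current_estimates = current_estimates + lr * tree.predict(X)

    return start, trees


# usage on toy data
rng = np.random.default_rng(0)
X = rng.normal(size=(200, 3))
y = X @ np.array([1.0, -2.0, 0.5]) + rng.normal(scale=0.1, size=200)
start, trees = boosted_fit_sketch(X, y)
pred = start + 0.1 * sum(tree.predict(X) for tree in trees)
print(float(np.mean((y - pred) ** 2)))
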
104 changes: 40 additions & 64 deletions src/random_tree_models/models/xgboost.py
@@ -10,7 +9,6 @@
* sparsity-aware split finding / "default direction" for missing values
"""

import math
import typing as T

import numpy as np
@@ -26,9 +25,18 @@
validate_data, # type: ignore
)

from random_tree_models.gradient import (
get_pseudo_residual_log_odds,
get_pseudo_residual_mse,
get_start_estimate_log_odds,
get_start_estimate_mse,
)
from random_tree_models.models.decisiontree import DecisionTreeRegressor
from random_tree_models.params import MetricNames, is_greater_zero
from random_tree_models.utils import vectorize_bool_to_float
from random_tree_models.transform import (
get_probabilities_from_mapped_bools,
vectorize_bool_to_float,
)


class XGBoostTemplate(base.BaseEstimator):
@@ -73,15 +81,6 @@ def predict(self, X: np.ndarray) -> np.ndarray:
raise NotImplementedError()


def compute_derivatives_negative_least_squares(
y: np.ndarray, start_estimate: float
) -> T.Tuple[np.ndarray, np.ndarray]:
"loss = - mean |y-yhat|^2"
g = y - start_estimate # 1st order derivative
h = -1 * np.ones_like(g) # 2nd order derivative
return g, h


# TODO: add tests:
# * X_hist is integer based
# * X_hist has the same shape as X
@@ -144,14 +143,19 @@ class XGBoostRegressor(base.RegressorMixin, XGBoostTemplate):
"""

def fit(self, X: np.ndarray, y: np.ndarray) -> "XGBoostRegressor":
X, y = validate_data(self, X, y, ensure_all_finite=False)
X, y = validate_data(self, X, y, ensure_all_finite=self.ensure_all_finite)

self.trees_: list[DecisionTreeRegressor] = []

self.start_estimate_: float = float(np.mean(y))
self.start_estimate_ = get_start_estimate_mse(y)

# initial differences to predict using negative squared error loss
g, h = compute_derivatives_negative_least_squares(y, self.start_estimate_)
current_estimates = self.start_estimate_ * np.ones_like(y)
g, h = get_pseudo_residual_mse(y, current_estimates, second_order=True)

if h is None:
    raise ValueError("h cannot be None beyond this stage.")

if self.use_hist:
X_hist, all_x_bin_edges = xgboost_histogrammify_with_h(
X, h, n_bins=self.n_bins
@@ -179,7 +183,9 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> "XGBoostRegressor":
def predict(self, X: np.ndarray) -> np.ndarray:
check_is_fitted(self, ("trees_", "n_features_in_", "start_estimate_"))

X = validate_data(self, X, reset=False, ensure_all_finite=False)
X = validate_data(
self, X, reset=False, ensure_all_finite=self.ensure_all_finite
)

# baseline estimate
y = np.ones(X.shape[0]) * self.start_estimate_
@@ -197,41 +203,6 @@ def predict(self, X: np.ndarray) -> np.ndarray:
return y


def check_y_float(y_float: np.ndarray):
# expects y_float to consist only of the values -1 and 1
unexpected_values = np.abs(y_float) != 1
if np.sum(unexpected_values) > 0:
raise ValueError(
f"expected y_float to contain only -1 and 1, got {y_float[unexpected_values]}"
)


def compute_start_estimate_binomial_loglikelihood(y_float: np.ndarray) -> float:
check_y_float(y_float)

ym = np.mean(y_float)
start_estimate = 0.5 * math.log((1 + ym) / (1 - ym))

return start_estimate


def compute_derivatives_binomial_loglikelihood(
y_float: np.ndarray, yhat: np.ndarray
) -> T.Tuple[np.ndarray, np.ndarray]:
"loss = - sum log(1+exp(2*y*yhat))"

check_y_float(y_float)

# differences to predict using binomial log-likelihood (yes, the negative of the negative :P)
exp_y_yhat = np.exp(2 * y_float * yhat)
g = 2 * y_float / (1 + exp_y_yhat) # dloss/dyhat, g in the xgboost paper

# d^2loss/dyhat^2, h in the xgboost paper
h = -(4 * y_float**2 * exp_y_yhat / (1 + exp_y_yhat) ** 2)

return g, h


class XGBoostClassifier(base.ClassifierMixin, XGBoostTemplate):
"""XGBoost classifier

Expand All @@ -246,7 +217,7 @@ def __sklearn_tags__(self):
return tags

def fit(self, X: np.ndarray, y: np.ndarray) -> "XGBoostClassifier":
X, y = validate_data(self, X, y, ensure_all_finite=False)
X, y = validate_data(self, X, y, ensure_all_finite=self.ensure_all_finite)

check_classification_targets(y)

@@ -269,11 +240,16 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> "XGBoostClassifier":
y = vectorize_bool_to_float(y)

# initial estimate
self.start_estimate_ = compute_start_estimate_binomial_loglikelihood(y)
yhat = np.ones_like(y) * self.start_estimate_
self.start_estimate_ = get_start_estimate_log_odds(y)
current_estimates = np.ones_like(y) * self.start_estimate_

for _ in track(range(self.n_trees), description="tree", total=self.n_trees):
g, h = compute_derivatives_binomial_loglikelihood(y, yhat)
g, h = get_pseudo_residual_log_odds(
    y, current_estimates, second_order=True
)

if h is None:
    raise ValueError("h cannot be None beyond this stage.")

if self.use_hist:
_X, all_x_bin_edges = xgboost_histogrammify_with_h(
@@ -293,16 +269,17 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> "XGBoostClassifier":
new_tree.fit(_X, y, g=g, h=h)
self.trees_.append(new_tree)

# update _y
yhat += new_tree.predict(X)
current_estimates += new_tree.predict(X)

return self

def predict_proba(self, X: np.ndarray) -> np.ndarray:
check_is_fitted(self, ("trees_", "classes_", "gammas_", "n_features_in_"))
X = validate_data(self, X, reset=False, ensure_all_finite=False)
X = validate_data(
self, X, reset=False, ensure_all_finite=self.ensure_all_finite
)

g = np.ones(X.shape[0]) * self.start_estimate_
h = np.ones(X.shape[0]) * self.start_estimate_

for boost, tree in track(
enumerate(self.trees_), description="tree", total=len(self.trees_)
@@ -314,16 +291,15 @@ def predict_proba(self, X: np.ndarray) -> np.ndarray:
else:
_X = X

g += tree.predict(_X)
h += tree.predict(_X)

proba = 1 / (1 + np.exp(-2.0 * g))
proba = np.array([1 - proba, proba]).T
return proba
p = get_probabilities_from_mapped_bools(h)
return p

def predict(self, X: np.ndarray) -> np.ndarray:
proba = self.predict_proba(X)
p = self.predict_proba(X)

ix = np.argmax(proba, axis=1)
ix = np.argmax(p, axis=1)
y = self.classes_[ix]

return y
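
The diff does not show how DecisionTreeRegressor consumes g and h internally, so the snippet below is only an illustration of the standard unregularized second-order step those quantities support: with the sign-flipped derivatives from random_tree_models.gradient, a leaf's Newton weight is -sum(g) / sum(h), which reproduces the mean residual for squared error and Friedman's sum(r) / sum(|r| * (2 - |r|)) update for the two-class log-odds loss.

import numpy as np

from random_tree_models.gradient import (
    get_pseudo_residual_log_odds,
    get_pseudo_residual_mse,
)

rng = np.random.default_rng(1)

# squared error: the Newton weight of a leaf is just the mean residual
y = rng.normal(size=100)
estimates = rng.normal(size=100)
g, h = get_pseudo_residual_mse(y, estimates, second_order=True)
newton_weight = -np.sum(g) / np.sum(h)
assert np.isclose(newton_weight, np.mean(y - estimates))

# log-odds loss: the Newton weight matches Friedman's leaf update
y = rng.choice([-1.0, 1.0], size=100)
estimates = rng.normal(size=100)
g, h = get_pseudo_residual_log_odds(y, estimates, second_order=True)
newton_weight = -np.sum(g) / np.sum(h)
r = g  # the pseudo residual
friedman_weight = np.sum(r) / np.sum(np.abs(r) * (2.0 - np.abs(r)))
assert np.isclose(newton_weight, friedman_weight)
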
21 changes: 21 additions & 0 deletions src/random_tree_models/transform.py
@@ -0,0 +1,21 @@
import numpy as np


def bool_to_float(x: bool) -> float:
    # equality comparison (rather than `is`) so that numpy bool_ values
    # coming out of np.vectorize are handled as well as Python bools
    if x == True:
        return 1.0
    elif x == False:
        return -1.0
    else:
        raise ValueError(f"{x=}, expected bool")


def vectorize_bool_to_float(y: np.ndarray) -> np.ndarray:
    f = np.vectorize(bool_to_float)
    return f(y)


def get_probabilities_from_mapped_bools(h: np.ndarray) -> np.ndarray:
    p = 1 / (1 + np.exp(-2.0 * h))
    p = np.array([1 - p, p]).T
    return p
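
A small usage sketch of the two helpers moved into this module (the values are illustrative): the margin-to-probability mapping is the sigmoid of twice the margin, returned per row as the pair of probabilities for the classes mapped to -1 and +1.

import numpy as np

from random_tree_models.transform import (
    get_probabilities_from_mapped_bools,
    vectorize_bool_to_float,
)

y_bool = np.array([True, False, True, True])
y_float = vectorize_bool_to_float(y_bool)
print(y_float)  # [ 1. -1.  1.  1.]

# margins, i.e. the summed start estimate plus tree outputs in the classifiers above
h = np.array([-2.0, 0.0, 0.5, 3.0])
proba = get_probabilities_from_mapped_bools(h)

print(proba.shape)  # (4, 2): column 0 for the class mapped to -1, column 1 for +1
assert np.allclose(proba.sum(axis=1), 1.0)
assert np.allclose(proba[:, 1], 1.0 / (1.0 + np.exp(-2.0 * h)))
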