36 changes: 21 additions & 15 deletions src/random_tree_models/decisiontree/estimators.py
```diff
@@ -5,11 +5,17 @@
 from sklearn.utils.multiclass import check_classification_targets, type_of_target
 from sklearn.utils.validation import check_is_fitted, validate_data  # type: ignore
 
-import random_tree_models.params
 from random_tree_models.decisiontree.node import Node
 from random_tree_models.decisiontree.predict import predict_with_tree
 from random_tree_models.decisiontree.train import grow_tree
-from random_tree_models.params import MetricNames
+from random_tree_models.params import (
+    ColumnSelectionMethod,
+    ColumnSelectionParameters,
+    MetricNames,
+    ThresholdSelectionMethod,
+    ThresholdSelectionParameters,
+    TreeGrowthParameters,
+)
 
 
 class DecisionTreeTemplate(base.BaseEstimator):
@@ -19,32 +25,32 @@ class DecisionTreeTemplate(base.BaseEstimator):
     """
 
     max_depth: int
-    measure_name: random_tree_models.params.MetricNames
+    measure_name: MetricNames
     min_improvement: float
     lam: float
     frac_subsamples: float
     frac_features: float
     random_state: int
-    threshold_method: random_tree_models.params.ThresholdSelectionMethod
+    threshold_method: ThresholdSelectionMethod
     threshold_quantile: float
     n_thresholds: int
-    column_method: random_tree_models.params.ColumnSelectionMethod
+    column_method: ColumnSelectionMethod
     n_columns_to_try: int | None
     ensure_all_finite: bool
     tree_: Node
 
     def __init__(
         self,
-        measure_name: random_tree_models.params.MetricNames,
+        measure_name: MetricNames,
         max_depth: int = 2,
         min_improvement: float = 0.0,
         lam: float = 0.0,
         frac_subsamples: float = 1.0,
         frac_features: float = 1.0,
-        threshold_method: random_tree_models.params.ThresholdSelectionMethod = random_tree_models.params.ThresholdSelectionMethod.bruteforce,
+        threshold_method: ThresholdSelectionMethod = ThresholdSelectionMethod.bruteforce,
         threshold_quantile: float = 0.1,
         n_thresholds: int = 100,
-        column_method: random_tree_models.params.ColumnSelectionMethod = random_tree_models.params.ColumnSelectionMethod.ascending,
+        column_method: ColumnSelectionMethod = ColumnSelectionMethod.ascending,
         n_columns_to_try: int | None = None,
         random_state: int = 42,
         ensure_all_finite: bool = True,
@@ -64,20 +70,20 @@ def __init__(
         self.ensure_all_finite = ensure_all_finite
 
     def _organize_growth_parameters(self):
-        self.growth_params_ = random_tree_models.params.TreeGrowthParameters(
+        self.growth_params_ = TreeGrowthParameters(
            max_depth=self.max_depth,
            min_improvement=self.min_improvement,
            lam=-abs(self.lam),
            frac_subsamples=float(self.frac_subsamples),
            frac_features=float(self.frac_features),
            random_state=int(self.random_state),
-            threshold_params=random_tree_models.params.ThresholdSelectionParameters(
+            threshold_params=ThresholdSelectionParameters(
                method=self.threshold_method,
                quantile=self.threshold_quantile,
                n_thresholds=self.n_thresholds,
                random_state=int(self.random_state),
            ),
-            column_params=random_tree_models.params.ColumnSelectionParameters(
+            column_params=ColumnSelectionParameters(
                method=self.column_method,
                n_trials=self.n_columns_to_try,
            ),
@@ -142,10 +148,10 @@ def __init__(
         lam: float = 0.0,
         frac_subsamples: float = 1.0,
         frac_features: float = 1.0,
-        threshold_method: random_tree_models.params.ThresholdSelectionMethod = random_tree_models.params.ThresholdSelectionMethod.bruteforce,
+        threshold_method: ThresholdSelectionMethod = ThresholdSelectionMethod.bruteforce,
         threshold_quantile: float = 0.1,
         n_thresholds: int = 100,
-        column_method: random_tree_models.params.ColumnSelectionMethod = random_tree_models.params.ColumnSelectionMethod.ascending,
+        column_method: ColumnSelectionMethod = ColumnSelectionMethod.ascending,
         n_columns_to_try: int | None = None,
         random_state: int = 42,
         ensure_all_finite: bool = True,
@@ -215,10 +221,10 @@ def __init__(
         lam: float = 0.0,
         frac_subsamples: float = 1.0,
         frac_features: float = 1.0,
-        threshold_method: random_tree_models.params.ThresholdSelectionMethod = random_tree_models.params.ThresholdSelectionMethod.bruteforce,
+        threshold_method: ThresholdSelectionMethod = ThresholdSelectionMethod.bruteforce,
         threshold_quantile: float = 0.1,
         n_thresholds: int = 100,
-        column_method: random_tree_models.params.ColumnSelectionMethod = random_tree_models.params.ColumnSelectionMethod.ascending,
+        column_method: ColumnSelectionMethod = ColumnSelectionMethod.ascending,
         n_columns_to_try: int | None = None,
         random_state: int = 42,
         ensure_all_finite: bool = True,
```
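Note on the estimators change: `_organize_growth_parameters` packs the estimator's flat keyword arguments into the nested parameter objects used for tree growth. Below is a minimal standalone sketch of the equivalent construction, using only the classes and keyword arguments visible in this diff and the default values from the `__init__` signature above; it is illustrative and not part of the change.

```python
# Illustrative sketch (not part of the PR): building the same nested parameter
# objects that DecisionTreeTemplate._organize_growth_parameters assembles,
# with the default values from the __init__ signature shown above.
from random_tree_models.params import (
    ColumnSelectionMethod,
    ColumnSelectionParameters,
    ThresholdSelectionMethod,
    ThresholdSelectionParameters,
    TreeGrowthParameters,
)

growth_params = TreeGrowthParameters(
    max_depth=2,
    min_improvement=0.0,
    lam=-abs(0.0),  # the estimator negates lam before storing it, as in the diff
    frac_subsamples=1.0,
    frac_features=1.0,
    random_state=42,
    threshold_params=ThresholdSelectionParameters(
        method=ThresholdSelectionMethod.bruteforce,
        quantile=0.1,
        n_thresholds=100,
        random_state=42,
    ),
    column_params=ColumnSelectionParameters(
        method=ColumnSelectionMethod.ascending,
        n_trials=None,  # default n_columns_to_try from the signature above
    ),
)
```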
36 changes: 21 additions & 15 deletions src/random_tree_models/decisiontree/split.py
```diff
@@ -2,15 +2,22 @@
 
 import numpy as np
 
-import random_tree_models.params
 import random_tree_models.scoring as scoring
 from random_tree_models.decisiontree.node import Node
 from random_tree_models.decisiontree.split_objects import BestSplit
+from random_tree_models.params import (
+    ColumnSelectionMethod,
+    ColumnSelectionParameters,
+    MetricNames,
+    ThresholdSelectionMethod,
+    ThresholdSelectionParameters,
+    TreeGrowthParameters,
+)
 
 
 def select_thresholds(
     feature_values: np.ndarray,
-    threshold_params: random_tree_models.params.ThresholdSelectionParameters,
+    threshold_params: ThresholdSelectionParameters,
     rng: np.random.RandomState,
 ) -> np.ndarray:
     "Selects thresholds to use for splitting"
@@ -19,9 +26,9 @@ def select_thresholds(
     n_thresholds = threshold_params.n_thresholds
     num_quantile_steps = threshold_params.num_quantile_steps
 
-    if method == random_tree_models.params.ThresholdSelectionMethod.bruteforce:
+    if method == ThresholdSelectionMethod.bruteforce:
         return feature_values[1:]
-    elif method == random_tree_models.params.ThresholdSelectionMethod.random:
+    elif method == ThresholdSelectionMethod.random:
         if len(feature_values) - 1 <= n_thresholds:
             return feature_values[1:]
         else:
@@ -30,10 +37,10 @@
                 size=(n_thresholds,),
                 replace=False,
             )
-    elif method == random_tree_models.params.ThresholdSelectionMethod.quantile:
+    elif method == ThresholdSelectionMethod.quantile:
         qs = np.linspace(0, 1, num_quantile_steps)
         return np.quantile(feature_values[1:], qs)
-    elif method == random_tree_models.params.ThresholdSelectionMethod.uniform:
+    elif method == ThresholdSelectionMethod.uniform:
         x = np.linspace(
             feature_values.min(),
             feature_values.max(),
@@ -46,7 +53,7 @@
 
 def get_thresholds_and_target_groups(
     feature_values: np.ndarray,
-    threshold_params: random_tree_models.params.ThresholdSelectionParameters,
+    threshold_params: ThresholdSelectionParameters,
     rng: np.random.RandomState,
 ) -> T.Generator[T.Tuple[np.ndarray, np.ndarray, bool | None], None, None]:
     "Creates a generator for split finding, returning the used threshold, the target groups and a bool indicating if the default direction is left"
@@ -77,21 +84,21 @@
 
 def get_column(
     X: np.ndarray,
-    column_params: random_tree_models.params.ColumnSelectionParameters,
+    column_params: ColumnSelectionParameters,
     rng: np.random.RandomState,
 ) -> list[int]:
     # select column order to split on
     method = column_params.method
     n_columns_to_try = column_params.n_trials
 
     columns = list(range(X.shape[1]))
-    if method == random_tree_models.params.ColumnSelectionMethod.ascending:
+    if method == ColumnSelectionMethod.ascending:
         pass
-    elif method == random_tree_models.params.ColumnSelectionMethod.random:
+    elif method == ColumnSelectionMethod.random:
         columns = np.array(columns)
         rng.shuffle(columns)
         columns = columns.tolist()
-    elif method == random_tree_models.params.ColumnSelectionMethod.largest_delta:
+    elif method == ColumnSelectionMethod.largest_delta:
         deltas = X.max(axis=0) - X.min(axis=0)
         weights = deltas / deltas.sum()
         columns = np.array(columns)
@@ -114,8 +121,7 @@ def find_best_split(
     yhat: np.ndarray | None = None,
     g: np.ndarray | None = None,
     h: np.ndarray | None = None,
-    growth_params: random_tree_models.params.TreeGrowthParameters
-    | None = None,  # TODO: make required
+    growth_params: TreeGrowthParameters | None = None,  # TODO: make required
     rng: np.random.RandomState = np.random.RandomState(42),
 ) -> BestSplit:
     """Find the best split, detecting the "default direction" with missing data."""
@@ -141,7 +147,7 @@
         feature_values, growth_params.threshold_params, rng
     ):
         split_score = scoring.calc_split_score(
-            random_tree_models.params.MetricNames(measure_name),
+            MetricNames(measure_name),
            y,
            target_groups,
            yhat=yhat,
@@ -167,7 +173,7 @@
 def check_if_split_sensible(
     best: BestSplit,
     parent_node: Node | None,
-    growth_params: random_tree_models.params.TreeGrowthParameters,
+    growth_params: TreeGrowthParameters,
 ) -> tuple[bool, float | None]:
     "Verifies if split is sensible, considering score gain and left/right group sizes"
     parent_is_none = parent_node is None
```
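For reference, the four `ThresholdSelectionMethod` branches of `select_thresholds` reduce to plain numpy calls. The sketch below reproduces them on a toy feature column; it is hand-written rather than taken from the PR, and the final argument of the `np.linspace` call in the truncated `uniform` branch is an assumption.

```python
# Hand-written illustration (not from this PR) of what each
# ThresholdSelectionMethod branch of select_thresholds boils down to.
import numpy as np

feature_values = np.array([0.1, 0.4, 0.4, 0.9, 1.3, 2.0])  # assumed sorted
rng = np.random.RandomState(42)
n_thresholds = 3
num_quantile_steps = 5

bruteforce = feature_values[1:]  # every candidate except the smallest value
random_sub = rng.choice(feature_values[1:], size=(n_thresholds,), replace=False)
quantiles = np.quantile(feature_values[1:], np.linspace(0, 1, num_quantile_steps))
uniform_grid = np.linspace(feature_values.min(), feature_values.max(), n_thresholds)
```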
10 changes: 1 addition & 9 deletions src/random_tree_models/gradientboostedtrees.py
```diff
@@ -15,6 +15,7 @@
 
 import random_tree_models.decisiontree as dtree
 from random_tree_models.params import MetricNames
+from random_tree_models.utils import bool_to_float
 
 
 class GradientBoostedTreesTemplate(base.BaseEstimator):
@@ -134,15 +135,6 @@ def predict(self, X: np.ndarray) -> np.ndarray:
         return y
 
 
-def bool_to_float(x: bool) -> float:
-    if x == True:
-        return 1.0
-    elif x == False:
-        return -1.0
-    else:
-        raise ValueError(f"{x=}, expected bool")
-
-
 class GradientBoostedTreesClassifier(
     base.ClassifierMixin,
     GradientBoostedTreesTemplate,
```
9 changes: 9 additions & 0 deletions src/random_tree_models/utils.py
```diff
@@ -19,3 +19,12 @@ def _get_logger(level=logging.INFO):
 
 
 logger = _get_logger()
+
+
+def bool_to_float(x: bool) -> float:
+    if x == True:
+        return 1.0
+    elif x == False:
+        return -1.0
+    else:
+        raise ValueError(f"{x=}, expected bool")
```
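`bool_to_float` now lives in `random_tree_models.utils` and is imported from there in `gradientboostedtrees.py`. A minimal usage sketch, with the behaviour read off the function body above (because the comparisons use `==`, the integers 0 and 1 are also accepted):

```python
# Minimal usage sketch of the relocated helper.
from random_tree_models.utils import bool_to_float

assert bool_to_float(True) == 1.0
assert bool_to_float(False) == -1.0
try:
    bool_to_float("a")  # anything not equal to True or False raises
except ValueError:
    pass
```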
2 changes: 1 addition & 1 deletion src/random_tree_models/xgboost.py
```diff
@@ -27,7 +27,7 @@
 )
 
 import random_tree_models.decisiontree as dtree
-import random_tree_models.gradientboostedtrees as gbt
+import random_tree_models.utils as gbt
 from random_tree_models.params import MetricNames
 
 
```
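Re-pointing the `gbt` alias at `random_tree_models.utils` keeps `gbt.bool_to_float(...)` references in `xgboost.py` working without further edits, assuming the helper is reached through that alias; the actual call sites are not shown in this diff. A hypothetical example of such a call site:

```python
# Hypothetical call site of the kind the renamed alias preserves.
import random_tree_models.utils as gbt

labels = [True, False, True]
targets = [gbt.bool_to_float(label) for label in labels]  # -> [1.0, -1.0, 1.0]
```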
11 changes: 6 additions & 5 deletions tests/test_extratrees.py
```diff
@@ -2,8 +2,11 @@
 import pytest
 from sklearn.utils.estimator_checks import parametrize_with_checks
 
-import random_tree_models.decisiontree as dtree
 import random_tree_models.extratrees as et
+from random_tree_models.decisiontree import (
+    DecisionTreeClassifier,
+    DecisionTreeRegressor,
+)
 from random_tree_models.params import MetricNames
 from tests.conftest import expected_failed_checks
 
@@ -37,9 +40,7 @@ class TestExtraTreesRegressor:
     def test_fit(self):
         model = et.ExtraTreesRegressor()
         model.fit(self.X, self.y)
-        assert all(
-            [isinstance(model, dtree.DecisionTreeRegressor) for model in model.trees_]
-        )
+        assert all([isinstance(model, DecisionTreeRegressor) for model in model.trees_])
 
     def test_predict(self):
         model = et.ExtraTreesRegressor()
@@ -69,7 +70,7 @@ def test_fit(self):
         model.fit(self.X, self.y)
         assert not hasattr(self.model, "classes_")
         assert all(
-            [isinstance(model, dtree.DecisionTreeClassifier) for model in model.trees_]
+            [isinstance(model, DecisionTreeClassifier) for model in model.trees_]
         )
 
     def test_predict(self):
```
37 changes: 5 additions & 32 deletions tests/test_gradientboostedtrees.py
```diff
@@ -2,8 +2,10 @@
 import pytest
 from sklearn.utils.estimator_checks import parametrize_with_checks
 
-import random_tree_models.decisiontree as dtree
 import random_tree_models.gradientboostedtrees as gbt
+from random_tree_models.decisiontree import (
+    DecisionTreeRegressor,
+)
 from tests.conftest import expected_failed_checks
 
 
@@ -36,9 +38,7 @@ class TestGradientBoostedTreesRegressor:
     def test_fit(self):
         model = gbt.GradientBoostedTreesRegressor()
         model.fit(self.X, self.y)
-        assert all(
-            [isinstance(model, dtree.DecisionTreeRegressor) for model in model.trees_]
-        )
+        assert all([isinstance(model, DecisionTreeRegressor) for model in model.trees_])
 
     def test_predict(self):
         model = gbt.GradientBoostedTreesRegressor()
@@ -67,9 +67,7 @@ def test_fit(self):
         model = gbt.GradientBoostedTreesClassifier()
         model.fit(self.X, self.y)
         assert not hasattr(self.model, "classes_")
-        assert all(
-            [isinstance(model, dtree.DecisionTreeRegressor) for model in model.trees_]
-        )
+        assert all([isinstance(model, DecisionTreeRegressor) for model in model.trees_])
 
     def test_predict(self):
         model = gbt.GradientBoostedTreesClassifier()
@@ -88,28 +86,3 @@ def test_gbt_estimators_with_sklearn_checks(estimator, check):
     Reference: https://scikit-learn.org/stable/modules/generated/sklearn.utils.estimator_checks.parametrize_with_checks.html#sklearn.utils.estimator_checks.parametrize_with_checks
     """
     check(estimator)
-
-
-@pytest.mark.parametrize(
-    "x,exp,is_bad",
-    [
-        (True, 1, False),
-        (False, -1, False),
-        ("a", None, True),
-        (1, 1, False),
-        (0, -1, False),
-        (-1, None, True),
-        (None, None, True),
-    ],
-)
-def test_bool_to_float(x, exp, is_bad: bool):
-    try:
-        # line to test
-        res = gbt.bool_to_float(x)
-    except ValueError as ex:
-        if is_bad:
-            pass  # Failed expectedly to convert non-bool values
-    else:
-        if is_bad:
-            pytest.fail(f"Passed unexpectedly for non-bool value {x} returning {res}")
-        assert res == exp
```