diff --git a/nbs/core/decision-tree.ipynb b/nbs/core/decision-tree.ipynb index 2c8c7b3..16e7d9b 100644 --- a/nbs/core/decision-tree.ipynb +++ b/nbs/core/decision-tree.ipynb @@ -63,7 +63,11 @@ "import seaborn as sns\n", "import sklearn.datasets as sk_datasets\n", "\n", - "import random_tree_models.decisiontree as dtree\n", + "from random_tree_models.decisiontree import (\n", + " DecisionTreeClassifier,\n", + " DecisionTreeRegressor,\n", + ")\n", + "from random_tree_models.decisiontree.visualize import show_tree\n", "from random_tree_models.scoring import MetricNames" ] }, @@ -111,7 +115,7 @@ "metadata": {}, "outputs": [], "source": [ - "model = dtree.DecisionTreeClassifier(measure_name=MetricNames.gini, max_depth=4)" + "model = DecisionTreeClassifier(measure_name=MetricNames.gini, max_depth=4)" ] }, { @@ -138,7 +142,7 @@ "metadata": {}, "outputs": [], "source": [ - "dtree.show_tree(model)" + "show_tree(model)" ] }, { @@ -216,7 +220,7 @@ "metadata": {}, "outputs": [], "source": [ - "model = dtree.DecisionTreeRegressor(measure_name=MetricNames.variance, max_depth=2)" + "model = DecisionTreeRegressor(measure_name=MetricNames.variance, max_depth=2)" ] }, { @@ -234,7 +238,7 @@ "metadata": {}, "outputs": [], "source": [ - "dtree.show_tree(model)" + "show_tree(model)" ] }, { diff --git a/nbs/core/extra-trees.ipynb b/nbs/core/extra-trees.ipynb index 123bd62..ed44855 100644 --- a/nbs/core/extra-trees.ipynb +++ b/nbs/core/extra-trees.ipynb @@ -41,10 +41,10 @@ "import seaborn as sns\n", "import sklearn.datasets as sk_datasets\n", "\n", - "import random_tree_models.decisiontree as dtree\n", "import random_tree_models.extratrees as et\n", "from random_tree_models.scoring import MetricNames\n", - "from random_tree_models.utils import ThresholdSelectionMethod" + "from random_tree_models.params import ThresholdSelectionMethod\n", + "from random_tree_models.decisiontree.visualize import show_tree" ] }, { @@ -143,7 +143,7 @@ "metadata": {}, "outputs": [], "source": [ - "dtree.show_tree(model.trees_[0])" + "show_tree(model.trees_[0])" ] }, { @@ -249,7 +249,7 @@ "metadata": {}, "outputs": [], "source": [ - "dtree.show_tree(model.trees_[0])" + "show_tree(model.trees_[0])" ] }, { diff --git a/nbs/core/gradient-boosted-trees.ipynb b/nbs/core/gradient-boosted-trees.ipynb index f6f9490..81bd07e 100644 --- a/nbs/core/gradient-boosted-trees.ipynb +++ b/nbs/core/gradient-boosted-trees.ipynb @@ -71,7 +71,7 @@ "import seaborn as sns\n", "import sklearn.datasets as sk_datasets\n", "\n", - "import random_tree_models.decisiontree as dtree\n", + "from random_tree_models.decisiontree.visualize import show_tree\n", "import random_tree_models.gradientboostedtrees as gbtree\n", "from random_tree_models.scoring import MetricNames" ] @@ -160,7 +160,7 @@ "metadata": {}, "outputs": [], "source": [ - "dtree.show_tree(model.trees_[0])" + "show_tree(model.trees_[0])" ] }, { @@ -256,7 +256,7 @@ "metadata": {}, "outputs": [], "source": [ - "dtree.show_tree(model.trees_[0])" + "show_tree(model.trees_[0])" ] }, { diff --git a/nbs/core/isolation-forest.ipynb b/nbs/core/isolation-forest.ipynb index 9c2c12b..5ced2af 100644 --- a/nbs/core/isolation-forest.ipynb +++ b/nbs/core/isolation-forest.ipynb @@ -56,9 +56,9 @@ "import seaborn as sns\n", "import sklearn.datasets as sk_datasets\n", "\n", - "import random_tree_models.decisiontree as dtree\n", + "from random_tree_models.decisiontree.visualize import show_tree\n", "import random_tree_models.isolationforest as iforest\n", - "from random_tree_models.utils import ColumnSelectionMethod, ThresholdSelectionMethod" + "from random_tree_models.params import ColumnSelectionMethod, ThresholdSelectionMethod" ] }, { @@ -111,7 +111,6 @@ "source": [ "frac_subsamples = 2 / 3\n", "frac_features = 1 # math.sqrt(X.shape[1]) / X.shape[1]\n", - "frac_subsamples, frac_features, X.shape[1]\n", "\n", "# threshold_method = ThresholdSelectionMethod.uniform # selects a random threshold from the linear space between the min and max values in X\n", "threshold_method = (\n", @@ -167,7 +166,7 @@ "metadata": {}, "outputs": [], "source": [ - "dtree.show_tree(model.trees_[0])" + "show_tree(model.trees_[0])" ] }, { diff --git a/nbs/core/random-forest.ipynb b/nbs/core/random-forest.ipynb index 39d9773..9f5448b 100644 --- a/nbs/core/random-forest.ipynb +++ b/nbs/core/random-forest.ipynb @@ -41,9 +41,9 @@ "import seaborn as sns\n", "import sklearn.datasets as sk_datasets\n", "\n", - "import random_tree_models.decisiontree as dtree\n", + "from random_tree_models.decisiontree.visualize import show_tree\n", "import random_tree_models.randomforest as rf\n", - "from random_tree_models.scoring import MetricNames" + "from random_tree_models.params import MetricNames" ] }, { @@ -135,7 +135,7 @@ "metadata": {}, "outputs": [], "source": [ - "dtree.show_tree(model.trees_[0])" + "show_tree(model.trees_[0])" ] }, { @@ -238,7 +238,7 @@ "metadata": {}, "outputs": [], "source": [ - "dtree.show_tree(model.trees_[0])" + "show_tree(model.trees_[0])" ] }, { diff --git a/nbs/core/robust-random-cut-forest.ipynb b/nbs/core/robust-random-cut-forest.ipynb index 52ac9ae..034230f 100644 --- a/nbs/core/robust-random-cut-forest.ipynb +++ b/nbs/core/robust-random-cut-forest.ipynb @@ -55,9 +55,9 @@ "import seaborn as sns\n", "import sklearn.datasets as sk_datasets\n", "\n", - "import random_tree_models.decisiontree as dtree\n", + "from random_tree_models.decisiontree.visualize import show_tree\n", "import random_tree_models.isolationforest as iforest\n", - "from random_tree_models.utils import ColumnSelectionMethod, ThresholdSelectionMethod" + "from random_tree_models.params import ColumnSelectionMethod, ThresholdSelectionMethod" ] }, { @@ -164,7 +164,7 @@ "metadata": {}, "outputs": [], "source": [ - "dtree.show_tree(model.trees_[0])" + "show_tree(model.trees_[0])" ] }, { diff --git a/nbs/core/xgboost.ipynb b/nbs/core/xgboost.ipynb index 8c8541f..4813e46 100644 --- a/nbs/core/xgboost.ipynb +++ b/nbs/core/xgboost.ipynb @@ -88,7 +88,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "$$ \\text{loss}^{(t)} = \\sum_{i=1}^n l(y_i, \\hat{y}^{(t-1)}_i + f^{(t)}(x_i) ) + \\Omega \\left( f^{(t)} \\right) $$ " + "$$ \\text{loss} ^ {(t)} = \\sum_{i=1}^n l(y_i, \\hat{y}^{(t-1)}_i + f^{(t)}(x_i) ) + \\Omega \\left( f^{(t)} \\right) $$" ] }, { @@ -97,7 +97,7 @@ "metadata": {}, "source": [ "For the regularization the authors use\n", - "$$ \\Omega (f_t) = \\gamma N^{(t)}_\\text{leafs} + \\frac{1}{2} \\lambda \\sum^{N^{(t)}_\\text{leafs}}_j w_j^2$$\n", + "$$ \\Omega (f_t) = \\gamma N^{(t)}_\\text{leafs} + \\frac{1}{2} \\lambda \\sum^{N^{(t)}_\\text{leafs}}_j w_j^2 $$\n", "\n", "where $\\gamma$ is some constant and $w_j$ is a leaf weight (seems like the $\\gamma_{jm}$ from Friedman et al. but isn't clarified)" ] @@ -284,10 +284,9 @@ "import sklearn.datasets as sk_datasets\n", "from scipy import stats\n", "\n", - "import random_tree_models.decisiontree as dtree\n", - "import random_tree_models.gradientboostedtrees as gbtree\n", + "from random_tree_models.decisiontree.visualize import show_tree\n", "import random_tree_models.xgboost as xgboost\n", - "from random_tree_models.scoring import MetricNames" + "from random_tree_models.params import MetricNames" ] }, { @@ -355,7 +354,7 @@ "outputs": [], "source": [ "model = xgboost.XGBoostClassifier(\n", - " measure_name=\"xgboost\", max_depth=2, n_trees=3, lam=0.0\n", + " measure_name=MetricNames.xgboost, max_depth=2, n_trees=3, lam=0.0\n", ")" ] }, @@ -374,7 +373,7 @@ "metadata": {}, "outputs": [], "source": [ - "dtree.show_tree(model.trees_[0])" + "show_tree(model.trees_[0])" ] }, { @@ -449,7 +448,7 @@ "outputs": [], "source": [ "model = xgboost.XGBoostRegressor(\n", - " measure_name=\"xgboost\", max_depth=2, n_trees=3, lam=0.0\n", + " measure_name=MetricNames.xgboost, max_depth=2, n_trees=3, lam=0.0\n", ")" ] }, @@ -468,7 +467,7 @@ "metadata": {}, "outputs": [], "source": [ - "dtree.show_tree(model.trees_[0])" + "show_tree(model.trees_[0])" ] }, { @@ -610,7 +609,7 @@ "metadata": {}, "outputs": [], "source": [ - "dtree.show_tree(model.trees_[0])" + "show_tree(model.trees_[0])" ] }, { @@ -719,7 +718,7 @@ "metadata": {}, "outputs": [], "source": [ - "dtree.show_tree(model.trees_[0])" + "show_tree(model.trees_[0])" ] }, { diff --git a/nbs/dev/xgboost-profiling-histogramming-yay-or-nay.ipynb b/nbs/dev/xgboost-profiling-histogramming-yay-or-nay.ipynb index 899d2ed..2b3ddf7 100644 --- a/nbs/dev/xgboost-profiling-histogramming-yay-or-nay.ipynb +++ b/nbs/dev/xgboost-profiling-histogramming-yay-or-nay.ipynb @@ -165,9 +165,7 @@ "metadata": {}, "outputs": [], "source": [ - "execution_stats_reg_vanilla = get_class_stats(\n", - " False, None, n_samples_arr, n_features_arr\n", - ")" + "execution_stats_reg_vanilla = get_class_stats(False, 256, n_samples_arr, n_features_arr)" ] }, { @@ -363,9 +361,7 @@ "metadata": {}, "outputs": [], "source": [ - "execution_stats_class_vanilla = get_reg_stats(\n", - " False, None, n_samples_arr, n_features_arr\n", - ")" + "execution_stats_class_vanilla = get_reg_stats(False, 256, n_samples_arr, n_features_arr)" ] }, { @@ -499,7 +495,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.11" + "version": "3.13.3" } }, "nbformat": 4, diff --git a/pyproject.toml b/pyproject.toml index 494de84..f370462 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,8 @@ build-backend = "maturin" [dependency-groups] test = [ "pytest>=7.3.1", + "dirty-equals>=0.9.0", + "inline-snapshot>=0.27.2", ] nb = [ "ipywidgets>=8.0.6", @@ -44,6 +46,6 @@ dev = [ "snakeviz>=2.2.0", "pip-audit>=2.9.0", "pytest-cov>=6.2.1", - {include-group = "nb"}, - {include-group = "test"}, + {include-group = "nb"}, + {include-group = "test"}, ] diff --git a/src/random_tree_models/decisiontree.py b/src/random_tree_models/decisiontree.py deleted file mode 100644 index 9e9bc13..0000000 --- a/src/random_tree_models/decisiontree.py +++ /dev/null @@ -1,829 +0,0 @@ -import typing as T -import uuid - -import numpy as np -import sklearn.base as base -from pydantic import ( - ConfigDict, - Field, - StrictBool, - StrictFloat, - StrictInt, - StrictStr, -) -from pydantic.dataclasses import dataclass -from rich import print as rprint -from rich.tree import Tree -from sklearn.utils.multiclass import check_classification_targets, type_of_target -from sklearn.utils.validation import ( - check_is_fitted, - validate_data, # type: ignore -) - -import random_tree_models.leafweights as leafweights -import random_tree_models.scoring as scoring -import random_tree_models.utils as utils -from random_tree_models.scoring import MetricNames - -logger = utils.logger - - -@dataclass(validate_on_init=True) -class SplitScore: - name: StrictStr # name of the score used - value: StrictFloat | None = None # optimization value gini etc - - -@dataclass -class Node: - """Decision node in a decision tree""" - - # Stuff for making a decision - array_column: StrictInt | None = None # index of the column to use - threshold: float | None = None # threshold for decision - prediction: float | None = None # value to use for predictions - default_is_left: bool | None = None # default direction is x is nan - - # decendants - right: "Node | None" = None # right decendany of type Node - left: "Node | None" = None # left decendant of type Node - - # misc info - measure: SplitScore | None = None - - n_obs: StrictInt | None = None # number of observations in node - reason: StrictStr | None = None # place for some comment - - depth: StrictInt | None = None # depth of the node - - def __post_init__(self): - # unique identifier of the node - self.node_id = uuid.uuid4() - - @property - def is_leaf(self) -> bool: - return self.left is None and self.right is None - - -def check_is_baselevel(y: np.ndarray, depth: int, max_depth: int) -> T.Tuple[bool, str]: - """Verifies if the tree traversal reached the baselevel / a leaf - * group homogeneous / cannot sensibly be splitted further - * no data in the group - * max depth reached - """ - if max_depth is not None and depth >= max_depth: - return (True, "max depth reached") - elif len(np.unique(y)) == 1: - return (True, "homogenous group") - elif len(y) <= 1: - return (True, "<= 1 data point in group") - else: - return (False, "") - - -@dataclass(config=ConfigDict(arbitrary_types_allowed=True)) -class BestSplit: - score: StrictFloat - column: StrictInt - threshold: StrictFloat - target_groups: np.ndarray = Field(default_factory=lambda: np.zeros(10)) - default_is_left: StrictBool | None = None - - -def select_thresholds( - feature_values: np.ndarray, - threshold_params: utils.ThresholdSelectionParameters, - rng: np.random.RandomState, -) -> np.ndarray: - "Selects thresholds to use for splitting" - - method = threshold_params.method - n_thresholds = threshold_params.n_thresholds - num_quantile_steps = threshold_params.num_quantile_steps - - if method == utils.ThresholdSelectionMethod.bruteforce: - return feature_values[1:] - elif method == utils.ThresholdSelectionMethod.random: - if len(feature_values) - 1 <= n_thresholds: - return feature_values[1:] - else: - return rng.choice( - feature_values[1:], - size=(n_thresholds,), - replace=False, - ) - elif method == utils.ThresholdSelectionMethod.quantile: - qs = np.linspace(0, 1, num_quantile_steps) - return np.quantile(feature_values[1:], qs) - elif method == utils.ThresholdSelectionMethod.uniform: - x = np.linspace( - feature_values.min(), - feature_values.max(), - n_thresholds + 2, - ) - return rng.choice(x[1:], size=[1]) - else: - raise NotImplementedError(f"Unknown threshold selection method: {method}") - - -def get_thresholds_and_target_groups( - feature_values: np.ndarray, - threshold_params: utils.ThresholdSelectionParameters, - rng: np.random.RandomState, -) -> T.Generator[T.Tuple[np.ndarray, np.ndarray, bool | None], None, None]: - "Creates a generator for split finding, returning the used threshold, the target groups and a bool indicating if the default direction is left" - is_missing = np.isnan(feature_values) - is_finite = np.logical_not(is_missing) - all_finite = is_finite.all() - - if all_finite: - default_direction_is_left = None - thresholds = select_thresholds(feature_values, threshold_params, rng) - - for threshold in thresholds: - target_groups = feature_values < threshold - yield (threshold, target_groups, default_direction_is_left) - else: - finite_feature_values = feature_values[is_finite] - thresholds = select_thresholds(finite_feature_values, threshold_params, rng) - - for threshold in thresholds: - # default direction left - feature value <= threshold or missing (i.e. missing are included left of the threshold) - target_groups = np.logical_or(feature_values < threshold, is_missing) - yield (threshold, target_groups, True) - - # default direction right - feature value <= threshold and finite (i.e. missing are included right of the threshold) - target_groups = np.logical_and(feature_values < threshold, is_finite) - yield (threshold, target_groups, False) - - -def get_column( - X: np.ndarray, - column_params: utils.ColumnSelectionParameters, - rng: np.random.RandomState, -) -> list[int]: - # select column order to split on - method = column_params.method - n_columns_to_try = column_params.n_trials - - columns = list(range(X.shape[1])) - if method == utils.ColumnSelectionMethod.ascending: - pass - elif method == utils.ColumnSelectionMethod.random: - columns = np.array(columns) - rng.shuffle(columns) - columns = columns.tolist() - elif method == utils.ColumnSelectionMethod.largest_delta: - deltas = X.max(axis=0) - X.min(axis=0) - weights = deltas / deltas.sum() - columns = np.array(columns) - columns = rng.choice(columns, p=weights, size=len(columns), replace=False) - columns = columns.tolist() - else: - raise NotImplementedError( - f"Unknown column selection method: {column_params.method}" - ) - if n_columns_to_try is not None: - columns = columns[:n_columns_to_try] - - return columns - - -def find_best_split( - X: np.ndarray, - y: np.ndarray, - measure_name: str, - yhat: np.ndarray | None = None, - g: np.ndarray | None = None, - h: np.ndarray | None = None, - growth_params: utils.TreeGrowthParameters | None = None, # TODO: make required - rng: np.random.RandomState = np.random.RandomState(42), -) -> BestSplit: - """Find the best split, detecting the "default direction" with missing data.""" - - if len(np.unique(y)) == 1: - raise ValueError( - f"Tried to find a split for homogenous y: {y[:3]} ... {y[-3:]}" - ) - - best = None # this will be an BestSplit instance - - if growth_params is None: - raise ValueError(f"{growth_params=} but is not allowed to be None") - - for array_column in get_column(X, growth_params.column_params, rng): - feature_values = X[:, array_column] - - for ( - threshold, - target_groups, - default_is_left, - ) in get_thresholds_and_target_groups( - feature_values, growth_params.threshold_params, rng - ): - split_score = scoring.calc_split_score( - scoring.MetricNames(measure_name), - y, - target_groups, - yhat=yhat, - g=g, - h=h, - growth_params=growth_params, - ) - - if best is None or split_score > best.score: - best = BestSplit( - score=float(split_score), - column=int(array_column), - threshold=float(threshold), - target_groups=target_groups, - default_is_left=default_is_left, - ) - - if best is None: - raise ValueError(f"Something went wrong {best=} cannot be None.") - return best - - -def check_if_split_sensible( - best: BestSplit, - parent_node: Node | None, - growth_params: utils.TreeGrowthParameters, -) -> tuple[bool, float | None]: - "Verifies if split is sensible, considering score gain and left/right group sizes" - parent_is_none = parent_node is None - if parent_is_none: - return False, None - - measure_is_none = parent_node.measure is None - if measure_is_none: - return False, None - - value_is_none = parent_node.measure.value is None # type: ignore - if value_is_none: - return False, None - - # score gain - gain = best.score - parent_node.measure.value # type: ignore - is_insufficient_gain = gain < growth_params.min_improvement - - # left/right group assignment - all_on_one_side = bool(best.target_groups.all()) - all_on_other_side = bool(np.logical_not(best.target_groups).all()) - is_all_onesided = all_on_one_side or all_on_other_side - - is_not_sensible = is_all_onesided or is_insufficient_gain - - return is_not_sensible, gain - - -def calc_leaf_weight_and_split_score( - y: np.ndarray, - measure_name: scoring.MetricNames, - growth_params: utils.TreeGrowthParameters, - g: np.ndarray | None = None, - h: np.ndarray | None = None, -) -> tuple[float | None, float]: - leaf_weight = leafweights.calc_leaf_weight(y, measure_name, growth_params, g=g, h=h) - - yhat = leaf_weight * np.ones_like(y) - score = scoring.calc_split_score( - measure_name, - y, - np.ones_like(y, dtype=bool), - yhat=yhat, - g=g, - h=h, - growth_params=growth_params, - ) - - return leaf_weight, score - - -def select_arrays_for_child_node( - go_left: bool, - best: BestSplit, - X: np.ndarray, - y: np.ndarray, - g: np.ndarray | None = None, - h: np.ndarray | None = None, -) -> tuple[np.ndarray, np.ndarray, np.ndarray | None, np.ndarray | None]: - mask = best.target_groups == go_left - _X = X[mask, :] - _y = y[mask] - _g = g[mask] if g is not None else None - _h = h[mask] if h is not None else None - return _X, _y, _g, _h - - -def grow_tree( - X: np.ndarray, - y: np.ndarray, - measure_name: MetricNames, - growth_params: utils.TreeGrowthParameters, - parent_node: Node | None = None, - depth: int = 0, - g: np.ndarray | None = None, - h: np.ndarray | None = None, - random_state: int = 42, - **kwargs, -) -> Node: - """Implementation of the Classification And Regression Tree (CART) algorithm - - Args: - X (np.ndarray): Input feature values to do thresholding on. - y (np.ndarray): Target values. - measure_name (str): Values indicating which functions in scoring.SplitScoreMetrics and leafweights.LeafWeightSchemes to call. - parent_node (Node, optional): Parent node in tree. Defaults to None. - depth (int, optional): Current tree depth. Defaults to 0. - growth_params (utils.TreeGrowthParameters, optional): Parameters controlling tree growth. Defaults to None. - g (np.ndarray, optional): Boosting and loss specific precomputed 1st order derivative dloss/dyhat. Defaults to None. - h (np.ndarray, optional): Boosting and loss specific precomputed 2nd order derivative d^2loss/dyhat^2. Defaults to None. - - Raises: - ValueError: Fails if parent node passes an empty y array. - - Returns: - Node: Tree node with leaf weight, node score and potential child nodes. - - Note: - Currently measure_name controls how the split score and the leaf weights are computed. - - But only the decision tree algorithm directly uses y for that and can predict y using the leaf weight values directly. - - For the boosting algorithms g and h are used to compute split score and leaf weights. Their leaf weights - sometimes also need post-processing, e.g. for binary classification. Computation of g and h and post-processing is not - done here but in the respective class implementations of the algorithms. - """ - - n_obs = len(y) - if n_obs == 0: - raise ValueError( - f"Something went wrong. {parent_node=} handed down an empty set of data points." - ) - - is_baselevel, reason = check_is_baselevel( - y, depth, max_depth=growth_params.max_depth - ) - if parent_node is None: - scoring.reset_incrementing_score() - - # compute leaf weight (for prediction) and node score (for split gain check) - leaf_weight, score = calc_leaf_weight_and_split_score( - y, measure_name, growth_params, g, h - ) - - if is_baselevel: # end of the line buddy - return Node( - prediction=leaf_weight, - measure=SplitScore(measure_name, value=score), - n_obs=n_obs, - reason=reason, - depth=depth, - ) - - # find best split - rng = np.random.RandomState(random_state) - - best = find_best_split( - X, y, measure_name, g=g, h=h, growth_params=growth_params, rng=rng - ) - - # check if improvement due to split is below minimum requirement - is_not_sensible_split, gain = check_if_split_sensible( - best, parent_node, growth_params - ) - - if is_not_sensible_split: - reason = f"gain due split ({gain=}) lower than {growth_params.min_improvement=} or all data points assigned to one side (is left {best.target_groups.mean()=:.2%})" - leaf_node = Node( - prediction=leaf_weight, - measure=SplitScore(measure_name, value=score), - n_obs=n_obs, - reason=reason, - depth=depth, - ) - return leaf_node - - # create new parent node for subsequent child nodes - new_node = Node( - array_column=best.column, - threshold=best.threshold, - prediction=leaf_weight, - default_is_left=best.default_is_left, - measure=SplitScore(measure_name, best.score), - n_obs=n_obs, - reason="", - depth=depth, - ) - random_state_left, random_state_right = rng.randint(0, 2**32, size=2) - - # descend left - _X, _y, _g, _h = select_arrays_for_child_node(True, best, X, y, g, h) - new_node.left = grow_tree( - _X, - _y, - measure_name=measure_name, - growth_params=growth_params, - parent_node=new_node, - depth=depth + 1, - g=_g, - h=_h, - random_state=random_state_left, - ) - - # descend right - _X, _y, _g, _h = select_arrays_for_child_node(False, best, X, y, g, h) - new_node.right = grow_tree( - _X, - _y, - measure_name=measure_name, - growth_params=growth_params, - parent_node=new_node, - depth=depth + 1, - g=_g, - h=_h, - random_state=random_state_right, - ) - - return new_node - - -def find_leaf_node(node: Node, x: np.ndarray) -> Node: - "Traverses tree to find the leaf corresponding to x" - - if node.is_leaf: - return node - - is_missing = np.isnan(x[node.array_column]) - if is_missing: - go_left = node.default_is_left - if go_left is None: - raise ValueError( - f"{x[node.array_column]=} is missing but was not observed as a feature that can be missing during training." - ) - else: - go_left = x[node.array_column] < node.threshold - - if go_left: - if node.left is not None: - node = find_leaf_node(node.left, x) - else: - raise ValueError(f"Oddly tried to access node.left even though it is None.") - else: - if node.right is not None: - node = find_leaf_node(node.right, x) - else: - raise ValueError( - f"Oddly tried to access node.right even though it is None." - ) - - return node - - -def predict_with_tree(tree: Node, X: np.ndarray) -> np.ndarray: - "Traverse a previously built tree to make one prediction per row in X" - if not isinstance(tree, Node): - raise ValueError( - f"Passed `tree` needs to be an instantiation of Node, got {tree=}" - ) - n_obs = len(X) - predictions = [] - - for i in range(n_obs): - leaf_node = find_leaf_node(tree, X[i, :]) - - predictions.append(leaf_node.prediction) - - predictions = np.array(predictions) - return predictions - - -class DecisionTreeTemplate(base.BaseEstimator): - """Template for DecisionTree classes - - Based on: https://scikit-learn.org/stable/developers/develop.html#rolling-your-own-estimator - """ - - max_depth: int - measure_name: scoring.MetricNames - min_improvement: float - lam: float - frac_subsamples: float - frac_features: float - random_state: int - threshold_method: utils.ThresholdSelectionMethod - threshold_quantile: float - n_thresholds: int - column_method: utils.ColumnSelectionMethod - n_columns_to_try: int | None - ensure_all_finite: bool - tree_: Node - - def __init__( - self, - measure_name: scoring.MetricNames, - max_depth: int = 2, - min_improvement: float = 0.0, - lam: float = 0.0, - frac_subsamples: float = 1.0, - frac_features: float = 1.0, - threshold_method: utils.ThresholdSelectionMethod = utils.ThresholdSelectionMethod.bruteforce, - threshold_quantile: float = 0.1, - n_thresholds: int = 100, - column_method: utils.ColumnSelectionMethod = utils.ColumnSelectionMethod.ascending, - n_columns_to_try: int | None = None, - random_state: int = 42, - ensure_all_finite: bool = True, - ) -> None: - self.max_depth = max_depth - self.measure_name = measure_name - self.min_improvement = min_improvement - self.lam = lam - self.frac_subsamples = frac_subsamples - self.frac_features = frac_features - self.random_state = random_state - self.threshold_method = threshold_method - self.threshold_quantile = threshold_quantile - self.n_thresholds = n_thresholds - self.column_method = column_method - self.n_columns_to_try = n_columns_to_try - self.ensure_all_finite = ensure_all_finite - - def _organize_growth_parameters(self): - self.growth_params_ = utils.TreeGrowthParameters( - max_depth=self.max_depth, - min_improvement=self.min_improvement, - lam=-abs(self.lam), - frac_subsamples=float(self.frac_subsamples), - frac_features=float(self.frac_features), - random_state=int(self.random_state), - threshold_params=utils.ThresholdSelectionParameters( - method=self.threshold_method, - quantile=self.threshold_quantile, - n_thresholds=self.n_thresholds, - random_state=int(self.random_state), - ), - column_params=utils.ColumnSelectionParameters( - method=self.column_method, - n_trials=self.n_columns_to_try, - ), - ) - - def _select_samples_and_features( - self, X: np.ndarray, y: np.ndarray - ) -> T.Tuple[np.ndarray, np.ndarray, np.ndarray]: - "Sub-samples rows and columns from X and y" - if not hasattr(self, "growth_params_"): - raise ValueError(f"Try calling `fit` first.") - - ix = np.arange(len(X)) - rng = np.random.RandomState(self.growth_params_.random_state) - if self.growth_params_.frac_subsamples < 1.0: - n_samples = int(self.growth_params_.frac_subsamples * len(X)) - ix_samples = rng.choice(ix, size=n_samples, replace=False) - else: - ix_samples = ix - - if self.frac_features < 1.0: - n_columns = int(X.shape[1] * self.frac_features) - ix_features = rng.choice( - np.arange(X.shape[1]), - size=n_columns, - replace=False, - ) - else: - ix_features = np.arange(X.shape[1]) - - _X = X[ix_samples, :] - _X = _X[:, ix_features] - - _y = y[ix_samples] - return _X, _y, ix_features - - def _select_features(self, X: np.ndarray, ix_features: np.ndarray) -> np.ndarray: - return X[:, ix_features] - - def fit( - self, - X: np.ndarray, - y: np.ndarray, - ) -> "DecisionTreeTemplate": - raise NotImplementedError() - - def predict(self, X: np.ndarray) -> np.ndarray: - raise NotImplementedError() - - -class DecisionTreeRegressor(base.RegressorMixin, DecisionTreeTemplate): - """DecisionTreeRegressor - - Based on: https://scikit-learn.org/stable/developers/develop.html#rolling-your-own-estimator - """ - - def __init__( - self, - measure_name: MetricNames = MetricNames.variance, - max_depth: int = 2, - min_improvement: float = 0.0, - lam: float = 0.0, - frac_subsamples: float = 1.0, - frac_features: float = 1.0, - threshold_method: utils.ThresholdSelectionMethod = utils.ThresholdSelectionMethod.bruteforce, - threshold_quantile: float = 0.1, - n_thresholds: int = 100, - column_method: utils.ColumnSelectionMethod = utils.ColumnSelectionMethod.ascending, - n_columns_to_try: int | None = None, - random_state: int = 42, - ensure_all_finite: bool = True, - ) -> None: - super().__init__( - measure_name=measure_name, - max_depth=max_depth, - min_improvement=min_improvement, - lam=lam, - frac_subsamples=frac_subsamples, - frac_features=frac_features, - threshold_method=threshold_method, - threshold_quantile=threshold_quantile, - n_thresholds=n_thresholds, - column_method=column_method, - n_columns_to_try=n_columns_to_try, - random_state=random_state, - ensure_all_finite=ensure_all_finite, - ) - - def fit( - self, - X: np.ndarray, - y: np.ndarray, - **kwargs, - ) -> "DecisionTreeRegressor": - self._organize_growth_parameters() - - X, y = validate_data(self, X, y, ensure_all_finite=False) - - _X, _y, self.ix_features_ = self._select_samples_and_features(X, y) - - self.tree_ = grow_tree( - _X, - _y, - measure_name=self.measure_name, - growth_params=self.growth_params_, - random_state=self.random_state, - **kwargs, - ) - - return self - - def predict(self, X: np.ndarray) -> np.ndarray: - check_is_fitted(self, ("tree_", "growth_params_")) - - X = validate_data(self, X, reset=False, ensure_all_finite=False) - - _X = self._select_features(X, self.ix_features_) - - y = predict_with_tree(self.tree_, _X) - - return y - - -class DecisionTreeClassifier(base.ClassifierMixin, DecisionTreeTemplate): - """DecisionTreeClassifier - - Based on: https://scikit-learn.org/stable/developers/develop.html#rolling-your-own-estimator - """ - - def __init__( - self, - measure_name: MetricNames = MetricNames.gini, - max_depth: int = 2, - min_improvement: float = 0.0, - lam: float = 0.0, - frac_subsamples: float = 1.0, - frac_features: float = 1.0, - threshold_method: utils.ThresholdSelectionMethod = utils.ThresholdSelectionMethod.bruteforce, - threshold_quantile: float = 0.1, - n_thresholds: int = 100, - column_method: utils.ColumnSelectionMethod = utils.ColumnSelectionMethod.ascending, - n_columns_to_try: int | None = None, - random_state: int = 42, - ensure_all_finite: bool = True, - ) -> None: - super().__init__( - measure_name=measure_name, - max_depth=max_depth, - min_improvement=min_improvement, - lam=lam, - frac_subsamples=frac_subsamples, - frac_features=frac_features, - threshold_method=threshold_method, - threshold_quantile=threshold_quantile, - n_thresholds=n_thresholds, - column_method=column_method, - n_columns_to_try=n_columns_to_try, - random_state=random_state, - ) - self.ensure_all_finite = ensure_all_finite - - def _more_tags(self) -> T.Dict[str, bool]: - """Describes to scikit-learn parametrize_with_checks the scope of this class - - Reference: https://scikit-learn.org/stable/developers/develop.html#estimator-tags - """ - return {"binary_only": True} - - def __sklearn_tags__(self): - # https://scikit-learn.org/stable/developers/develop.html - tags = super().__sklearn_tags__() # type: ignore - tags.classifier_tags.multi_class = False - return tags - - def fit( - self, - X: np.ndarray, - y: np.ndarray, - ) -> "DecisionTreeClassifier": - X, y = validate_data(self, X, y, ensure_all_finite=False) - - check_classification_targets(y) - - y_type = type_of_target(y, input_name="y", raise_unknown=True) # type: ignore - if y_type != "binary": - raise ValueError( - "Only binary classification is supported. The type of the target " - f"is {y_type}." - ) - - if len(np.unique(y)) == 1: - raise ValueError("Cannot train with only one class present") - - self._organize_growth_parameters() - - self.classes_, y = np.unique(y, return_inverse=True) - - _X, _y, self.ix_features_ = self._select_samples_and_features(X, y) - - self.tree_ = grow_tree( - _X, - _y, - measure_name=self.measure_name, - growth_params=self.growth_params_, - random_state=self.random_state, - ) - - return self - - def predict_proba(self, X: np.ndarray) -> np.ndarray: - check_is_fitted(self, ("tree_", "classes_", "growth_params_")) - X = validate_data(self, X, reset=False, ensure_all_finite=False) - - _X = self._select_features(X, self.ix_features_) - - proba = predict_with_tree(self.tree_, _X) - proba = np.array([1 - proba, proba]).T - return proba - - def predict(self, X: np.ndarray) -> np.ndarray: - proba = self.predict_proba(X) - ix = np.argmax(proba, axis=1) - y = self.classes_[ix] - - return y - - -def walk_tree( - decision_tree: Node, - tree: Tree, - parent: Node | None = None, - is_left: bool | None = None, -): - arrow = ( - "" - if parent is None - else f"[magenta](< {parent.threshold:.3f})[/magenta]" - if is_left - else f"[magenta](>= {parent.threshold:.3f})[/magenta]" - ) - - if decision_tree.is_leaf: # base cases - branch = tree.add( - f"{arrow} 🍁 # obs: [cyan]{decision_tree.n_obs}[/cyan], value: [green]{decision_tree.prediction:.3f}[/green], leaf reason '{decision_tree.reason}'" - ) - return None - else: - branch = tree.add( - f"{arrow} col idx: {decision_tree.array_column}, threshold: [magenta]{decision_tree.threshold:.3f}[/magenta]" - ) - - if decision_tree.left is not None: # go left - walk_tree(decision_tree.left, branch, decision_tree, True) - - if decision_tree.right is not None: # go right - walk_tree(decision_tree.right, branch, decision_tree, False) - - -def show_tree(decision_tree: DecisionTreeTemplate): - tree = Tree(f"Represenation of 🌲 ({decision_tree})") - walk_tree(decision_tree.tree_, tree) - rprint(tree) diff --git a/src/random_tree_models/decisiontree/__init__.py b/src/random_tree_models/decisiontree/__init__.py new file mode 100644 index 0000000..a33a9a4 --- /dev/null +++ b/src/random_tree_models/decisiontree/__init__.py @@ -0,0 +1,11 @@ +from .estimators import ( + DecisionTreeClassifier as DecisionTreeClassifier, +) +from .estimators import ( + DecisionTreeRegressor as DecisionTreeRegressor, +) +from .estimators import ( + DecisionTreeTemplate as DecisionTreeTemplate, +) +from .predict import find_leaf_node as find_leaf_node +from .train import grow_tree as grow_tree diff --git a/src/random_tree_models/decisiontree/estimators.py b/src/random_tree_models/decisiontree/estimators.py new file mode 100644 index 0000000..87c8b2c --- /dev/null +++ b/src/random_tree_models/decisiontree/estimators.py @@ -0,0 +1,305 @@ +import typing as T + +import numpy as np +import sklearn.base as base +from sklearn.utils.multiclass import check_classification_targets, type_of_target +from sklearn.utils.validation import check_is_fitted, validate_data # type: ignore + +import random_tree_models.params +from random_tree_models.decisiontree.node import Node +from random_tree_models.decisiontree.predict import predict_with_tree +from random_tree_models.decisiontree.train import grow_tree +from random_tree_models.params import MetricNames + + +class DecisionTreeTemplate(base.BaseEstimator): + """Template for DecisionTree classes + + Based on: https://scikit-learn.org/stable/developers/develop.html#rolling-your-own-estimator + """ + + max_depth: int + measure_name: random_tree_models.params.MetricNames + min_improvement: float + lam: float + frac_subsamples: float + frac_features: float + random_state: int + threshold_method: random_tree_models.params.ThresholdSelectionMethod + threshold_quantile: float + n_thresholds: int + column_method: random_tree_models.params.ColumnSelectionMethod + n_columns_to_try: int | None + ensure_all_finite: bool + tree_: Node + + def __init__( + self, + measure_name: random_tree_models.params.MetricNames, + max_depth: int = 2, + min_improvement: float = 0.0, + lam: float = 0.0, + frac_subsamples: float = 1.0, + frac_features: float = 1.0, + threshold_method: random_tree_models.params.ThresholdSelectionMethod = random_tree_models.params.ThresholdSelectionMethod.bruteforce, + threshold_quantile: float = 0.1, + n_thresholds: int = 100, + column_method: random_tree_models.params.ColumnSelectionMethod = random_tree_models.params.ColumnSelectionMethod.ascending, + n_columns_to_try: int | None = None, + random_state: int = 42, + ensure_all_finite: bool = True, + ) -> None: + self.max_depth = max_depth + self.measure_name = measure_name + self.min_improvement = min_improvement + self.lam = lam + self.frac_subsamples = frac_subsamples + self.frac_features = frac_features + self.random_state = random_state + self.threshold_method = threshold_method + self.threshold_quantile = threshold_quantile + self.n_thresholds = n_thresholds + self.column_method = column_method + self.n_columns_to_try = n_columns_to_try + self.ensure_all_finite = ensure_all_finite + + def _organize_growth_parameters(self): + self.growth_params_ = random_tree_models.params.TreeGrowthParameters( + max_depth=self.max_depth, + min_improvement=self.min_improvement, + lam=-abs(self.lam), + frac_subsamples=float(self.frac_subsamples), + frac_features=float(self.frac_features), + random_state=int(self.random_state), + threshold_params=random_tree_models.params.ThresholdSelectionParameters( + method=self.threshold_method, + quantile=self.threshold_quantile, + n_thresholds=self.n_thresholds, + random_state=int(self.random_state), + ), + column_params=random_tree_models.params.ColumnSelectionParameters( + method=self.column_method, + n_trials=self.n_columns_to_try, + ), + ) + + def _select_samples_and_features( + self, X: np.ndarray, y: np.ndarray + ) -> T.Tuple[np.ndarray, np.ndarray, np.ndarray]: + "Sub-samples rows and columns from X and y" + if not hasattr(self, "growth_params_"): + raise ValueError(f"Try calling `fit` first.") + + ix = np.arange(len(X)) + rng = np.random.RandomState(self.growth_params_.random_state) + if self.growth_params_.frac_subsamples < 1.0: + n_samples = int(self.growth_params_.frac_subsamples * len(X)) + ix_samples = rng.choice(ix, size=n_samples, replace=False) + else: + ix_samples = ix + + if self.frac_features < 1.0: + n_columns = int(X.shape[1] * self.frac_features) + ix_features = rng.choice( + np.arange(X.shape[1]), + size=n_columns, + replace=False, + ) + else: + ix_features = np.arange(X.shape[1]) + + _X = X[ix_samples, :] + _X = _X[:, ix_features] + + _y = y[ix_samples] + return _X, _y, ix_features + + def _select_features(self, X: np.ndarray, ix_features: np.ndarray) -> np.ndarray: + return X[:, ix_features] + + def fit( + self, + X: np.ndarray, + y: np.ndarray, + ) -> "DecisionTreeTemplate": + raise NotImplementedError() + + def predict(self, X: np.ndarray) -> np.ndarray: + raise NotImplementedError() + + +class DecisionTreeRegressor(base.RegressorMixin, DecisionTreeTemplate): + """DecisionTreeRegressor + + Based on: https://scikit-learn.org/stable/developers/develop.html#rolling-your-own-estimator + """ + + def __init__( + self, + measure_name: MetricNames = MetricNames.variance, + max_depth: int = 2, + min_improvement: float = 0.0, + lam: float = 0.0, + frac_subsamples: float = 1.0, + frac_features: float = 1.0, + threshold_method: random_tree_models.params.ThresholdSelectionMethod = random_tree_models.params.ThresholdSelectionMethod.bruteforce, + threshold_quantile: float = 0.1, + n_thresholds: int = 100, + column_method: random_tree_models.params.ColumnSelectionMethod = random_tree_models.params.ColumnSelectionMethod.ascending, + n_columns_to_try: int | None = None, + random_state: int = 42, + ensure_all_finite: bool = True, + ) -> None: + super().__init__( + measure_name=measure_name, + max_depth=max_depth, + min_improvement=min_improvement, + lam=lam, + frac_subsamples=frac_subsamples, + frac_features=frac_features, + threshold_method=threshold_method, + threshold_quantile=threshold_quantile, + n_thresholds=n_thresholds, + column_method=column_method, + n_columns_to_try=n_columns_to_try, + random_state=random_state, + ensure_all_finite=ensure_all_finite, + ) + + def fit( + self, + X: np.ndarray, + y: np.ndarray, + **kwargs, + ) -> "DecisionTreeRegressor": + self._organize_growth_parameters() + + X, y = validate_data(self, X, y, ensure_all_finite=False) + + _X, _y, self.ix_features_ = self._select_samples_and_features(X, y) + + self.tree_ = grow_tree( + _X, + _y, + measure_name=self.measure_name, + growth_params=self.growth_params_, + random_state=self.random_state, + **kwargs, + ) + + return self + + def predict(self, X: np.ndarray) -> np.ndarray: + check_is_fitted(self, ("tree_", "growth_params_")) + + X = validate_data(self, X, reset=False, ensure_all_finite=False) + + _X = self._select_features(X, self.ix_features_) + + y = predict_with_tree(self.tree_, _X) + + return y + + +class DecisionTreeClassifier(base.ClassifierMixin, DecisionTreeTemplate): + """DecisionTreeClassifier + + Based on: https://scikit-learn.org/stable/developers/develop.html#rolling-your-own-estimator + """ + + def __init__( + self, + measure_name: MetricNames = MetricNames.gini, + max_depth: int = 2, + min_improvement: float = 0.0, + lam: float = 0.0, + frac_subsamples: float = 1.0, + frac_features: float = 1.0, + threshold_method: random_tree_models.params.ThresholdSelectionMethod = random_tree_models.params.ThresholdSelectionMethod.bruteforce, + threshold_quantile: float = 0.1, + n_thresholds: int = 100, + column_method: random_tree_models.params.ColumnSelectionMethod = random_tree_models.params.ColumnSelectionMethod.ascending, + n_columns_to_try: int | None = None, + random_state: int = 42, + ensure_all_finite: bool = True, + ) -> None: + super().__init__( + measure_name=measure_name, + max_depth=max_depth, + min_improvement=min_improvement, + lam=lam, + frac_subsamples=frac_subsamples, + frac_features=frac_features, + threshold_method=threshold_method, + threshold_quantile=threshold_quantile, + n_thresholds=n_thresholds, + column_method=column_method, + n_columns_to_try=n_columns_to_try, + random_state=random_state, + ) + self.ensure_all_finite = ensure_all_finite + + def _more_tags(self) -> T.Dict[str, bool]: + """Describes to scikit-learn parametrize_with_checks the scope of this class + + Reference: https://scikit-learn.org/stable/developers/develop.html#estimator-tags + """ + return {"binary_only": True} + + def __sklearn_tags__(self): + # https://scikit-learn.org/stable/developers/develop.html + tags = super().__sklearn_tags__() # type: ignore + tags.classifier_tags.multi_class = False + return tags + + def fit( + self, + X: np.ndarray, + y: np.ndarray, + ) -> "DecisionTreeClassifier": + X, y = validate_data(self, X, y, ensure_all_finite=False) + + check_classification_targets(y) + + y_type = type_of_target(y, input_name="y", raise_unknown=True) # type: ignore + if y_type != "binary": + raise ValueError( + "Only binary classification is supported. The type of the target " + f"is {y_type}." + ) + + if len(np.unique(y)) == 1: + raise ValueError("Cannot train with only one class present") + + self._organize_growth_parameters() + + self.classes_, y = np.unique(y, return_inverse=True) + + _X, _y, self.ix_features_ = self._select_samples_and_features(X, y) + + self.tree_ = grow_tree( + _X, + _y, + measure_name=self.measure_name, + growth_params=self.growth_params_, + random_state=self.random_state, + ) + + return self + + def predict_proba(self, X: np.ndarray) -> np.ndarray: + check_is_fitted(self, ("tree_", "classes_", "growth_params_")) + X = validate_data(self, X, reset=False, ensure_all_finite=False) + + _X = self._select_features(X, self.ix_features_) + + proba = predict_with_tree(self.tree_, _X) + proba = np.array([1 - proba, proba]).T + return proba + + def predict(self, X: np.ndarray) -> np.ndarray: + proba = self.predict_proba(X) + ix = np.argmax(proba, axis=1) + y = self.classes_[ix] + + return y diff --git a/src/random_tree_models/decisiontree/node.py b/src/random_tree_models/decisiontree/node.py new file mode 100644 index 0000000..9489d14 --- /dev/null +++ b/src/random_tree_models/decisiontree/node.py @@ -0,0 +1,37 @@ +import uuid + +from pydantic import StrictInt, StrictStr +from pydantic.dataclasses import dataclass + +from random_tree_models.decisiontree.split_objects import SplitScore + + +@dataclass +class Node: + """Decision node in a decision tree""" + + # Stuff for making a decision + array_column: StrictInt | None = None # index of the column to use + threshold: float | None = None # threshold for decision + prediction: float | None = None # value to use for predictions + default_is_left: bool | None = None # default direction is x is nan + + # decendants + right: "Node | None" = None # right decendany of type Node + left: "Node | None" = None # left decendant of type Node + + # misc info + measure: SplitScore | None = None + + n_obs: StrictInt | None = None # number of observations in node + reason: StrictStr | None = None # place for some comment + + depth: StrictInt | None = None # depth of the node + + def __post_init__(self): + # unique identifier of the node + self.node_id = uuid.uuid4() + + @property + def is_leaf(self) -> bool: + return self.left is None and self.right is None diff --git a/src/random_tree_models/decisiontree/predict.py b/src/random_tree_models/decisiontree/predict.py new file mode 100644 index 0000000..59fac01 --- /dev/null +++ b/src/random_tree_models/decisiontree/predict.py @@ -0,0 +1,53 @@ +import numpy as np + +from random_tree_models.decisiontree.node import Node + + +def find_leaf_node(node: Node, x: np.ndarray) -> Node: + "Traverses tree to find the leaf corresponding to x" + + if node.is_leaf: + return node + + is_missing = np.isnan(x[node.array_column]) + if is_missing: + go_left = node.default_is_left + if go_left is None: + raise ValueError( + f"{x[node.array_column]=} is missing but was not observed as a feature that can be missing during training." + ) + else: + go_left = x[node.array_column] < node.threshold + + if go_left: + if node.left is not None: + node = find_leaf_node(node.left, x) + else: + raise ValueError(f"Oddly tried to access node.left even though it is None.") + else: + if node.right is not None: + node = find_leaf_node(node.right, x) + else: + raise ValueError( + f"Oddly tried to access node.right even though it is None." + ) + + return node + + +def predict_with_tree(tree: Node, X: np.ndarray) -> np.ndarray: + "Traverse a previously built tree to make one prediction per row in X" + if not isinstance(tree, Node): + raise ValueError( + f"Passed `tree` needs to be an instantiation of Node, got {tree=}" + ) + n_obs = len(X) + predictions = [] + + for i in range(n_obs): + leaf_node = find_leaf_node(tree, X[i, :]) + + predictions.append(leaf_node.prediction) + + predictions = np.array(predictions) + return predictions diff --git a/src/random_tree_models/decisiontree/split.py b/src/random_tree_models/decisiontree/split.py new file mode 100644 index 0000000..e79c0df --- /dev/null +++ b/src/random_tree_models/decisiontree/split.py @@ -0,0 +1,212 @@ +import typing as T + +import numpy as np + +import random_tree_models.params +import random_tree_models.scoring as scoring +from random_tree_models.decisiontree.node import Node +from random_tree_models.decisiontree.split_objects import BestSplit + + +def select_thresholds( + feature_values: np.ndarray, + threshold_params: random_tree_models.params.ThresholdSelectionParameters, + rng: np.random.RandomState, +) -> np.ndarray: + "Selects thresholds to use for splitting" + + method = threshold_params.method + n_thresholds = threshold_params.n_thresholds + num_quantile_steps = threshold_params.num_quantile_steps + + if method == random_tree_models.params.ThresholdSelectionMethod.bruteforce: + return feature_values[1:] + elif method == random_tree_models.params.ThresholdSelectionMethod.random: + if len(feature_values) - 1 <= n_thresholds: + return feature_values[1:] + else: + return rng.choice( + feature_values[1:], + size=(n_thresholds,), + replace=False, + ) + elif method == random_tree_models.params.ThresholdSelectionMethod.quantile: + qs = np.linspace(0, 1, num_quantile_steps) + return np.quantile(feature_values[1:], qs) + elif method == random_tree_models.params.ThresholdSelectionMethod.uniform: + x = np.linspace( + feature_values.min(), + feature_values.max(), + n_thresholds + 2, + ) + return rng.choice(x[1:], size=[1]) + else: + raise NotImplementedError(f"Unknown threshold selection method: {method}") + + +def get_thresholds_and_target_groups( + feature_values: np.ndarray, + threshold_params: random_tree_models.params.ThresholdSelectionParameters, + rng: np.random.RandomState, +) -> T.Generator[T.Tuple[np.ndarray, np.ndarray, bool | None], None, None]: + "Creates a generator for split finding, returning the used threshold, the target groups and a bool indicating if the default direction is left" + is_missing = np.isnan(feature_values) + is_finite = np.logical_not(is_missing) + all_finite = is_finite.all() + + if all_finite: + default_direction_is_left = None + thresholds = select_thresholds(feature_values, threshold_params, rng) + + for threshold in thresholds: + target_groups = feature_values < threshold + yield (threshold, target_groups, default_direction_is_left) + else: + finite_feature_values = feature_values[is_finite] + thresholds = select_thresholds(finite_feature_values, threshold_params, rng) + + for threshold in thresholds: + # default direction left - feature value <= threshold or missing (i.e. missing are included left of the threshold) + target_groups = np.logical_or(feature_values < threshold, is_missing) + yield (threshold, target_groups, True) + + # default direction right - feature value <= threshold and finite (i.e. missing are included right of the threshold) + target_groups = np.logical_and(feature_values < threshold, is_finite) + yield (threshold, target_groups, False) + + +def get_column( + X: np.ndarray, + column_params: random_tree_models.params.ColumnSelectionParameters, + rng: np.random.RandomState, +) -> list[int]: + # select column order to split on + method = column_params.method + n_columns_to_try = column_params.n_trials + + columns = list(range(X.shape[1])) + if method == random_tree_models.params.ColumnSelectionMethod.ascending: + pass + elif method == random_tree_models.params.ColumnSelectionMethod.random: + columns = np.array(columns) + rng.shuffle(columns) + columns = columns.tolist() + elif method == random_tree_models.params.ColumnSelectionMethod.largest_delta: + deltas = X.max(axis=0) - X.min(axis=0) + weights = deltas / deltas.sum() + columns = np.array(columns) + columns = rng.choice(columns, p=weights, size=len(columns), replace=False) + columns = columns.tolist() + else: + raise NotImplementedError( + f"Unknown column selection method: {column_params.method}" + ) + if n_columns_to_try is not None: + columns = columns[:n_columns_to_try] + + return columns + + +def find_best_split( + X: np.ndarray, + y: np.ndarray, + measure_name: str, + yhat: np.ndarray | None = None, + g: np.ndarray | None = None, + h: np.ndarray | None = None, + growth_params: random_tree_models.params.TreeGrowthParameters + | None = None, # TODO: make required + rng: np.random.RandomState = np.random.RandomState(42), +) -> BestSplit: + """Find the best split, detecting the "default direction" with missing data.""" + + if len(np.unique(y)) == 1: + raise ValueError( + f"Tried to find a split for homogenous y: {y[:3]} ... {y[-3:]}" + ) + + best = None # this will be an BestSplit instance + + if growth_params is None: + raise ValueError(f"{growth_params=} but is not allowed to be None") + + for array_column in get_column(X, growth_params.column_params, rng): + feature_values = X[:, array_column] + + for ( + threshold, + target_groups, + default_is_left, + ) in get_thresholds_and_target_groups( + feature_values, growth_params.threshold_params, rng + ): + split_score = scoring.calc_split_score( + random_tree_models.params.MetricNames(measure_name), + y, + target_groups, + yhat=yhat, + g=g, + h=h, + growth_params=growth_params, + ) + + if best is None or split_score > best.score: + best = BestSplit( + score=float(split_score), + column=int(array_column), + threshold=float(threshold), + target_groups=target_groups, + default_is_left=default_is_left, + ) + + if best is None: + raise ValueError(f"Something went wrong {best=} cannot be None.") + return best + + +def check_if_split_sensible( + best: BestSplit, + parent_node: Node | None, + growth_params: random_tree_models.params.TreeGrowthParameters, +) -> tuple[bool, float | None]: + "Verifies if split is sensible, considering score gain and left/right group sizes" + parent_is_none = parent_node is None + if parent_is_none: + return False, None + + measure_is_none = parent_node.measure is None + if measure_is_none: + return False, None + + value_is_none = parent_node.measure.value is None # type: ignore + if value_is_none: + return False, None + + # score gain + gain = best.score - parent_node.measure.value # type: ignore + is_insufficient_gain = gain < growth_params.min_improvement + + # left/right group assignment + all_on_one_side = bool(best.target_groups.all()) + all_on_other_side = bool(np.logical_not(best.target_groups).all()) + is_all_onesided = all_on_one_side or all_on_other_side + + is_not_sensible = is_all_onesided or is_insufficient_gain + + return is_not_sensible, gain + + +def select_arrays_for_child_node( + go_left: bool, + best: BestSplit, + X: np.ndarray, + y: np.ndarray, + g: np.ndarray | None = None, + h: np.ndarray | None = None, +) -> tuple[np.ndarray, np.ndarray, np.ndarray | None, np.ndarray | None]: + mask = best.target_groups == go_left + _X = X[mask, :] + _y = y[mask] + _g = g[mask] if g is not None else None + _h = h[mask] if h is not None else None + return _X, _y, _g, _h diff --git a/src/random_tree_models/decisiontree/split_objects.py b/src/random_tree_models/decisiontree/split_objects.py new file mode 100644 index 0000000..e1e95c7 --- /dev/null +++ b/src/random_tree_models/decisiontree/split_objects.py @@ -0,0 +1,18 @@ +import numpy as np +from pydantic import ConfigDict, Field, StrictBool, StrictFloat, StrictInt, StrictStr +from pydantic.dataclasses import dataclass + + +@dataclass(validate_on_init=True) +class SplitScore: + name: StrictStr # name of the score used + value: StrictFloat | None = None # optimization value gini etc + + +@dataclass(config=ConfigDict(arbitrary_types_allowed=True)) +class BestSplit: + score: StrictFloat + column: StrictInt + threshold: StrictFloat + target_groups: np.ndarray = Field(default_factory=lambda: np.zeros(10)) + default_is_left: StrictBool | None = None diff --git a/src/random_tree_models/decisiontree/train.py b/src/random_tree_models/decisiontree/train.py new file mode 100644 index 0000000..1f33cc1 --- /dev/null +++ b/src/random_tree_models/decisiontree/train.py @@ -0,0 +1,187 @@ +import typing as T + +import numpy as np + +import random_tree_models.leafweights as leafweights +import random_tree_models.params +import random_tree_models.scoring as scoring +from random_tree_models.decisiontree.node import Node +from random_tree_models.decisiontree.split import ( + check_if_split_sensible, + find_best_split, + select_arrays_for_child_node, +) +from random_tree_models.decisiontree.split_objects import SplitScore +from random_tree_models.params import MetricNames + + +def check_is_baselevel(y: np.ndarray, depth: int, max_depth: int) -> T.Tuple[bool, str]: + """Verifies if the tree traversal reached the baselevel / a leaf + * group homogeneous / cannot sensibly be splitted further + * no data in the group + * max depth reached + """ + if max_depth is not None and depth >= max_depth: + return (True, "max depth reached") + elif len(np.unique(y)) == 1: + return (True, "homogenous group") + elif len(y) <= 1: + return (True, "<= 1 data point in group") + else: + return (False, "") + + +def calc_leaf_weight_and_split_score( + y: np.ndarray, + measure_name: random_tree_models.params.MetricNames, + growth_params: random_tree_models.params.TreeGrowthParameters, + g: np.ndarray | None = None, + h: np.ndarray | None = None, +) -> tuple[float | None, float]: + leaf_weight = leafweights.calc_leaf_weight(y, measure_name, growth_params, g=g, h=h) + + yhat = leaf_weight * np.ones_like(y) + score = scoring.calc_split_score( + measure_name, + y, + np.ones_like(y, dtype=bool), + yhat=yhat, + g=g, + h=h, + growth_params=growth_params, + ) + + return leaf_weight, score + + +def grow_tree( + X: np.ndarray, + y: np.ndarray, + measure_name: MetricNames, + growth_params: random_tree_models.params.TreeGrowthParameters, + parent_node: Node | None = None, + depth: int = 0, + g: np.ndarray | None = None, + h: np.ndarray | None = None, + random_state: int = 42, + **kwargs, +) -> Node: + """Implementation of the Classification And Regression Tree (CART) algorithm + + Args: + X (np.ndarray): Input feature values to do thresholding on. + y (np.ndarray): Target values. + measure_name (str): Values indicating which functions in scoring.SplitScoreMetrics and leafweights.LeafWeightSchemes to call. + parent_node (Node, optional): Parent node in tree. Defaults to None. + depth (int, optional): Current tree depth. Defaults to 0. + growth_params (utils.TreeGrowthParameters, optional): Parameters controlling tree growth. Defaults to None. + g (np.ndarray, optional): Boosting and loss specific precomputed 1st order derivative dloss/dyhat. Defaults to None. + h (np.ndarray, optional): Boosting and loss specific precomputed 2nd order derivative d^2loss/dyhat^2. Defaults to None. + + Raises: + ValueError: Fails if parent node passes an empty y array. + + Returns: + Node: Tree node with leaf weight, node score and potential child nodes. + + Note: + Currently measure_name controls how the split score and the leaf weights are computed. + + But only the decision tree algorithm directly uses y for that and can predict y using the leaf weight values directly. + + For the boosting algorithms g and h are used to compute split score and leaf weights. Their leaf weights + sometimes also need post-processing, e.g. for binary classification. Computation of g and h and post-processing is not + done here but in the respective class implementations of the algorithms. + """ + + n_obs = len(y) + if n_obs == 0: + raise ValueError( + f"Something went wrong. {parent_node=} handed down an empty set of data points." + ) + + is_baselevel, reason = check_is_baselevel( + y, depth, max_depth=growth_params.max_depth + ) + if parent_node is None: + scoring.reset_incrementing_score() + + # compute leaf weight (for prediction) and node score (for split gain check) + leaf_weight, score = calc_leaf_weight_and_split_score( + y, measure_name, growth_params, g, h + ) + + if is_baselevel: # end of the line buddy + return Node( + prediction=leaf_weight, + measure=SplitScore(measure_name, value=score), + n_obs=n_obs, + reason=reason, + depth=depth, + ) + + # find best split + rng = np.random.RandomState(random_state) + + best = find_best_split( + X, y, measure_name, g=g, h=h, growth_params=growth_params, rng=rng + ) + + # check if improvement due to split is below minimum requirement + is_not_sensible_split, gain = check_if_split_sensible( + best, parent_node, growth_params + ) + + if is_not_sensible_split: + reason = f"gain due split ({gain=}) lower than {growth_params.min_improvement=} or all data points assigned to one side (is left {best.target_groups.mean()=:.2%})" + leaf_node = Node( + prediction=leaf_weight, + measure=SplitScore(measure_name, value=score), + n_obs=n_obs, + reason=reason, + depth=depth, + ) + return leaf_node + + # create new parent node for subsequent child nodes + new_node = Node( + array_column=best.column, + threshold=best.threshold, + prediction=leaf_weight, + default_is_left=best.default_is_left, + measure=SplitScore(measure_name, best.score), + n_obs=n_obs, + reason="", + depth=depth, + ) + random_state_left, random_state_right = rng.randint(0, 2**32, size=2) + + # descend left + _X, _y, _g, _h = select_arrays_for_child_node(True, best, X, y, g, h) + new_node.left = grow_tree( + _X, + _y, + measure_name=measure_name, + growth_params=growth_params, + parent_node=new_node, + depth=depth + 1, + g=_g, + h=_h, + random_state=random_state_left, + ) + + # descend right + _X, _y, _g, _h = select_arrays_for_child_node(False, best, X, y, g, h) + new_node.right = grow_tree( + _X, + _y, + measure_name=measure_name, + growth_params=growth_params, + parent_node=new_node, + depth=depth + 1, + g=_g, + h=_h, + random_state=random_state_right, + ) + + return new_node diff --git a/src/random_tree_models/decisiontree/visualize.py b/src/random_tree_models/decisiontree/visualize.py new file mode 100644 index 0000000..8d4aa60 --- /dev/null +++ b/src/random_tree_models/decisiontree/visualize.py @@ -0,0 +1,42 @@ +from rich import print as rprint +from rich.tree import Tree + +from random_tree_models.decisiontree.estimators import DecisionTreeTemplate +from random_tree_models.decisiontree.node import Node + + +def walk_tree( + decision_tree: Node, + tree: Tree, + parent: Node | None = None, + is_left: bool | None = None, +): + arrow = ( + "" + if parent is None + else f"[magenta](< {parent.threshold:.3f})[/magenta]" + if is_left + else f"[magenta](>= {parent.threshold:.3f})[/magenta]" + ) + + if decision_tree.is_leaf: # base cases + branch = tree.add( + f"{arrow} 🍁 # obs: [cyan]{decision_tree.n_obs}[/cyan], value: [green]{decision_tree.prediction:.3f}[/green], leaf reason '{decision_tree.reason}'" + ) + return None + else: + branch = tree.add( + f"{arrow} col idx: {decision_tree.array_column}, threshold: [magenta]{decision_tree.threshold:.3f}[/magenta]" + ) + + if decision_tree.left is not None: # go left + walk_tree(decision_tree.left, branch, decision_tree, True) + + if decision_tree.right is not None: # go right + walk_tree(decision_tree.right, branch, decision_tree, False) + + +def show_tree(decision_tree: DecisionTreeTemplate): + tree = Tree(f"Represenation of 🌲 ({decision_tree})") + walk_tree(decision_tree.tree_, tree) + rprint(tree) diff --git a/src/random_tree_models/extratrees.py b/src/random_tree_models/extratrees.py index a76b97c..1a67dba 100644 --- a/src/random_tree_models/extratrees.py +++ b/src/random_tree_models/extratrees.py @@ -10,8 +10,8 @@ ) import random_tree_models.decisiontree as dtree -import random_tree_models.utils as utils -from random_tree_models.scoring import MetricNames +import random_tree_models.params as utils +from random_tree_models.params import MetricNames class ExtraTreesTemplate(base.BaseEstimator): diff --git a/src/random_tree_models/gradientboostedtrees.py b/src/random_tree_models/gradientboostedtrees.py index 35f3c98..7bbe872 100644 --- a/src/random_tree_models/gradientboostedtrees.py +++ b/src/random_tree_models/gradientboostedtrees.py @@ -14,7 +14,7 @@ ) import random_tree_models.decisiontree as dtree -from random_tree_models.scoring import MetricNames +from random_tree_models.params import MetricNames class GradientBoostedTreesTemplate(base.BaseEstimator): diff --git a/src/random_tree_models/isolationforest.py b/src/random_tree_models/isolationforest.py index 532044b..4be543d 100644 --- a/src/random_tree_models/isolationforest.py +++ b/src/random_tree_models/isolationforest.py @@ -5,14 +5,22 @@ from sklearn import base from sklearn.utils.validation import check_is_fitted, validate_data # type: ignore -import random_tree_models.decisiontree as dtree -from random_tree_models.scoring import MetricNames -from random_tree_models.utils import ColumnSelectionMethod, ThresholdSelectionMethod - - -def predict_with_isolationtree(tree: dtree.Node, X: np.ndarray) -> np.ndarray: +from random_tree_models.decisiontree import ( + DecisionTreeTemplate, + find_leaf_node, + grow_tree, +) +from random_tree_models.decisiontree.node import Node +from random_tree_models.params import ( + ColumnSelectionMethod, + MetricNames, + ThresholdSelectionMethod, +) + + +def predict_with_isolationtree(tree: Node, X: np.ndarray) -> np.ndarray: "Traverse a previously built tree to make one prediction per row in X" - if not isinstance(tree, dtree.Node): + if not isinstance(tree, Node): raise ValueError( f"Passed `tree` needs to be an instantiation of Node, got {tree=}" ) @@ -20,13 +28,13 @@ def predict_with_isolationtree(tree: dtree.Node, X: np.ndarray) -> np.ndarray: predictions = np.zeros(X.shape[0], dtype=int) for i in range(n_obs): - leaf_node = dtree.find_leaf_node(tree, X[i, :]) + leaf_node = find_leaf_node(tree, X[i, :]) predictions[i] = leaf_node.depth return predictions -class IsolationTree(base.OutlierMixin, dtree.DecisionTreeTemplate): +class IsolationTree(base.OutlierMixin, DecisionTreeTemplate): """Isolation tree Liu et al. 2006, Isolation Forest, algorithm 2 @@ -60,7 +68,7 @@ def fit( _X, _y, self.ix_features_ = self._select_samples_and_features(X, dummy_y) - self.tree_ = dtree.grow_tree( + self.tree_ = grow_tree( _X, _y, measure_name=self.measure_name, diff --git a/src/random_tree_models/leafweights.py b/src/random_tree_models/leafweights.py index 165a616..de48e39 100644 --- a/src/random_tree_models/leafweights.py +++ b/src/random_tree_models/leafweights.py @@ -1,8 +1,7 @@ import numpy as np -import random_tree_models.utils as utils -from random_tree_models import scoring -from random_tree_models.scoring import MetricNames +import random_tree_models.params as utils +from random_tree_models.params import MetricNames def leaf_weight_mean(y: np.ndarray) -> float: @@ -32,7 +31,7 @@ def leaf_weight_xgboost( def calc_leaf_weight( y: np.ndarray, - measure_name: scoring.MetricNames, + measure_name: utils.MetricNames, growth_params: utils.TreeGrowthParameters, g: np.ndarray | None = None, h: np.ndarray | None = None, diff --git a/src/random_tree_models/params.py b/src/random_tree_models/params.py new file mode 100644 index 0000000..b261bb8 --- /dev/null +++ b/src/random_tree_models/params.py @@ -0,0 +1,104 @@ +from enum import StrEnum, auto +from typing import Any + +from pydantic import BaseModel, StrictFloat, StrictInt + + +class ColumnSelectionMethod(StrEnum): + ascending = "ascending" + largest_delta = "largest_delta" + random = "random" + + +class ThresholdSelectionMethod(StrEnum): + bruteforce = "bruteforce" + quantile = "quantile" + random = "random" + uniform = "uniform" + + +class ThresholdSelectionParameters(BaseModel): + method: ThresholdSelectionMethod + quantile: StrictFloat = 0.1 + random_state: StrictInt = 0 + n_thresholds: StrictInt = 100 + num_quantile_steps: StrictInt = -1 + + def model_post_init(self, context: Any): + # verify method + # expected = ThresholdSelectionMethod.__members__.keys() + # is_okay = self.method in expected + # if not is_okay: + # raise ValueError( + # f"passed value for method ('{self.method}') not one of {expected}" + # ) + + # verify quantile + is_okay = 0.0 < self.quantile < 1.0 + if not is_okay: + raise ValueError(f"{self.quantile=} not in (0, 1)") + is_okay = 1 / self.quantile % 1 == 0 + if not is_okay: + raise ValueError(f"{self.quantile=} not a valid quantile") + + # verify random_state + is_okay = self.random_state >= 0 + if not is_okay: + raise ValueError(f"{self.random_state=} not in [0, inf)") + + # verify n_thresholds valid int + is_okay = self.n_thresholds > 0 + if not is_okay: + raise ValueError(f"{self.n_thresholds=} not > 0") + + # set dq + self.num_quantile_steps = int(1 / self.quantile) + 1 + + +class ColumnSelectionParameters(BaseModel): + method: ColumnSelectionMethod + n_trials: StrictInt | None = None + + +class TreeGrowthParameters(BaseModel): + max_depth: StrictInt + min_improvement: StrictFloat = 0.0 + # xgboost lambda - multiplied with sum of squares of leaf weights + # see Chen et al. 2016 equation 2 + lam: StrictFloat = 0.0 + frac_subsamples: StrictFloat = 1.0 + frac_features: StrictFloat = 1.0 + random_state: StrictInt = 0 + threshold_params: ThresholdSelectionParameters = ThresholdSelectionParameters( + method=ThresholdSelectionMethod.bruteforce, + quantile=0.1, + random_state=0, + n_thresholds=100, + ) + column_params: ColumnSelectionParameters = ColumnSelectionParameters( + method=ColumnSelectionMethod.ascending, n_trials=None + ) + + def model_post_init(self, context: Any): + # verify frac_subsamples + is_okay = 0.0 < self.frac_subsamples <= 1.0 + if not is_okay: + raise ValueError(f"{self.frac_subsamples=} not in (0, 1]") + + # verify frac_features + is_okay = 0.0 < self.frac_features <= 1.0 + if not is_okay: + raise ValueError(f"{self.frac_features=} not in (0, 1]") + + +class MetricNames(StrEnum): + variance = auto() + entropy = auto() + entropy_rs = auto() + gini = auto() + gini_rs = auto() + # variance for split score because Friedman et al. 2001 in Algorithm 1 + # step 4 minimize the squared error between actual and predicted dloss/dyhat + friedman_binary_classification = auto() + xgboost = auto() + incrementing = auto() diff --git a/src/random_tree_models/randomforest.py b/src/random_tree_models/randomforest.py index 0bdc356..dbe0164 100644 --- a/src/random_tree_models/randomforest.py +++ b/src/random_tree_models/randomforest.py @@ -10,7 +10,7 @@ ) import random_tree_models.decisiontree as dtree -from random_tree_models.scoring import MetricNames +from random_tree_models.params import MetricNames class RandomForestTemplate(base.BaseEstimator): diff --git a/src/random_tree_models/scoring.py b/src/random_tree_models/scoring.py index 39c7fc2..864f83f 100644 --- a/src/random_tree_models/scoring.py +++ b/src/random_tree_models/scoring.py @@ -1,9 +1,8 @@ -from enum import StrEnum, auto - import numpy as np -import random_tree_models.utils as utils +import random_tree_models.params as utils from random_tree_models import rs_entropy, rs_gini_impurity +from random_tree_models.params import MetricNames def check_y_and_target_groups(y: np.ndarray, target_groups: np.ndarray | None = None): @@ -216,19 +215,6 @@ def reset_incrementing_score(): INC_SCORE = 0 -class MetricNames(StrEnum): - variance = auto() - entropy = auto() - entropy_rs = auto() - gini = auto() - gini_rs = auto() - # variance for split score because Friedman et al. 2001 in Algorithm 1 - # step 4 minimize the squared error between actual and predicted dloss/dyhat - friedman_binary_classification = auto() - xgboost = auto() - incrementing = auto() - - def calc_split_score( metric: MetricNames, y: np.ndarray, diff --git a/src/random_tree_models/utils.py b/src/random_tree_models/utils.py index 0efe452..313a9a3 100644 --- a/src/random_tree_models/utils.py +++ b/src/random_tree_models/utils.py @@ -1,98 +1,8 @@ import logging -from enum import StrEnum -from typing import Any -from pydantic import BaseModel, StrictFloat, StrictInt from rich.logging import RichHandler -class ColumnSelectionMethod(StrEnum): - ascending = "ascending" - largest_delta = "largest_delta" - random = "random" - - -class ThresholdSelectionMethod(StrEnum): - bruteforce = "bruteforce" - quantile = "quantile" - random = "random" - uniform = "uniform" - - -class ThresholdSelectionParameters(BaseModel): - method: ThresholdSelectionMethod - quantile: StrictFloat = 0.1 - random_state: StrictInt = 0 - n_thresholds: StrictInt = 100 - num_quantile_steps: StrictInt = -1 - - def model_post_init(self, context: Any): - # verify method - # expected = ThresholdSelectionMethod.__members__.keys() - # is_okay = self.method in expected - # if not is_okay: - # raise ValueError( - # f"passed value for method ('{self.method}') not one of {expected}" - # ) - - # verify quantile - is_okay = 0.0 < self.quantile < 1.0 - if not is_okay: - raise ValueError(f"{self.quantile=} not in (0, 1)") - is_okay = 1 / self.quantile % 1 == 0 - if not is_okay: - raise ValueError(f"{self.quantile=} not a valid quantile") - - # verify random_state - is_okay = self.random_state >= 0 - if not is_okay: - raise ValueError(f"{self.random_state=} not in [0, inf)") - - # verify n_thresholds valid int - is_okay = self.n_thresholds > 0 - if not is_okay: - raise ValueError(f"{self.n_thresholds=} not > 0") - - # set dq - self.num_quantile_steps = int(1 / self.quantile) + 1 - - -class ColumnSelectionParameters(BaseModel): - method: ColumnSelectionMethod - n_trials: StrictInt | None = None - - -class TreeGrowthParameters(BaseModel): - max_depth: StrictInt - min_improvement: StrictFloat = 0.0 - # xgboost lambda - multiplied with sum of squares of leaf weights - # see Chen et al. 2016 equation 2 - lam: StrictFloat = 0.0 - frac_subsamples: StrictFloat = 1.0 - frac_features: StrictFloat = 1.0 - random_state: StrictInt = 0 - threshold_params: ThresholdSelectionParameters = ThresholdSelectionParameters( - method=ThresholdSelectionMethod.bruteforce, - quantile=0.1, - random_state=0, - n_thresholds=100, - ) - column_params: ColumnSelectionParameters = ColumnSelectionParameters( - method=ColumnSelectionMethod.ascending, n_trials=None - ) - - def model_post_init(self, context: Any): - # verify frac_subsamples - is_okay = 0.0 < self.frac_subsamples <= 1.0 - if not is_okay: - raise ValueError(f"{self.frac_subsamples=} not in (0, 1]") - - # verify frac_features - is_okay = 0.0 < self.frac_features <= 1.0 - if not is_okay: - raise ValueError(f"{self.frac_features=} not in (0, 1]") - - def _get_logger(level=logging.INFO): for handler in logging.root.handlers[:]: logging.root.removeHandler(handler) diff --git a/src/random_tree_models/xgboost.py b/src/random_tree_models/xgboost.py index a228546..6a36310 100644 --- a/src/random_tree_models/xgboost.py +++ b/src/random_tree_models/xgboost.py @@ -28,7 +28,7 @@ import random_tree_models.decisiontree as dtree import random_tree_models.gradientboostedtrees as gbt -from random_tree_models.scoring import MetricNames +from random_tree_models.params import MetricNames class XGBoostTemplate(base.BaseEstimator): diff --git a/tests/decisiontree/__init__.py b/tests/decisiontree/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/decisiontree/conftest.py b/tests/decisiontree/conftest.py new file mode 100644 index 0000000..a86f141 --- /dev/null +++ b/tests/decisiontree/conftest.py @@ -0,0 +1,28 @@ +from random_tree_models.decisiontree.node import Node + +# first value in each tuple is the value to test and the second is the flag indicating if this should work +BOOL_OPTIONS_NONE_OKAY = [(False, True), (True, True), ("blub", False)] +INT_OPTIONS_NONE_OKAY = [(0, True), (None, True), ("blub", False)] +INT_OPTIONS_NONE_NOT_OKAY = [(0, True), (None, False), ("blub", False)] +FLOAT_OPTIONS_NONE_OKAY = [ + (-1.0, True), + (None, True), + ("blub", False), +] +FLOAT_OPTIONS_NONE_NOT_OKAY = [ + (-1.0, True), + (None, False), + ("blub", False), +] +NODE_OPTIONS_NONE_OKAY = [ + (Node(), True), + (None, True), + ("blub", False), +] +STR_OPTIONS_NONE_OKAY = [("blub", True), (None, True), (1.0, False)] +STR_OPTIONS_NONE_NOT_OKAY = [ + ("blub", True), + (None, False), + (1, False), + (1.0, False), +] diff --git a/tests/decisiontree/test_estimators.py b/tests/decisiontree/test_estimators.py new file mode 100644 index 0000000..20092c7 --- /dev/null +++ b/tests/decisiontree/test_estimators.py @@ -0,0 +1,184 @@ +import numpy as np +import pytest +from sklearn.utils.estimator_checks import parametrize_with_checks + +import random_tree_models.params +from random_tree_models.decisiontree import ( + DecisionTreeClassifier, + DecisionTreeRegressor, +) +from random_tree_models.decisiontree.estimators import DecisionTreeTemplate +from random_tree_models.decisiontree.node import Node +from random_tree_models.params import MetricNames +from tests.conftest import expected_failed_checks + + +class TestDecisionTreeTemplate: + model = DecisionTreeTemplate(measure_name=MetricNames.entropy) + X = np.random.normal(size=(100, 10)) + y = np.random.normal(size=(100,)) + + def test_tree_(self): + assert not hasattr(self.model, "tree_") + + def test_growth_params_(self): + assert not hasattr(self.model, "growth_params_") + + self.model._organize_growth_parameters() + assert isinstance( + self.model.growth_params_, random_tree_models.params.TreeGrowthParameters + ) + + def test_fit(self): + try: + self.model.fit(None, None) # type: ignore + except NotImplementedError as ex: + pytest.xfail("DecisionTreeTemplate.fit expectedly refused call") + + def test_predict(self): + try: + self.model.predict(None) # type: ignore + except NotImplementedError as ex: + pytest.xfail("DecisionTreeTemplate.predict expectedly refused call") + + def test_select_samples_and_features_no_sampling(self): + self.model.frac_features = 1.0 + self.model.frac_subsamples = 1.0 + self.model._organize_growth_parameters() + + # line to test + X, y, ix_features = self.model._select_samples_and_features(self.X, self.y) + + assert np.allclose(X, self.X) + assert np.allclose(y, self.y) + assert np.allclose(ix_features, np.arange(0, self.X.shape[1], 1)) + + def test_select_samples_and_features_with_column_sampling(self): + self.model.frac_features = 0.5 + self.model.frac_subsamples = 1.0 + self.model._organize_growth_parameters() + + # line to test + X, y, ix_features = self.model._select_samples_and_features(self.X, self.y) + + assert np.isclose( + X.shape[1], self.X.shape[1] * self.model.frac_features, atol=1 + ) + assert np.isclose(y.shape[0], self.y.shape[0]) + assert all([ix in np.arange(0, self.X.shape[1], 1) for ix in ix_features]) + + def test_select_samples_and_features_with_row_sampling(self): + self.model.frac_features = 1.0 + self.model.frac_subsamples = 0.5 + self.model._organize_growth_parameters() + + # line to test + X, y, ix_features = self.model._select_samples_and_features(self.X, self.y) + + assert np.isclose(X.shape[0], self.X.shape[0] * self.model.frac_subsamples) + assert np.isclose(y.shape[0], self.y.shape[0] * self.model.frac_subsamples) + assert np.allclose(ix_features, np.arange(0, self.X.shape[1], 1)) + + def test_select_samples_and_features_with_column_and_row_sampling(self): + self.model.frac_features = 0.5 + self.model.frac_subsamples = 0.5 + self.model._organize_growth_parameters() + + # line to test + X, y, ix_features = self.model._select_samples_and_features(self.X, self.y) + + assert np.isclose( + X.shape[1], self.X.shape[1] * self.model.frac_features, atol=1 + ) + assert np.isclose(X.shape[0], self.X.shape[0] * self.model.frac_subsamples) + assert np.isclose(y.shape[0], self.y.shape[0] * self.model.frac_subsamples) + assert all([ix in np.arange(0, self.X.shape[1], 1) for ix in ix_features]) + + def test_select_samples_and_features_sampling_reproducibility(self): + self.model.frac_features = 0.5 + self.model.frac_subsamples = 0.5 + self.model._organize_growth_parameters() + + # line to test + X0, y0, ix_features0 = self.model._select_samples_and_features(self.X, self.y) + X1, y1, ix_features1 = self.model._select_samples_and_features(self.X, self.y) + + assert np.allclose(X0, X1) + assert np.allclose(y0, y1) + assert np.allclose(ix_features0, ix_features1) + + def test_select_features(self): + ix_features = np.arange(0, self.X.shape[1], 1) + _X = self.model._select_features(self.X, ix_features) + assert np.allclose(_X, self.X) + + ix_features = np.array([0, 1, 2]) + _X = self.model._select_features(self.X, ix_features) + assert _X.shape[1] == 3 + + +class TestDecisionTreeRegressor: + model = DecisionTreeRegressor() + + X = np.array( + [ + [-1, -1], + [1, -1], + [1, 1], + [-1, 1], + ] + ) + y = np.array([0.0, 0.0, 1.0, 1.0]) + + def test_fit(self): + model = DecisionTreeRegressor() + model.fit(self.X, self.y) + assert isinstance(model.tree_, Node) + + def test_predict(self): + model = DecisionTreeRegressor() + model.fit(self.X, self.y) + predictions = model.predict(self.X) + assert np.allclose(predictions, self.y) + + +class TestDecisionTreeClassifier: + model = DecisionTreeClassifier() + + X = np.array( + [ + [-1, -1], + [1, -1], + [1, 1], + [-1, 1], + ] + ) + y = np.array([False, False, True, True]) + + def test_classes_(self): + assert not hasattr(self.model, "classes_") + + def test_fit(self): + model = DecisionTreeClassifier() + model.fit(self.X, self.y) + assert not hasattr(self.model, "classes_") + assert isinstance(model.tree_, Node) + + def test_predict(self): + model = DecisionTreeClassifier() + model.fit(self.X, self.y) + predictions = model.predict(self.X) + assert (predictions == self.y).all() + + +@parametrize_with_checks( + [DecisionTreeRegressor(), DecisionTreeClassifier()], + expected_failed_checks=expected_failed_checks, # type: ignore +) +def test_dtree_estimators_with_sklearn_checks(estimator, check): + """Test of estimators using scikit-learn test suite + + Reference: https://scikit-learn.org/stable/modules/generated/sklearn.utils.estimator_checks.parametrize_with_checks.html#sklearn.utils.estimator_checks.parametrize_with_checks + """ + + check(estimator) diff --git a/tests/decisiontree/test_node.py b/tests/decisiontree/test_node.py new file mode 100644 index 0000000..9203810 --- /dev/null +++ b/tests/decisiontree/test_node.py @@ -0,0 +1,84 @@ +import pytest +from pydantic import ValidationError + +from random_tree_models.decisiontree.node import Node +from random_tree_models.decisiontree.split_objects import SplitScore + +from .conftest import ( + BOOL_OPTIONS_NONE_OKAY, + FLOAT_OPTIONS_NONE_OKAY, + INT_OPTIONS_NONE_OKAY, + NODE_OPTIONS_NONE_OKAY, + STR_OPTIONS_NONE_OKAY, +) + + +@pytest.mark.parametrize( + "int_val, float_val, node_val, str_val, bool_val", + [ + (int_val, float_val, node_val, str_val, bool_val) + for int_val in INT_OPTIONS_NONE_OKAY + for float_val in FLOAT_OPTIONS_NONE_OKAY + for node_val in NODE_OPTIONS_NONE_OKAY + for str_val in STR_OPTIONS_NONE_OKAY + for bool_val in BOOL_OPTIONS_NONE_OKAY + ], +) +def test_Node(int_val, float_val, node_val, str_val, bool_val): + array_column, array_column_okay = int_val + threshold, threshold_okay = float_val + prediction, prediction_okay = float_val + left, left_okay = node_val + right, right_okay = node_val + n_obs, n_obs_okay = int_val + reason, reason_okay = str_val + default_is_left, default_is_left_okay = bool_val + + is_okay = all( + [ + array_column_okay, + threshold_okay, + prediction_okay, + left_okay, + right_okay, + n_obs_okay, + reason_okay, + default_is_left_okay, + ] + ) + measure = SplitScore(name="blub", value=1.0) + try: + # line to test + node = Node( + array_column=array_column, + threshold=threshold, + prediction=prediction, + default_is_left=default_is_left, + left=left, + right=right, + measure=measure, + n_obs=n_obs, + reason=reason, + ) + except ValidationError as ex: + if is_okay: + raise ex + else: + pytest.xfail("SplitScore validation failed as expected") + else: + for att in [ + "array_column", + "threshold", + "prediction", + "default_is_left", + "left", + "right", + "measure", + "n_obs", + "reason", + "node_id", + ]: + assert hasattr(node, att), f"{att=} missing in Node" + assert node.is_leaf == ((left is None) and (right is None)), ( + f"left: {left is None} right: {right is None}" + ) diff --git a/tests/decisiontree/test_predict.py b/tests/decisiontree/test_predict.py new file mode 100644 index 0000000..f0806d7 --- /dev/null +++ b/tests/decisiontree/test_predict.py @@ -0,0 +1,69 @@ +import numpy as np +import pytest + +from random_tree_models.decisiontree.node import Node +from random_tree_models.decisiontree.predict import find_leaf_node, predict_with_tree + + +@pytest.mark.parametrize( + "x,exp", + [ + (np.array([-1, -1]), 0.0), + (np.array([1, -1]), 1.0), + (np.array([1, 1]), 2.0), + (np.array([-1, 1]), 3.0), + ], +) +def test_find_leaf_node(x: np.ndarray, exp: float): + tree = Node( + array_column=0, + threshold=0.0, + left=Node( + array_column=1, + threshold=0.0, + left=Node(prediction=0.0), + right=Node(prediction=3.0), + ), + right=Node( + array_column=1, + threshold=0.0, + left=Node(prediction=1.0), + right=Node(prediction=2.0), + ), + ) + # line to test + leaf = find_leaf_node(tree, x) + + assert leaf.prediction == exp + + +def test_predict_with_tree(): + X = np.array( + [ + [-1.0, -1.0], + [1.0, -1.0], + [1.0, 1.0], + [-1.0, 1.0], + ] + ) + tree = Node( + array_column=0, + threshold=0.0, + left=Node( + array_column=1, + threshold=0.0, + left=Node(prediction=0.0), + right=Node(prediction=3.0), + ), + right=Node( + array_column=1, + threshold=0.0, + left=Node(prediction=1.0), + right=Node(prediction=2.0), + ), + ) + + # line to test + predictions = predict_with_tree(tree, X) + + assert np.allclose(predictions, np.arange(0, 4, 1)) diff --git a/tests/decisiontree/test_split.py b/tests/decisiontree/test_split.py new file mode 100644 index 0000000..266b5de --- /dev/null +++ b/tests/decisiontree/test_split.py @@ -0,0 +1,761 @@ +from typing import Generator + +import numpy as np +import pytest +from pydantic import ValidationError +from scipy import stats + +from random_tree_models.decisiontree.node import Node +from random_tree_models.decisiontree.split import ( + BestSplit, + check_if_split_sensible, + find_best_split, + get_column, + get_thresholds_and_target_groups, + select_arrays_for_child_node, + select_thresholds, +) +from random_tree_models.decisiontree.split_objects import SplitScore +from random_tree_models.params import ( + ColumnSelectionMethod, + ColumnSelectionParameters, + ThresholdSelectionMethod, + ThresholdSelectionParameters, + TreeGrowthParameters, +) + +from .conftest import ( + BOOL_OPTIONS_NONE_OKAY, + FLOAT_OPTIONS_NONE_NOT_OKAY, + INT_OPTIONS_NONE_NOT_OKAY, +) + + +@pytest.mark.parametrize( + "score,column,threshold,target_groups,default_is_left", + [ + (score, column, threshold, target_groups, default_is_left) + for score in FLOAT_OPTIONS_NONE_NOT_OKAY + for column in INT_OPTIONS_NONE_NOT_OKAY + for threshold in FLOAT_OPTIONS_NONE_NOT_OKAY + for target_groups in [ + (np.array([1, 2, 3]), True), + (np.array([]), True), + (None, False), + ] + for default_is_left in BOOL_OPTIONS_NONE_OKAY + ], +) +def test_BestSplit(score, column, threshold, target_groups, default_is_left): + score, score_okay = score + column, column_okay = column + threshold, threshold_okay = threshold + target_groups, target_groups_okay = target_groups + default_is_left, default_is_left_okay = default_is_left + + is_okay = all( + [ + score_okay, + column_okay, + threshold_okay, + target_groups_okay, + default_is_left_okay, + ] + ) + is_bad = not is_okay + + try: + # line to test + best = BestSplit( + score=score, + column=column, + threshold=threshold, + target_groups=target_groups, + default_is_left=default_is_left, + ) + except ValidationError as ex: + if is_okay: + raise ex + else: + pytest.xfail("BestSplit validation failed as expected") + else: + if is_bad: + pytest.fail( + f"BestSplit validation did pass unexpectedly with {score=}, {column=}, {threshold=}, {target_groups=}, {score_okay=}, {column_okay=}, {threshold_okay=}, {target_groups_okay=}, {is_bad=}" + ) + + assert hasattr(best, "score") + assert hasattr(best, "column") + assert hasattr(best, "threshold") + assert hasattr(best, "target_groups") + assert hasattr(best, "default_is_left") + + +class Test_select_thresholds: + """ + bruteforce: returns all possible thresholds from the 2nd onward + random: + * returns a random subset of the thresholds if n_thresholds smaller than avaliable values + * is reproducible with random_state + quantile: returns num_quantile_steps thresholds which are ordered + uniform: returns single value between min and max + """ + + def test_bruteforce(self): + params = ThresholdSelectionParameters( + method=ThresholdSelectionMethod.bruteforce + ) + feature_values = np.linspace(-1, 1, 100) + rng = np.random.RandomState(42) + + # line to test + thresholds = select_thresholds(feature_values, params, rng=rng) + + assert np.allclose(thresholds, feature_values[1:]) + + def test_random_when_to_few_values(self): + params = ThresholdSelectionParameters( + method=ThresholdSelectionMethod.random, n_thresholds=1000 + ) + feature_values = np.linspace(-1, 1, 100) + rng = np.random.RandomState(42) + + # line to test + thresholds = select_thresholds(feature_values, params, rng=rng) + + assert np.allclose(thresholds, feature_values[1:]) + + def test_random_when_enough_values(self): + n_thresholds = 10 + params = ThresholdSelectionParameters( + method=ThresholdSelectionMethod.random, n_thresholds=n_thresholds + ) + feature_values = np.linspace(-1, 1, 100) + rng = np.random.RandomState(42) + + # line to test + thresholds0 = select_thresholds(feature_values, params, rng=rng) + + assert thresholds0.shape == (n_thresholds,) + assert np.unique(thresholds0).shape == (n_thresholds,) + + def test_random_reproducible(self): + n_thresholds = 10 + params = ThresholdSelectionParameters( + method=ThresholdSelectionMethod.random, n_thresholds=n_thresholds + ) + feature_values = np.linspace(-1, 1, 100) + + # line to test + rng = np.random.RandomState(42) + thresholds0 = select_thresholds(feature_values, params, rng=rng) + rng = np.random.RandomState(42) + thresholds1 = select_thresholds(feature_values, params, rng=rng) + + assert np.allclose(thresholds0, thresholds1) + + def test_random_produces_changing_thresholds(self): + n_thresholds = 10 + params = ThresholdSelectionParameters( + method=ThresholdSelectionMethod.random, n_thresholds=n_thresholds + ) + feature_values = np.linspace(-1, 1, 100) + rng = np.random.RandomState(42) + + # line to test + thresholds0 = select_thresholds(feature_values, params, rng=rng) + thresholds1 = select_thresholds(feature_values, params, rng=rng) + + assert not np.allclose(thresholds0, thresholds1) + + def test_quantile(self): + n_thresholds = 10 + params = ThresholdSelectionParameters( + method=ThresholdSelectionMethod.quantile, + n_thresholds=n_thresholds, + quantile=0.1, + ) + feature_values = np.linspace(-1, 1, 100) + rng = np.random.RandomState(42) + + # line to test + thresholds = select_thresholds(feature_values, params, rng=rng) + + assert thresholds.shape == (11,) + assert (thresholds[1:] > thresholds[:-1]).all() + + def test_uniform(self): + n_thresholds = 10 + params = ThresholdSelectionParameters( + method=ThresholdSelectionMethod.uniform, n_thresholds=n_thresholds + ) + rng = np.random.RandomState(42) + feature_values = rng.normal(loc=0, scale=1, size=100) + + # line to test + thresholds = select_thresholds(feature_values, params, rng=rng) + + assert thresholds.shape == (1,) + assert thresholds[0] >= feature_values.min() + assert thresholds[0] <= feature_values.max() + + +class Test_get_thresholds_and_target_groups: + """ + * preduces a generator + * produces twice as many items to iterate in the case of missing values + * each item contains the current threshold, the target groups and a boolean that indicates the default direction + * the default direction is always None if there are no missing values and otherwise boolean + """ + + def test_produces_generator(self): + feature_values = np.linspace(-1, 1, 10) + threshold_params = ThresholdSelectionParameters( + method=ThresholdSelectionMethod.bruteforce + ) + rng = np.random.RandomState(42) + + # line to test + gen = get_thresholds_and_target_groups( + feature_values, threshold_params, rng=rng + ) + + assert isinstance(gen, Generator) + + def test_finite_only_case(self): + feature_values = np.linspace(-1, 1, 10) + threshold_params = ThresholdSelectionParameters( + method=ThresholdSelectionMethod.bruteforce + ) + rng = np.random.RandomState(42) + + # line to test + thresholds_and_target_groups = get_thresholds_and_target_groups( + feature_values, threshold_params, rng=rng + ) + + c = 0 + for ( + threshold, + target_groups, + default_direction_is_left, + ) in thresholds_and_target_groups: + assert isinstance(target_groups, np.ndarray) + assert threshold in feature_values[1:] + assert target_groups.dtype == bool + assert default_direction_is_left is None + c += 1 + + assert c == len(feature_values[1:]) + + def test_with_missing_case(self): + feature_values = np.linspace(-1, 1, 10) + feature_values[5] = np.nan + threshold_params = ThresholdSelectionParameters( + method=ThresholdSelectionMethod.bruteforce + ) + rng = np.random.RandomState(42) + + thresholds_and_target_groups = get_thresholds_and_target_groups( + feature_values, threshold_params, rng=rng + ) + + # line to test + c = 0 + for ( + threshold, + target_groups, + default_direction_is_left, + ) in thresholds_and_target_groups: + assert isinstance(target_groups, np.ndarray) + assert threshold in feature_values[1:] + assert target_groups.dtype == bool + assert default_direction_is_left in [True, False] + c += 1 + + assert c == 2 * (len(feature_values[1:]) - 1) + + +class Test_get_column: + """ + * method ascending just returns ascending integer list for columns + * method random returns random integer list for columns + * method largest_delta returns column indices with largest feature max-min differences first + * if n_columns_to_try is given it is used to shorted the returned list + """ + + def test_ascending(self): + n_columns = 10 + n_trials = None + column_params = ColumnSelectionParameters( + method=ColumnSelectionMethod.ascending, n_trials=n_trials + ) + X = np.random.normal(size=(100, n_columns)) + rng = np.random.RandomState(42) + + # line to test + columns = get_column(X, column_params, rng=rng) + + assert columns == list(range(n_columns)) + + def test_ascending_first_n_trials_columns(self): + n_columns = 10 + n_trials = 5 + column_params = ColumnSelectionParameters( + method=ColumnSelectionMethod.ascending, n_trials=n_trials + ) + X = np.random.normal(size=(100, n_columns)) + rng = np.random.RandomState(42) + + # line to test + columns = get_column(X, column_params, rng=rng) + + assert columns == list(range(n_trials)) + + def test_random(self): + n_columns = 10 + n_trials = None + column_params = ColumnSelectionParameters( + method=ColumnSelectionMethod.random, n_trials=n_trials + ) + X = np.random.normal(size=(100, n_columns)) + rng = np.random.RandomState(42) + + # line to test + columns = get_column(X, column_params, rng=rng) + + assert not all([i0 < i1 for i0, i1 in zip(columns[:-1], columns[1:])]) + assert sorted(columns) == list(range(n_columns)) + + def test_random_is_reproducible(self): + n_columns = 10 + n_trials = None + column_params = ColumnSelectionParameters( + method=ColumnSelectionMethod.random, n_trials=n_trials + ) + X = np.random.normal(size=(100, n_columns)) + + # line to test + rng = np.random.RandomState(42) + columns0 = get_column(X, column_params, rng=rng) + rng = np.random.RandomState(42) + columns1 = get_column(X, column_params, rng=rng) + + assert columns0 == columns1 + + def test_largest_delta(self): + n_columns = 5 + n_trials = None + column_params = ColumnSelectionParameters( + method=ColumnSelectionMethod.largest_delta, n_trials=n_trials + ) + rng = np.random.RandomState(42) + X = np.array([[0, 0.001], [0, 0.01], [0, 0.1], [0, 1.0], [0, 10.0]]).T + + n_repetitions = 100 + all_columns = np.zeros((n_repetitions, n_columns), dtype=int) + + for i in range(n_repetitions): + # line to test + all_columns[i, :] = get_column(X, column_params, rng=rng) + + assert np.allclose(stats.mode(all_columns, axis=0).mode, [4, 3, 2, 1, 0]) + + +class Test_find_best_split: + """ + cases to test for all measure_name values: + * simple & 1d is split as expected + * classification: y = 1 class, y = 2 classes, y = 3 classes + * regression: y = 1 value, y = 2 values, y = 3 values where 2 are more similar + * simple & 2d is split as expected + * same as 1d but 1st column useless and 2nd contains the needed info + """ + + X_1D = np.array( + [ + [ + 1, + ], + [ + 2, + ], + [ + 3, + ], + [ + 4, + ], + ] + ) + + X_1D_missing = np.array( + [ + [ + 1, + ], + [ + np.nan, + ], + [ + 3, + ], + [ + 4, + ], + ] + ) + + X_2D = np.hstack((np.ones_like(X_1D), X_1D)) + X_2D_missing = np.hstack((np.ones_like(X_1D_missing), X_1D_missing)) + + y_1class = np.ones(X_1D.shape[0], dtype=bool) + y_2class = np.array([False, False, True, True]) + y_3class = np.array([0, 0, 1, 2]) + + y_1reg = np.ones(X_1D.shape[0]) + y_2reg = np.array([-1.0, -1.0, 1.0, 1.0]) + y_3reg = np.array([-1.0, -0.9, 1.0, 2.0]) + + # xgboost - least squares + g_1reg = np.array([0.0, 0.0, 0.0, 0.0]) + g_2reg = np.array([-1.0, -1.0, 1.0, 1.0]) + g_3reg = np.array([-1.275, -1.175, 0.725, 1.725]) + + h_1reg = np.array([-1.0, -1.0, -1.0, -1.0]) + h_2reg = np.array([-1.0, -1.0, -1.0, -1.0]) + h_3reg = np.array([-1.0, -1.0, -1.0, -1.0]) + + # xgboost - binomial log-likelihood + g_2class = np.array([-1.0, -1.0, 1.0, 1.0]) + h_2class = np.array([-1.0, -1.0, -1.0, -1.0]) + + @pytest.mark.parametrize( + "y,ix,measure_name,g,h", + [ + (y_1class, None, "gini", None, None), + (y_2class, 2, "gini", None, None), + (y_3class, 2, "gini", None, None), + (y_1class, None, "entropy", None, None), + (y_2class, 2, "entropy", None, None), + (y_3class, 2, "entropy", None, None), + (y_1reg, None, "variance", None, None), + (y_2reg, 2, "variance", None, None), + (y_3reg, 2, "variance", None, None), + (y_1reg, None, "xgboost", g_1reg, h_1reg), + (y_2reg, 2, "xgboost", g_2reg, h_2reg), + (y_3reg, 2, "xgboost", g_3reg, h_3reg), + # (y_1class, None, "xgboost", g_1class, h_1class), # currently not handled + (y_2class, 2, "xgboost", g_2class, h_2class), + # (y_3class, 2, "xgboost", g_3class, h_3class), # currently not handled + ], + ) + def test_1d( + self, + y: np.ndarray, + ix: int, + measure_name: str, + g: np.ndarray, + h: np.ndarray, + ): + is_homogenous = len(np.unique(y)) == 1 + grow_params = TreeGrowthParameters(max_depth=2) + try: + # line to test + best = find_best_split( + self.X_1D, + y, + measure_name=measure_name, + g=g, + h=h, + growth_params=grow_params, + ) + except ValueError as ex: + if is_homogenous: + pytest.xfail("Splitting a homogneous y failed as expected") + else: + raise ex + else: + if is_homogenous: + pytest.fail("Splitting a homogneous y passed unexpectedly") + + threshold_exp = float(self.X_1D[ix, 0]) + assert best.threshold == threshold_exp + + @pytest.mark.parametrize( + "y,ix,measure_name,g,h", + [ + (y_1class, None, "gini", None, None), + (y_2class, 2, "gini", None, None), + (y_3class, 2, "gini", None, None), + (y_1class, None, "entropy", None, None), + (y_2class, 2, "entropy", None, None), + (y_3class, 2, "entropy", None, None), + (y_1reg, None, "variance", None, None), + (y_2reg, 2, "variance", None, None), + (y_3reg, 2, "variance", None, None), + (y_1reg, None, "xgboost", g_1reg, h_1reg), + (y_2reg, 2, "xgboost", g_2reg, h_2reg), + (y_3reg, 2, "xgboost", g_3reg, h_3reg), + # (y_1class, None, "xgboost", g_1class, h_1class), # currently not handled + (y_2class, 2, "xgboost", g_2class, h_2class), + # (y_3class, 2, "xgboost", g_3class, h_3class), # currently not handled + ], + ) + def test_1d_missing( + self, + y: np.ndarray, + ix: int, + measure_name: str, + g: np.ndarray, + h: np.ndarray, + ): + is_homogenous = len(np.unique(y)) == 1 + grow_params = TreeGrowthParameters(max_depth=2) + try: + # line to test + best = find_best_split( + self.X_1D_missing, + y, + measure_name=measure_name, + g=g, + h=h, + growth_params=grow_params, + ) + except ValueError as ex: + if is_homogenous: + pytest.xfail("Splitting a homogneous y failed as expected") + else: + raise ex + else: + if is_homogenous: + pytest.fail("Splitting a homogneous y passed unexpectedly") + + threshold_exp = float(self.X_1D_missing[ix, 0]) + assert best.threshold == threshold_exp + + @pytest.mark.parametrize( + "y,ix,measure_name,g,h", + [ + (y_1class, None, "gini", None, None), + (y_2class, 2, "gini", None, None), + (y_3class, 2, "gini", None, None), + (y_1class, None, "entropy", None, None), + (y_2class, 2, "entropy", None, None), + (y_3class, 2, "entropy", None, None), + (y_1reg, None, "variance", None, None), + (y_2reg, 2, "variance", None, None), + (y_3reg, 2, "variance", None, None), + (y_1reg, None, "xgboost", g_1reg, h_1reg), + (y_2reg, 2, "xgboost", g_2reg, h_2reg), + (y_3reg, 2, "xgboost", g_3reg, h_3reg), + # (y_1class, None, "xgboost", g_1class, h_1class), # currently not handled + (y_2class, 2, "xgboost", g_2class, h_2class), + # (y_3class, 2, "xgboost", g_3class, h_3class), # currently not handled + ], + ) + def test_2d( + self, + y: np.ndarray, + ix: int, + measure_name: str, + g: np.ndarray, + h: np.ndarray, + ): + is_homogenous = len(np.unique(y)) == 1 + growth_params = TreeGrowthParameters(max_depth=2) + try: + # line to test + best = find_best_split( + self.X_2D, + y, + measure_name, + g=g, + h=h, + growth_params=growth_params, + ) + except ValueError as ex: + if is_homogenous: + pytest.xfail("Splitting a homogneous y failed as expected") + else: + raise ex + else: + if is_homogenous: + pytest.fail("Splitting a homogneous y passed unexpectedly") + + assert best.column == 1 + threshold_exp = float(self.X_2D[ix, 1]) + assert best.threshold == threshold_exp + + @pytest.mark.parametrize( + "y,ix,measure_name,g,h", + [ + (y_1class, None, "gini", None, None), + (y_2class, 2, "gini", None, None), + (y_3class, 2, "gini", None, None), + (y_1class, None, "entropy", None, None), + (y_2class, 2, "entropy", None, None), + (y_3class, 2, "entropy", None, None), + (y_1reg, None, "variance", None, None), + (y_2reg, 2, "variance", None, None), + (y_3reg, 2, "variance", None, None), + (y_1reg, None, "xgboost", g_1reg, h_1reg), + (y_2reg, 2, "xgboost", g_2reg, h_2reg), + (y_3reg, 2, "xgboost", g_3reg, h_3reg), + # (y_1class, None, "xgboost", g_1class, h_1class), # currently not handled + (y_2class, 2, "xgboost", g_2class, h_2class), + # (y_3class, 2, "xgboost", g_3class, h_3class), # currently not handled + ], + ) + def test_2d_missing( + self, + y: np.ndarray, + ix: int, + measure_name: str, + g: np.ndarray, + h: np.ndarray, + ): + is_homogenous = len(np.unique(y)) == 1 + growth_params = TreeGrowthParameters(max_depth=2) + try: + # line to test + best = find_best_split( + self.X_2D_missing, + y, + measure_name, + g=g, + h=h, + growth_params=growth_params, + ) + except ValueError as ex: + if is_homogenous: + pytest.xfail("Splitting a homogneous y failed as expected") + else: + raise ex + else: + if is_homogenous: + pytest.fail("Splitting a homogneous y passed unexpectedly") + + assert best.column == 1 + threshold_exp = float(self.X_2D_missing[ix, 1]) + assert best.threshold == threshold_exp + + +@pytest.mark.parametrize( + "best,parent_node,growth_params,is_no_sensible_split_exp", + [ + # parent is None #1 + ( + BestSplit(score=-1.0, column=0, threshold=0.0, target_groups=np.array([])), + None, + TreeGrowthParameters(max_depth=2), + False, + ), + # parent is None #2 + ( + BestSplit(score=-1.0, column=0, threshold=0.0, target_groups=np.array([])), + Node(measure=SplitScore("bla")), + TreeGrowthParameters(max_depth=2), + False, + ), + # split is sufficient + ( + BestSplit( + score=-1.0, + column=0, + threshold=0.0, + target_groups=np.array([False, True]), + ), + Node(measure=SplitScore("bla", value=-1.1)), + TreeGrowthParameters(max_depth=2, min_improvement=0.01), + False, + ), + # split is insufficient - because min gain not exceeded + ( + BestSplit( + score=-1.0, + column=0, + threshold=0.0, + target_groups=np.array([False, True]), + ), + Node(measure=SplitScore("bla", value=-1.1)), + TreeGrowthParameters(max_depth=2, min_improvement=0.2), + True, + ), + # split is insufficient - because all items sorted left + ( + BestSplit( + score=-1.0, + column=0, + threshold=0.0, + target_groups=np.array([True, True]), + ), + Node(measure=SplitScore("bla", value=-1.1)), + TreeGrowthParameters(max_depth=2, min_improvement=0.0), + True, + ), + # split is insufficient - because all items sorted right + ( + BestSplit( + score=-1.0, + column=0, + threshold=0.0, + target_groups=np.array([False, False]), + ), + Node(measure=SplitScore("bla", value=-1.1)), + TreeGrowthParameters(max_depth=2, min_improvement=0.0), + True, + ), + ], +) +def test_check_if_split_sensible( + best: BestSplit, + parent_node: Node, + growth_params: TreeGrowthParameters, + is_no_sensible_split_exp: bool, +): + # line to test + is_not_sensible_split, gain = check_if_split_sensible( + best, parent_node, growth_params + ) + + assert is_not_sensible_split == is_no_sensible_split_exp + if parent_node is None or parent_node.measure.value is None: # type: ignore + assert gain is None + + +@pytest.mark.parametrize("go_left", [True, False]) +def test_select_arrays_for_child_node(go_left: bool): + best = BestSplit( + score=1.0, + column=0, + threshold=2.0, + target_groups=np.array([True, True, False]), + ) + + X = np.array([[1], [2], [3]]) + y = np.array([True, True, False]) + g = np.array([1, 2, 3]) + h = np.array([4, 5, 6]) + + # line to test + _X, _y, _g, _h = select_arrays_for_child_node( + go_left=go_left, + best=best, + X=X, + y=y, + g=g, + h=h, + ) + assert _g is not None + assert _h is not None + if go_left: + assert np.allclose(_X, X[:2]) + assert np.allclose(_y, y[:2]) + assert np.allclose(_g, g[:2]) + assert np.allclose(_h, h[:2]) + else: + assert np.allclose(_X, X[2:]) + assert np.allclose(_y, y[2:]) + assert np.allclose(_g, g[2:]) + assert np.allclose(_h, h[2:]) diff --git a/tests/decisiontree/test_split_objects.py b/tests/decisiontree/test_split_objects.py new file mode 100644 index 0000000..e6942f9 --- /dev/null +++ b/tests/decisiontree/test_split_objects.py @@ -0,0 +1,37 @@ +import pytest +from pydantic import ValidationError + +from random_tree_models.decisiontree.split_objects import SplitScore + +from .conftest import FLOAT_OPTIONS_NONE_OKAY, STR_OPTIONS_NONE_NOT_OKAY + + +@pytest.mark.parametrize( + "name,value", + [ + (name, value) + for name in STR_OPTIONS_NONE_NOT_OKAY + for value in FLOAT_OPTIONS_NONE_OKAY + ], +) +def test_SplitScore(name, value): + name, name_okay = name + value, value_okay = value + is_okay = name_okay and value_okay + is_bad = not is_okay + try: + # line to test + measure = SplitScore(name=name, value=value) + except ValidationError as ex: + if is_okay: + raise ValueError(f"whoops {name=} {value=} failed with {ex}") + else: + pytest.xfail("SplitScore validation failed as expected") + else: + if is_bad: + pytest.fail( + f"SplitScore test unexpectedly passed for {name=}, {value=}, {name_okay=}, {value_okay=}, {is_okay=}" + ) + + assert hasattr(measure, "name") + assert hasattr(measure, "value") diff --git a/tests/decisiontree/test_train.py b/tests/decisiontree/test_train.py new file mode 100644 index 0000000..ef5fb5b --- /dev/null +++ b/tests/decisiontree/test_train.py @@ -0,0 +1,98 @@ +import numpy as np +import pytest +from dirty_equals import IsApprox +from inline_snapshot import snapshot + +from random_tree_models.decisiontree.node import Node +from random_tree_models.decisiontree.split_objects import SplitScore +from random_tree_models.decisiontree.train import ( + calc_leaf_weight_and_split_score, + check_is_baselevel, + grow_tree, +) +from random_tree_models.params import MetricNames, TreeGrowthParameters + + +@pytest.mark.parametrize( + "y, depths", + [ + (y, depths) + for y in [(np.array([1, 2]), False), (np.array([]), True)] + for depths in [(1, 2, False), (2, 2, True), (3, 2, True)] + ], +) +def test_check_is_baselevel(y, depths): + y, is_baselevel_exp_y = y + depth, max_depth, is_baselevel_exp_depth = depths + is_baselevel_exp = is_baselevel_exp_depth or is_baselevel_exp_y + + # line to test + is_baselevel, msg = check_is_baselevel(y, depth=depth, max_depth=max_depth) + + assert is_baselevel == is_baselevel_exp + assert isinstance(msg, str) + + +def test_calc_leaf_weight_and_split_score(): + y = np.array([True, True, False]) + measure_name = MetricNames.gini + growth_params = TreeGrowthParameters(max_depth=2) + g = np.array([1, 2, 3]) + h = np.array([4, 5, 6]) + + # line to test + leaf_weight, split_score = calc_leaf_weight_and_split_score( + y, measure_name, growth_params, g, h + ) + + assert leaf_weight == IsApprox(0.6666666666666666) + assert split_score == IsApprox(-0.4444444444444445) + + +class Test_grow_tree: + X = np.array([[1], [2], [3]]) + y = np.array([True, True, False]) + target_groups = np.array([True, True, False]) + measure_name = MetricNames.gini + depth_dummy = 0 + + def test_baselevel(self): + # test returned leaf node + growth_params = TreeGrowthParameters(max_depth=2) + parent_node = None + + # line to test + leaf_node = grow_tree( + self.X, + self.y, + self.measure_name, + growth_params=growth_params, + parent_node=parent_node, + depth=self.depth_dummy, + ) + + assert leaf_node == snapshot( + Node( + array_column=0, + threshold=3.0, + prediction=0.6666666666666666, + right=Node( + prediction=0.0, + measure=SplitScore(name="gini", value=0.0), + n_obs=1, + reason="homogenous group", + depth=1, + ), + left=Node( + prediction=1.0, + measure=SplitScore(name="gini", value=0.0), + n_obs=2, + reason="homogenous group", + depth=1, + ), + measure=SplitScore(name="gini", value=0.0), + n_obs=3, + reason="", + depth=0, + ) + ) diff --git a/tests/decisiontree/test_visualize.py b/tests/decisiontree/test_visualize.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_decisiontree.py b/tests/test_decisiontree.py deleted file mode 100644 index 635085c..0000000 --- a/tests/test_decisiontree.py +++ /dev/null @@ -1,1320 +0,0 @@ -import types -from unittest.mock import patch - -import numpy as np -import pytest -from pydantic import ValidationError -from scipy import stats -from sklearn.utils.estimator_checks import parametrize_with_checks - -import random_tree_models.decisiontree as dtree -import random_tree_models.utils as utils -from random_tree_models import scoring -from random_tree_models.scoring import MetricNames -from random_tree_models.utils import ThresholdSelectionMethod -from tests.conftest import expected_failed_checks - -# first value in each tuple is the value to test and the second is the flag indicating if this should work -BOOL_OPTIONS_NONE_OKAY = [(False, True), (True, True), ("blub", False)] -INT_OPTIONS_NONE_OKAY = [(0, True), (None, True), ("blub", False)] -INT_OPTIONS_NONE_NOT_OKAY = [(0, True), (None, False), ("blub", False)] -FLOAT_OPTIONS_NONE_OKAY = [ - (-1.0, True), - (None, True), - ("blub", False), -] -FLOAT_OPTIONS_NONE_NOT_OKAY = [ - (-1.0, True), - (None, False), - ("blub", False), -] -NODE_OPTIONS_NONE_OKAY = [ - (dtree.Node(), True), - (None, True), - ("blub", False), -] -STR_OPTIONS_NONE_OKAY = [("blub", True), (None, True), (1.0, False)] -STR_OPTIONS_NONE_NOT_OKAY = [ - ("blub", True), - (None, False), - (1, False), - (1.0, False), -] - - -@pytest.mark.parametrize( - "name,value", - [ - (name, value) - for name in STR_OPTIONS_NONE_NOT_OKAY - for value in FLOAT_OPTIONS_NONE_OKAY - ], -) -def test_SplitScore(name, value): - name, name_okay = name - value, value_okay = value - is_okay = name_okay and value_okay - is_bad = not is_okay - try: - # line to test - measure = dtree.SplitScore(name=name, value=value) - except ValidationError as ex: - if is_okay: - raise ValueError(f"whoops {name=} {value=} failed with {ex}") - else: - pytest.xfail("SplitScore validation failed as expected") - else: - if is_bad: - pytest.fail( - f"SplitScore test unexpectedly passed for {name=}, {value=}, {name_okay=}, {value_okay=}, {is_okay=}" - ) - - assert hasattr(measure, "name") - assert hasattr(measure, "value") - - -@pytest.mark.parametrize( - "int_val, float_val, node_val, str_val, bool_val", - [ - (int_val, float_val, node_val, str_val, bool_val) - for int_val in INT_OPTIONS_NONE_OKAY - for float_val in FLOAT_OPTIONS_NONE_OKAY - for node_val in NODE_OPTIONS_NONE_OKAY - for str_val in STR_OPTIONS_NONE_OKAY - for bool_val in BOOL_OPTIONS_NONE_OKAY - ], -) -def test_Node(int_val, float_val, node_val, str_val, bool_val): - array_column, array_column_okay = int_val - threshold, threshold_okay = float_val - prediction, prediction_okay = float_val - left, left_okay = node_val - right, right_okay = node_val - n_obs, n_obs_okay = int_val - reason, reason_okay = str_val - default_is_left, default_is_left_okay = bool_val - - is_okay = all( - [ - array_column_okay, - threshold_okay, - prediction_okay, - left_okay, - right_okay, - n_obs_okay, - reason_okay, - default_is_left_okay, - ] - ) - measure = dtree.SplitScore(name="blub", value=1.0) - try: - # line to test - node = dtree.Node( - array_column=array_column, - threshold=threshold, - prediction=prediction, - default_is_left=default_is_left, - left=left, - right=right, - measure=measure, - n_obs=n_obs, - reason=reason, - ) - except ValidationError as ex: - if is_okay: - raise ex - else: - pytest.xfail("SplitScore validation failed as expected") - else: - for att in [ - "array_column", - "threshold", - "prediction", - "default_is_left", - "left", - "right", - "measure", - "n_obs", - "reason", - "node_id", - ]: - assert hasattr(node, att), f"{att=} missing in Node" - assert node.is_leaf == ((left is None) and (right is None)), ( - f"left: {left is None} right: {right is None}" - ) - - -@pytest.mark.parametrize( - "y, depths", - [ - (y, depths) - for y in [(np.array([1, 2]), False), (np.array([]), True)] - for depths in [(1, 2, False), (2, 2, True), (3, 2, True)] - ], -) -def test_check_is_baselevel(y, depths): - node = dtree.Node() - - y, is_baselevel_exp_y = y - depth, max_depth, is_baselevel_exp_depth = depths - is_baselevel_exp = is_baselevel_exp_depth or is_baselevel_exp_y - - # line to test - is_baselevel, msg = dtree.check_is_baselevel(y, depth=depth, max_depth=max_depth) - - assert is_baselevel == is_baselevel_exp - assert isinstance(msg, str) - - -@pytest.mark.parametrize( - "score,column,threshold,target_groups,default_is_left", - [ - (score, column, threshold, target_groups, default_is_left) - for score in FLOAT_OPTIONS_NONE_NOT_OKAY - for column in INT_OPTIONS_NONE_NOT_OKAY - for threshold in FLOAT_OPTIONS_NONE_NOT_OKAY - for target_groups in [ - (np.array([1, 2, 3]), True), - (np.array([]), True), - (None, False), - ] - for default_is_left in BOOL_OPTIONS_NONE_OKAY - ], -) -def test_BestSplit(score, column, threshold, target_groups, default_is_left): - score, score_okay = score - column, column_okay = column - threshold, threshold_okay = threshold - target_groups, target_groups_okay = target_groups - default_is_left, default_is_left_okay = default_is_left - - is_okay = all( - [ - score_okay, - column_okay, - threshold_okay, - target_groups_okay, - default_is_left_okay, - ] - ) - is_bad = not is_okay - - try: - # line to test - best = dtree.BestSplit( - score=score, - column=column, - threshold=threshold, - target_groups=target_groups, - default_is_left=default_is_left, - ) - except ValidationError as ex: - if is_okay: - raise ex - else: - pytest.xfail("BestSplit validation failed as expected") - else: - if is_bad: - pytest.fail( - f"BestSplit validation did pass unexpectedly with {score=}, {column=}, {threshold=}, {target_groups=}, {score_okay=}, {column_okay=}, {threshold_okay=}, {target_groups_okay=}, {is_bad=}" - ) - - assert hasattr(best, "score") - assert hasattr(best, "column") - assert hasattr(best, "threshold") - assert hasattr(best, "target_groups") - assert hasattr(best, "default_is_left") - - -class Test_select_thresholds: - """ - bruteforce: returns all possible thresholds from the 2nd onward - random: - * returns a random subset of the thresholds if n_thresholds smaller than avaliable values - * is reproducible with random_state - quantile: returns num_quantile_steps thresholds which are ordered - uniform: returns single value between min and max - """ - - def test_bruteforce(self): - params = utils.ThresholdSelectionParameters( - method=ThresholdSelectionMethod.bruteforce - ) - feature_values = np.linspace(-1, 1, 100) - rng = np.random.RandomState(42) - - # line to test - thresholds = dtree.select_thresholds(feature_values, params, rng=rng) - - assert np.allclose(thresholds, feature_values[1:]) - - def test_random_when_to_few_values(self): - params = utils.ThresholdSelectionParameters( - method=ThresholdSelectionMethod.random, n_thresholds=1000 - ) - feature_values = np.linspace(-1, 1, 100) - rng = np.random.RandomState(42) - - # line to test - thresholds = dtree.select_thresholds(feature_values, params, rng=rng) - - assert np.allclose(thresholds, feature_values[1:]) - - def test_random_when_enough_values(self): - n_thresholds = 10 - params = utils.ThresholdSelectionParameters( - method=ThresholdSelectionMethod.random, n_thresholds=n_thresholds - ) - feature_values = np.linspace(-1, 1, 100) - rng = np.random.RandomState(42) - - # line to test - thresholds0 = dtree.select_thresholds(feature_values, params, rng=rng) - - assert thresholds0.shape == (n_thresholds,) - assert np.unique(thresholds0).shape == (n_thresholds,) - - def test_random_reproducible(self): - n_thresholds = 10 - params = utils.ThresholdSelectionParameters( - method=ThresholdSelectionMethod.random, n_thresholds=n_thresholds - ) - feature_values = np.linspace(-1, 1, 100) - - # line to test - rng = np.random.RandomState(42) - thresholds0 = dtree.select_thresholds(feature_values, params, rng=rng) - rng = np.random.RandomState(42) - thresholds1 = dtree.select_thresholds(feature_values, params, rng=rng) - - assert np.allclose(thresholds0, thresholds1) - - def test_random_produces_changing_thresholds(self): - n_thresholds = 10 - params = utils.ThresholdSelectionParameters( - method=ThresholdSelectionMethod.random, n_thresholds=n_thresholds - ) - feature_values = np.linspace(-1, 1, 100) - rng = np.random.RandomState(42) - - # line to test - thresholds0 = dtree.select_thresholds(feature_values, params, rng=rng) - thresholds1 = dtree.select_thresholds(feature_values, params, rng=rng) - - assert not np.allclose(thresholds0, thresholds1) - - def test_quantile(self): - n_thresholds = 10 - params = utils.ThresholdSelectionParameters( - method=ThresholdSelectionMethod.quantile, - n_thresholds=n_thresholds, - quantile=0.1, - ) - feature_values = np.linspace(-1, 1, 100) - rng = np.random.RandomState(42) - - # line to test - thresholds = dtree.select_thresholds(feature_values, params, rng=rng) - - assert thresholds.shape == (11,) - assert (thresholds[1:] > thresholds[:-1]).all() - - def test_uniform(self): - n_thresholds = 10 - params = utils.ThresholdSelectionParameters( - method=ThresholdSelectionMethod.uniform, n_thresholds=n_thresholds - ) - rng = np.random.RandomState(42) - feature_values = rng.normal(loc=0, scale=1, size=100) - - # line to test - thresholds = dtree.select_thresholds(feature_values, params, rng=rng) - - assert thresholds.shape == (1,) - assert thresholds[0] >= feature_values.min() - assert thresholds[0] <= feature_values.max() - - -class Test_get_thresholds_and_target_groups: - """ - * preduces a generator - * produces twice as many items to iterate in the case of missing values - * each item contains the current threshold, the target groups and a boolean that indicates the default direction - * the default direction is always None if there are no missing values and otherwise boolean - """ - - def test_produces_generator(self): - feature_values = np.linspace(-1, 1, 10) - threshold_params = utils.ThresholdSelectionParameters( - method=ThresholdSelectionMethod.bruteforce - ) - rng = np.random.RandomState(42) - - # line to test - gen = dtree.get_thresholds_and_target_groups( - feature_values, threshold_params, rng=rng - ) - - assert isinstance(gen, types.GeneratorType) - - def test_finite_only_case(self): - feature_values = np.linspace(-1, 1, 10) - threshold_params = utils.ThresholdSelectionParameters( - method=ThresholdSelectionMethod.bruteforce - ) - rng = np.random.RandomState(42) - - # line to test - thresholds_and_target_groups = dtree.get_thresholds_and_target_groups( - feature_values, threshold_params, rng=rng - ) - - c = 0 - for ( - threshold, - target_groups, - default_direction_is_left, - ) in thresholds_and_target_groups: - assert isinstance(target_groups, np.ndarray) - assert threshold in feature_values[1:] - assert target_groups.dtype == bool - assert default_direction_is_left is None - c += 1 - - assert c == len(feature_values[1:]) - - def test_with_missing_case(self): - feature_values = np.linspace(-1, 1, 10) - feature_values[5] = np.nan - threshold_params = utils.ThresholdSelectionParameters( - method=ThresholdSelectionMethod.bruteforce - ) - rng = np.random.RandomState(42) - - thresholds_and_target_groups = dtree.get_thresholds_and_target_groups( - feature_values, threshold_params, rng=rng - ) - - # line to test - c = 0 - for ( - threshold, - target_groups, - default_direction_is_left, - ) in thresholds_and_target_groups: - assert isinstance(target_groups, np.ndarray) - assert threshold in feature_values[1:] - assert target_groups.dtype == bool - assert default_direction_is_left in [True, False] - c += 1 - - assert c == 2 * (len(feature_values[1:]) - 1) - - -class Test_get_column: - """ - * method ascending just returns ascending integer list for columns - * method random returns random integer list for columns - * method largest_delta returns column indices with largest feature max-min differences first - * if n_columns_to_try is given it is used to shorted the returned list - """ - - def test_ascending(self): - n_columns = 10 - n_trials = None - column_params = utils.ColumnSelectionParameters( - method=utils.ColumnSelectionMethod.ascending, n_trials=n_trials - ) - X = np.random.normal(size=(100, n_columns)) - rng = np.random.RandomState(42) - - # line to test - columns = dtree.get_column(X, column_params, rng=rng) - - assert columns == list(range(n_columns)) - - def test_ascending_first_n_trials_columns(self): - n_columns = 10 - n_trials = 5 - column_params = utils.ColumnSelectionParameters( - method=utils.ColumnSelectionMethod.ascending, n_trials=n_trials - ) - X = np.random.normal(size=(100, n_columns)) - rng = np.random.RandomState(42) - - # line to test - columns = dtree.get_column(X, column_params, rng=rng) - - assert columns == list(range(n_trials)) - - def test_random(self): - n_columns = 10 - n_trials = None - column_params = utils.ColumnSelectionParameters( - method=utils.ColumnSelectionMethod.random, n_trials=n_trials - ) - X = np.random.normal(size=(100, n_columns)) - rng = np.random.RandomState(42) - - # line to test - columns = dtree.get_column(X, column_params, rng=rng) - - assert not all([i0 < i1 for i0, i1 in zip(columns[:-1], columns[1:])]) - assert sorted(columns) == list(range(n_columns)) - - def test_random_is_reproducible(self): - n_columns = 10 - n_trials = None - column_params = utils.ColumnSelectionParameters( - method=utils.ColumnSelectionMethod.random, n_trials=n_trials - ) - X = np.random.normal(size=(100, n_columns)) - - # line to test - rng = np.random.RandomState(42) - columns0 = dtree.get_column(X, column_params, rng=rng) - rng = np.random.RandomState(42) - columns1 = dtree.get_column(X, column_params, rng=rng) - - assert columns0 == columns1 - - def test_largest_delta(self): - n_columns = 5 - n_trials = None - column_params = utils.ColumnSelectionParameters( - method=utils.ColumnSelectionMethod.largest_delta, n_trials=n_trials - ) - rng = np.random.RandomState(42) - X = np.array([[0, 0.001], [0, 0.01], [0, 0.1], [0, 1.0], [0, 10.0]]).T - - n_repetitions = 100 - all_columns = np.zeros((n_repetitions, n_columns), dtype=int) - - for i in range(n_repetitions): - # line to test - all_columns[i, :] = dtree.get_column(X, column_params, rng=rng) - - assert np.allclose(stats.mode(all_columns, axis=0).mode, [4, 3, 2, 1, 0]) - - -class Test_find_best_split: - """ - cases to test for all measure_name values: - * simple & 1d is split as expected - * classification: y = 1 class, y = 2 classes, y = 3 classes - * regression: y = 1 value, y = 2 values, y = 3 values where 2 are more similar - * simple & 2d is split as expected - * same as 1d but 1st column useless and 2nd contains the needed info - """ - - X_1D = np.array( - [ - [ - 1, - ], - [ - 2, - ], - [ - 3, - ], - [ - 4, - ], - ] - ) - - X_1D_missing = np.array( - [ - [ - 1, - ], - [ - np.nan, - ], - [ - 3, - ], - [ - 4, - ], - ] - ) - - X_2D = np.hstack((np.ones_like(X_1D), X_1D)) - X_2D_missing = np.hstack((np.ones_like(X_1D_missing), X_1D_missing)) - - y_1class = np.ones(X_1D.shape[0], dtype=bool) - y_2class = np.array([False, False, True, True]) - y_3class = np.array([0, 0, 1, 2]) - - y_1reg = np.ones(X_1D.shape[0]) - y_2reg = np.array([-1.0, -1.0, 1.0, 1.0]) - y_3reg = np.array([-1.0, -0.9, 1.0, 2.0]) - - # xgboost - least squares - g_1reg = np.array([0.0, 0.0, 0.0, 0.0]) - g_2reg = np.array([-1.0, -1.0, 1.0, 1.0]) - g_3reg = np.array([-1.275, -1.175, 0.725, 1.725]) - - h_1reg = np.array([-1.0, -1.0, -1.0, -1.0]) - h_2reg = np.array([-1.0, -1.0, -1.0, -1.0]) - h_3reg = np.array([-1.0, -1.0, -1.0, -1.0]) - - # xgboost - binomial log-likelihood - g_2class = np.array([-1.0, -1.0, 1.0, 1.0]) - h_2class = np.array([-1.0, -1.0, -1.0, -1.0]) - - @pytest.mark.parametrize( - "y,ix,measure_name,g,h", - [ - (y_1class, None, "gini", None, None), - (y_2class, 2, "gini", None, None), - (y_3class, 2, "gini", None, None), - (y_1class, None, "entropy", None, None), - (y_2class, 2, "entropy", None, None), - (y_3class, 2, "entropy", None, None), - (y_1reg, None, "variance", None, None), - (y_2reg, 2, "variance", None, None), - (y_3reg, 2, "variance", None, None), - (y_1reg, None, "xgboost", g_1reg, h_1reg), - (y_2reg, 2, "xgboost", g_2reg, h_2reg), - (y_3reg, 2, "xgboost", g_3reg, h_3reg), - # (y_1class, None, "xgboost", g_1class, h_1class), # currently not handled - (y_2class, 2, "xgboost", g_2class, h_2class), - # (y_3class, 2, "xgboost", g_3class, h_3class), # currently not handled - ], - ) - def test_1d( - self, - y: np.ndarray, - ix: int, - measure_name: str, - g: np.ndarray, - h: np.ndarray, - ): - is_homogenous = len(np.unique(y)) == 1 - grow_params = utils.TreeGrowthParameters(max_depth=2) - try: - # line to test - best = dtree.find_best_split( - self.X_1D, - y, - measure_name=measure_name, - g=g, - h=h, - growth_params=grow_params, - ) - except ValueError as ex: - if is_homogenous: - pytest.xfail("Splitting a homogneous y failed as expected") - else: - raise ex - else: - if is_homogenous: - pytest.fail("Splitting a homogneous y passed unexpectedly") - - threshold_exp = float(self.X_1D[ix, 0]) - assert best.threshold == threshold_exp - - @pytest.mark.parametrize( - "y,ix,measure_name,g,h", - [ - (y_1class, None, "gini", None, None), - (y_2class, 2, "gini", None, None), - (y_3class, 2, "gini", None, None), - (y_1class, None, "entropy", None, None), - (y_2class, 2, "entropy", None, None), - (y_3class, 2, "entropy", None, None), - (y_1reg, None, "variance", None, None), - (y_2reg, 2, "variance", None, None), - (y_3reg, 2, "variance", None, None), - (y_1reg, None, "xgboost", g_1reg, h_1reg), - (y_2reg, 2, "xgboost", g_2reg, h_2reg), - (y_3reg, 2, "xgboost", g_3reg, h_3reg), - # (y_1class, None, "xgboost", g_1class, h_1class), # currently not handled - (y_2class, 2, "xgboost", g_2class, h_2class), - # (y_3class, 2, "xgboost", g_3class, h_3class), # currently not handled - ], - ) - def test_1d_missing( - self, - y: np.ndarray, - ix: int, - measure_name: str, - g: np.ndarray, - h: np.ndarray, - ): - is_homogenous = len(np.unique(y)) == 1 - grow_params = utils.TreeGrowthParameters(max_depth=2) - try: - # line to test - best = dtree.find_best_split( - self.X_1D_missing, - y, - measure_name=measure_name, - g=g, - h=h, - growth_params=grow_params, - ) - except ValueError as ex: - if is_homogenous: - pytest.xfail("Splitting a homogneous y failed as expected") - else: - raise ex - else: - if is_homogenous: - pytest.fail("Splitting a homogneous y passed unexpectedly") - - threshold_exp = float(self.X_1D_missing[ix, 0]) - assert best.threshold == threshold_exp - - @pytest.mark.parametrize( - "y,ix,measure_name,g,h", - [ - (y_1class, None, "gini", None, None), - (y_2class, 2, "gini", None, None), - (y_3class, 2, "gini", None, None), - (y_1class, None, "entropy", None, None), - (y_2class, 2, "entropy", None, None), - (y_3class, 2, "entropy", None, None), - (y_1reg, None, "variance", None, None), - (y_2reg, 2, "variance", None, None), - (y_3reg, 2, "variance", None, None), - (y_1reg, None, "xgboost", g_1reg, h_1reg), - (y_2reg, 2, "xgboost", g_2reg, h_2reg), - (y_3reg, 2, "xgboost", g_3reg, h_3reg), - # (y_1class, None, "xgboost", g_1class, h_1class), # currently not handled - (y_2class, 2, "xgboost", g_2class, h_2class), - # (y_3class, 2, "xgboost", g_3class, h_3class), # currently not handled - ], - ) - def test_2d( - self, - y: np.ndarray, - ix: int, - measure_name: str, - g: np.ndarray, - h: np.ndarray, - ): - is_homogenous = len(np.unique(y)) == 1 - growth_params = utils.TreeGrowthParameters(max_depth=2) - try: - # line to test - best = dtree.find_best_split( - self.X_2D, - y, - measure_name, - g=g, - h=h, - growth_params=growth_params, - ) - except ValueError as ex: - if is_homogenous: - pytest.xfail("Splitting a homogneous y failed as expected") - else: - raise ex - else: - if is_homogenous: - pytest.fail("Splitting a homogneous y passed unexpectedly") - - assert best.column == 1 - threshold_exp = float(self.X_2D[ix, 1]) - assert best.threshold == threshold_exp - - @pytest.mark.parametrize( - "y,ix,measure_name,g,h", - [ - (y_1class, None, "gini", None, None), - (y_2class, 2, "gini", None, None), - (y_3class, 2, "gini", None, None), - (y_1class, None, "entropy", None, None), - (y_2class, 2, "entropy", None, None), - (y_3class, 2, "entropy", None, None), - (y_1reg, None, "variance", None, None), - (y_2reg, 2, "variance", None, None), - (y_3reg, 2, "variance", None, None), - (y_1reg, None, "xgboost", g_1reg, h_1reg), - (y_2reg, 2, "xgboost", g_2reg, h_2reg), - (y_3reg, 2, "xgboost", g_3reg, h_3reg), - # (y_1class, None, "xgboost", g_1class, h_1class), # currently not handled - (y_2class, 2, "xgboost", g_2class, h_2class), - # (y_3class, 2, "xgboost", g_3class, h_3class), # currently not handled - ], - ) - def test_2d_missing( - self, - y: np.ndarray, - ix: int, - measure_name: str, - g: np.ndarray, - h: np.ndarray, - ): - is_homogenous = len(np.unique(y)) == 1 - growth_params = utils.TreeGrowthParameters(max_depth=2) - try: - # line to test - best = dtree.find_best_split( - self.X_2D_missing, - y, - measure_name, - g=g, - h=h, - growth_params=growth_params, - ) - except ValueError as ex: - if is_homogenous: - pytest.xfail("Splitting a homogneous y failed as expected") - else: - raise ex - else: - if is_homogenous: - pytest.fail("Splitting a homogneous y passed unexpectedly") - - assert best.column == 1 - threshold_exp = float(self.X_2D_missing[ix, 1]) - assert best.threshold == threshold_exp - - -@pytest.mark.parametrize( - "best,parent_node,growth_params,is_no_sensible_split_exp", - [ - # parent is None #1 - ( - dtree.BestSplit( - score=-1.0, column=0, threshold=0.0, target_groups=np.array([]) - ), - None, - utils.TreeGrowthParameters(max_depth=2), - False, - ), - # parent is None #2 - ( - dtree.BestSplit( - score=-1.0, column=0, threshold=0.0, target_groups=np.array([]) - ), - dtree.Node(measure=dtree.SplitScore("bla")), - utils.TreeGrowthParameters(max_depth=2), - False, - ), - # split is sufficient - ( - dtree.BestSplit( - score=-1.0, - column=0, - threshold=0.0, - target_groups=np.array([False, True]), - ), - dtree.Node(measure=dtree.SplitScore("bla", value=-1.1)), - utils.TreeGrowthParameters(max_depth=2, min_improvement=0.01), - False, - ), - # split is insufficient - because min gain not exceeded - ( - dtree.BestSplit( - score=-1.0, - column=0, - threshold=0.0, - target_groups=np.array([False, True]), - ), - dtree.Node(measure=dtree.SplitScore("bla", value=-1.1)), - utils.TreeGrowthParameters(max_depth=2, min_improvement=0.2), - True, - ), - # split is insufficient - because all items sorted left - ( - dtree.BestSplit( - score=-1.0, - column=0, - threshold=0.0, - target_groups=np.array([True, True]), - ), - dtree.Node(measure=dtree.SplitScore("bla", value=-1.1)), - utils.TreeGrowthParameters(max_depth=2, min_improvement=0.0), - True, - ), - # split is insufficient - because all items sorted right - ( - dtree.BestSplit( - score=-1.0, - column=0, - threshold=0.0, - target_groups=np.array([False, False]), - ), - dtree.Node(measure=dtree.SplitScore("bla", value=-1.1)), - utils.TreeGrowthParameters(max_depth=2, min_improvement=0.0), - True, - ), - ], -) -def test_check_if_split_sensible( - best: dtree.BestSplit, - parent_node: dtree.Node, - growth_params: utils.TreeGrowthParameters, - is_no_sensible_split_exp: bool, -): - # line to test - is_not_sensible_split, gain = dtree.check_if_split_sensible( - best, parent_node, growth_params - ) - - assert is_not_sensible_split == is_no_sensible_split_exp - if parent_node is None or parent_node.measure.value is None: # type: ignore - assert gain is None - - -def test_calc_leaf_weight_and_split_score(): - # calls leafweights.calc_leaf_weight and scoreing.SplitScoreMetrics - # and returns two floats - y = np.array([True, True, False]) - measure_name = scoring.MetricNames.gini - growth_params = utils.TreeGrowthParameters(max_depth=2) - g = np.array([1, 2, 3]) - h = np.array([4, 5, 6]) - leaf_weight_exp = 1.0 - score_exp = 42.0 - with ( - patch( - "random_tree_models.decisiontree.leafweights.calc_leaf_weight", - return_value=leaf_weight_exp, - ) as mock_calc_leaf_weight, - patch( - "random_tree_models.decisiontree.scoring.calc_split_score", - return_value=score_exp, - ) as mock_SplitScoreMetrics, - ): - # line to test - leaf_weight, split_score = dtree.calc_leaf_weight_and_split_score( - y, measure_name, growth_params, g, h - ) - - assert leaf_weight == leaf_weight_exp - assert split_score == score_exp - assert mock_calc_leaf_weight.call_count == 1 - assert mock_SplitScoreMetrics.call_count == 1 - - -@pytest.mark.parametrize("go_left", [True, False]) -def test_select_arrays_for_child_node(go_left: bool): - best = dtree.BestSplit( - score=1.0, - column=0, - threshold=2.0, - target_groups=np.array([True, True, False]), - ) - - X = np.array([[1], [2], [3]]) - y = np.array([True, True, False]) - g = np.array([1, 2, 3]) - h = np.array([4, 5, 6]) - - # line to test - _X, _y, _g, _h = dtree.select_arrays_for_child_node( - go_left=go_left, - best=best, - X=X, - y=y, - g=g, - h=h, - ) - assert _g is not None - assert _h is not None - if go_left: - assert np.allclose(_X, X[:2]) - assert np.allclose(_y, y[:2]) - assert np.allclose(_g, g[:2]) - assert np.allclose(_h, h[:2]) - else: - assert np.allclose(_X, X[2:]) - assert np.allclose(_y, y[2:]) - assert np.allclose(_g, g[2:]) - assert np.allclose(_h, h[2:]) - - -class Test_grow_tree: - X = np.array([[1], [2], [3]]) - y = np.array([True, True, False]) - target_groups = np.array([True, True, False]) - measure_name = MetricNames.gini - depth_dummy = 0 - - def test_baselevel(self): - # test returned leaf node - growth_params = utils.TreeGrowthParameters(max_depth=2) - parent_node = None - is_baselevel = True - reason = "very custom leaf node comment" - with patch( - "random_tree_models.decisiontree.check_is_baselevel", - return_value=[is_baselevel, reason], - ) as mock_check_is_baselevel: - # line to test - leaf_node = dtree.grow_tree( - self.X, - self.y, - self.measure_name, - growth_params=growth_params, - parent_node=parent_node, - depth=self.depth_dummy, - ) - - mock_check_is_baselevel.assert_called_once() - assert leaf_node.is_leaf == True - assert leaf_node.reason == reason - - def test_split_improvement_insufficient(self): - # test split improvement below minimum - growth_params = utils.TreeGrowthParameters(max_depth=2, min_improvement=0.2) - parent_score = -1.0 - new_score = -0.9 - best = dtree.BestSplit( - score=new_score, - column=0, - threshold=3.0, - target_groups=self.target_groups, - ) - measure = dtree.SplitScore(self.measure_name, parent_score) - parent_node = dtree.Node( - array_column=0, - threshold=1.0, - prediction=0.9, - left=None, - right=None, - measure=measure, - n_obs=3, - reason="", - ) - is_baselevel = False - leaf_reason = "very custom leaf node comment" - gain = new_score - parent_score - split_reason = f"gain due split ({gain=}) lower than {growth_params.min_improvement=} or all data points assigned to one side (is left {best.target_groups.mean()=:.2%})" - with ( - patch( - "random_tree_models.decisiontree.check_is_baselevel", - return_value=[is_baselevel, leaf_reason], - ) as mock_check_is_baselevel, - patch( - "random_tree_models.decisiontree.find_best_split", - return_value=best, - ) as mock_find_best_split, - ): - # line to test - node = dtree.grow_tree( - self.X, - self.y, - self.measure_name, - growth_params=growth_params, - parent_node=parent_node, - depth=self.depth_dummy, - ) - - mock_check_is_baselevel.assert_called_once() - mock_find_best_split.assert_called_once() - assert node.reason == split_reason - assert node.prediction == np.mean(self.y) - assert node.n_obs == len(self.y) - - def test_split_improvement_sufficient(self): - # test split improvement above minumum, leading to two leaf nodes - growth_params = utils.TreeGrowthParameters(max_depth=2, min_improvement=0.0) - parent_score = -1.0 - new_score = -0.9 - best = dtree.BestSplit( - score=new_score, - column=0, - threshold=3.0, - target_groups=self.target_groups, - ) - measure = dtree.SplitScore(self.measure_name, parent_score) - parent_node = dtree.Node( - array_column=0, - threshold=1.0, - prediction=0.9, - left=None, - right=None, - measure=measure, - n_obs=3, - reason="", - ) - - leaf_reason = "very custom leaf node comment" - - with ( - patch( - "random_tree_models.decisiontree.check_is_baselevel", - side_effect=[ - (False, "bla"), - (True, leaf_reason), - (True, leaf_reason), - ], - ) as mock_check_is_baselevel, - patch( - "random_tree_models.decisiontree.find_best_split", - side_effect=[best], - ) as mock_find_best_split, - ): - # line to test - tree = dtree.grow_tree( - self.X, - self.y, - self.measure_name, - growth_params=growth_params, - parent_node=parent_node, - depth=self.depth_dummy, - ) - - assert mock_check_is_baselevel.call_count == 3 - assert mock_find_best_split.call_count == 1 - - # parent - assert tree.reason == "" - assert tree.prediction == np.mean(self.y) - assert tree.n_obs == len(self.y) - assert tree.is_leaf == False - - # left leaf - assert tree.left is not None - assert tree.left.reason == leaf_reason - assert tree.left.prediction == 1.0 - assert tree.left.n_obs == 2 - assert tree.left.is_leaf == True - - # right leaf - assert tree.right is not None - assert tree.right.reason == leaf_reason - assert tree.right.prediction == 0.0 - assert tree.right.n_obs == 1 - assert tree.right.is_leaf == True - - -@pytest.mark.parametrize( - "x,exp", - [ - (np.array([-1, -1]), 0.0), - (np.array([1, -1]), 1.0), - (np.array([1, 1]), 2.0), - (np.array([-1, 1]), 3.0), - ], -) -def test_find_leaf_node(x: np.ndarray, exp: float): - tree = dtree.Node( - array_column=0, - threshold=0.0, - left=dtree.Node( - array_column=1, - threshold=0.0, - left=dtree.Node(prediction=0.0), - right=dtree.Node(prediction=3.0), - ), - right=dtree.Node( - array_column=1, - threshold=0.0, - left=dtree.Node(prediction=1.0), - right=dtree.Node(prediction=2.0), - ), - ) - # line to test - leaf = dtree.find_leaf_node(tree, x) - - assert leaf.prediction == exp - - -def test_predict_with_tree(): - X = np.array( - [ - [-1.0, -1.0], - [1.0, -1.0], - [1.0, 1.0], - [-1.0, 1.0], - ] - ) - tree = dtree.Node( - array_column=0, - threshold=0.0, - left=dtree.Node( - array_column=1, - threshold=0.0, - left=dtree.Node(prediction=0.0), - right=dtree.Node(prediction=3.0), - ), - right=dtree.Node( - array_column=1, - threshold=0.0, - left=dtree.Node(prediction=1.0), - right=dtree.Node(prediction=2.0), - ), - ) - - # line to test - predictions = dtree.predict_with_tree(tree, X) - - assert np.allclose(predictions, np.arange(0, 4, 1)) - - -class TestDecisionTreeTemplate: - model = dtree.DecisionTreeTemplate(measure_name=MetricNames.entropy) - X = np.random.normal(size=(100, 10)) - y = np.random.normal(size=(100,)) - - def test_tree_(self): - assert not hasattr(self.model, "tree_") - - def test_growth_params_(self): - assert not hasattr(self.model, "growth_params_") - - self.model._organize_growth_parameters() - assert isinstance(self.model.growth_params_, utils.TreeGrowthParameters) - - def test_fit(self): - try: - self.model.fit(None, None) # type: ignore - except NotImplementedError as ex: - pytest.xfail("DecisionTreeTemplate.fit expectedly refused call") - - def test_predict(self): - try: - self.model.predict(None) # type: ignore - except NotImplementedError as ex: - pytest.xfail("DecisionTreeTemplate.predict expectedly refused call") - - def test_select_samples_and_features_no_sampling(self): - self.model.frac_features = 1.0 - self.model.frac_subsamples = 1.0 - self.model._organize_growth_parameters() - - # line to test - X, y, ix_features = self.model._select_samples_and_features(self.X, self.y) - - assert np.allclose(X, self.X) - assert np.allclose(y, self.y) - assert np.allclose(ix_features, np.arange(0, self.X.shape[1], 1)) - - def test_select_samples_and_features_with_column_sampling(self): - self.model.frac_features = 0.5 - self.model.frac_subsamples = 1.0 - self.model._organize_growth_parameters() - - # line to test - X, y, ix_features = self.model._select_samples_and_features(self.X, self.y) - - assert np.isclose( - X.shape[1], self.X.shape[1] * self.model.frac_features, atol=1 - ) - assert np.isclose(y.shape[0], self.y.shape[0]) - assert all([ix in np.arange(0, self.X.shape[1], 1) for ix in ix_features]) - - def test_select_samples_and_features_with_row_sampling(self): - self.model.frac_features = 1.0 - self.model.frac_subsamples = 0.5 - self.model._organize_growth_parameters() - - # line to test - X, y, ix_features = self.model._select_samples_and_features(self.X, self.y) - - assert np.isclose(X.shape[0], self.X.shape[0] * self.model.frac_subsamples) - assert np.isclose(y.shape[0], self.y.shape[0] * self.model.frac_subsamples) - assert np.allclose(ix_features, np.arange(0, self.X.shape[1], 1)) - - def test_select_samples_and_features_with_column_and_row_sampling(self): - self.model.frac_features = 0.5 - self.model.frac_subsamples = 0.5 - self.model._organize_growth_parameters() - - # line to test - X, y, ix_features = self.model._select_samples_and_features(self.X, self.y) - - assert np.isclose( - X.shape[1], self.X.shape[1] * self.model.frac_features, atol=1 - ) - assert np.isclose(X.shape[0], self.X.shape[0] * self.model.frac_subsamples) - assert np.isclose(y.shape[0], self.y.shape[0] * self.model.frac_subsamples) - assert all([ix in np.arange(0, self.X.shape[1], 1) for ix in ix_features]) - - def test_select_samples_and_features_sampling_reproducibility(self): - self.model.frac_features = 0.5 - self.model.frac_subsamples = 0.5 - self.model._organize_growth_parameters() - - # line to test - X0, y0, ix_features0 = self.model._select_samples_and_features(self.X, self.y) - X1, y1, ix_features1 = self.model._select_samples_and_features(self.X, self.y) - - assert np.allclose(X0, X1) - assert np.allclose(y0, y1) - assert np.allclose(ix_features0, ix_features1) - - def test_select_features(self): - ix_features = np.arange(0, self.X.shape[1], 1) - _X = self.model._select_features(self.X, ix_features) - assert np.allclose(_X, self.X) - - ix_features = np.array([0, 1, 2]) - _X = self.model._select_features(self.X, ix_features) - assert _X.shape[1] == 3 - - -class TestDecisionTreeRegressor: - model = dtree.DecisionTreeRegressor() - - X = np.array( - [ - [-1, -1], - [1, -1], - [1, 1], - [-1, 1], - ] - ) - y = np.array([0.0, 0.0, 1.0, 1.0]) - - def test_fit(self): - model = dtree.DecisionTreeRegressor() - model.fit(self.X, self.y) - assert isinstance(model.tree_, dtree.Node) - - def test_predict(self): - model = dtree.DecisionTreeRegressor() - model.fit(self.X, self.y) - predictions = model.predict(self.X) - assert np.allclose(predictions, self.y) - - -class TestDecisionTreeClassifier: - model = dtree.DecisionTreeClassifier() - - X = np.array( - [ - [-1, -1], - [1, -1], - [1, 1], - [-1, 1], - ] - ) - y = np.array([False, False, True, True]) - - def test_classes_(self): - assert not hasattr(self.model, "classes_") - - def test_fit(self): - model = dtree.DecisionTreeClassifier() - model.fit(self.X, self.y) - assert not hasattr(self.model, "classes_") - assert isinstance(model.tree_, dtree.Node) - - def test_predict(self): - model = dtree.DecisionTreeClassifier() - model.fit(self.X, self.y) - predictions = model.predict(self.X) - assert (predictions == self.y).all() - - -@parametrize_with_checks( - [dtree.DecisionTreeRegressor(), dtree.DecisionTreeClassifier()], - expected_failed_checks=expected_failed_checks, # type: ignore -) -def test_dtree_estimators_with_sklearn_checks(estimator, check): - """Test of estimators using scikit-learn test suite - - Reference: https://scikit-learn.org/stable/modules/generated/sklearn.utils.estimator_checks.parametrize_with_checks.html#sklearn.utils.estimator_checks.parametrize_with_checks - """ - - check(estimator) diff --git a/tests/test_extratrees.py b/tests/test_extratrees.py index 7bbce23..ed82edf 100644 --- a/tests/test_extratrees.py +++ b/tests/test_extratrees.py @@ -4,7 +4,7 @@ import random_tree_models.decisiontree as dtree import random_tree_models.extratrees as et -from random_tree_models.scoring import MetricNames +from random_tree_models.params import MetricNames from tests.conftest import expected_failed_checks diff --git a/tests/test_isolationforest.py b/tests/test_isolationforest.py index a1d30a8..c127bc0 100644 --- a/tests/test_isolationforest.py +++ b/tests/test_isolationforest.py @@ -1,7 +1,7 @@ import numpy as np import random_tree_models.isolationforest as iforest -from random_tree_models.utils import ThresholdSelectionMethod +from random_tree_models.params import ThresholdSelectionMethod rng = np.random.RandomState(42) diff --git a/tests/test_leafweights.py b/tests/test_leafweights.py index d1f6b0d..2b03d68 100644 --- a/tests/test_leafweights.py +++ b/tests/test_leafweights.py @@ -2,8 +2,8 @@ import pytest import random_tree_models.leafweights as leafweights -import random_tree_models.utils as utils -from random_tree_models.scoring import MetricNames +import random_tree_models.params as utils +from random_tree_models.params import MetricNames def test_leaf_weight_mean(): diff --git a/tests/test_scoring.py b/tests/test_scoring.py index cb32b59..c0b9ca2 100644 --- a/tests/test_scoring.py +++ b/tests/test_scoring.py @@ -1,8 +1,8 @@ import numpy as np import pytest +import random_tree_models.params as utils import random_tree_models.scoring as scoring -import random_tree_models.utils as utils from random_tree_models import rs_entropy, rs_gini_impurity @@ -375,43 +375,41 @@ class TestSplitScoreMetrics: var_exp = -0.25 def test_gini(self): - g = scoring.calc_split_score( - scoring.MetricNames.gini, self.y, self.target_groups - ) + g = scoring.calc_split_score(utils.MetricNames.gini, self.y, self.target_groups) # g = scoring.SplitScoreMetrics["gini"](self.y, self.target_groups) assert g == self.g_exp def test_gini_rs(self): g = scoring.calc_split_score( - scoring.MetricNames.gini_rs, self.y, self.target_groups + utils.MetricNames.gini_rs, self.y, self.target_groups ) # g = scoring.SplitScoreMetrics["gini_rs"](self.y, self.target_groups) assert g == self.g_exp def test_entropy(self): h = scoring.calc_split_score( - scoring.MetricNames.entropy, self.y, self.target_groups + utils.MetricNames.entropy, self.y, self.target_groups ) # h = scoring.SplitScoreMetrics["entropy"](self.y, self.target_groups) assert h == self.h_exp def test_entropy_rs(self): h = scoring.calc_split_score( - scoring.MetricNames.entropy_rs, self.y, self.target_groups + utils.MetricNames.entropy_rs, self.y, self.target_groups ) # h = scoring.SplitScoreMetrics["entropy_rs"](self.y, self.target_groups) assert h == self.h_exp def test_variance(self): var = scoring.calc_split_score( - scoring.MetricNames.variance, self.y, self.target_groups + utils.MetricNames.variance, self.y, self.target_groups ) # var = scoring.SplitScoreMetrics["variance"](self.y, self.target_groups) assert var == self.var_exp def test_friedman_binary_classification(self): var = scoring.calc_split_score( - scoring.MetricNames.friedman_binary_classification, + utils.MetricNames.friedman_binary_classification, self.y, self.target_groups, ) diff --git a/tests/test_utils.py b/tests/test_utils.py index dc78d71..5b143a0 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -3,26 +3,35 @@ import pytest from pydantic import ValidationError +import random_tree_models.params import random_tree_models.utils as utils def test_ColumnSelectionMethod(): expected = ["ascending", "largest_delta", "random"] - assert list(utils.ColumnSelectionMethod.__members__.keys()) == expected + assert ( + list(random_tree_models.params.ColumnSelectionMethod.__members__.keys()) + == expected + ) def test_ThresholdSelectionMethod(): expected = ["bruteforce", "quantile", "random", "uniform"] - assert list(utils.ThresholdSelectionMethod.__members__.keys()) == expected + assert ( + list(random_tree_models.params.ThresholdSelectionMethod.__members__.keys()) + == expected + ) # method, quantile, random_state, n_thresholds class TestThresholdSelectionParameters: def test_expected_okay(self): - params = utils.ThresholdSelectionParameters( + params = random_tree_models.params.ThresholdSelectionParameters( method="quantile", quantile=0.1, random_state=0, n_thresholds=100 ) - assert params.method == utils.ThresholdSelectionMethod.quantile + assert ( + params.method == random_tree_models.params.ThresholdSelectionMethod.quantile + ) assert params.quantile == 0.1 assert params.random_state == 0 assert params.n_thresholds == 100 @@ -30,7 +39,7 @@ def test_expected_okay(self): def test_method_fail(self): try: - _ = utils.ThresholdSelectionParameters( + _ = random_tree_models.params.ThresholdSelectionParameters( method="wuppy", quantile=0.1, random_state=0, n_thresholds=100 ) except ValueError as ex: @@ -44,7 +53,7 @@ def test_method_fail(self): ) def test_quantile(self, q: float, fail: bool): try: - _ = utils.ThresholdSelectionParameters( + _ = random_tree_models.params.ThresholdSelectionParameters( method="quantile", quantile=q, random_state=0, n_thresholds=100 ) except ValueError as ex: @@ -66,7 +75,7 @@ def test_quantile(self, q: float, fail: bool): ) def test_random_state(self, random_state: int, fail: bool): try: - _ = utils.ThresholdSelectionParameters( + _ = random_tree_models.params.ThresholdSelectionParameters( method="quantile", quantile=0.1, random_state=random_state, @@ -95,7 +104,7 @@ def test_random_state(self, random_state: int, fail: bool): ) def test_n_thresholds(self, n_thresholds: int, fail: bool): try: - _ = utils.ThresholdSelectionParameters( + _ = random_tree_models.params.ThresholdSelectionParameters( method="quantile", quantile=0.1, random_state=42, @@ -112,27 +121,31 @@ def test_n_thresholds(self, n_thresholds: int, fail: bool): def test_ColumnSelectionParameters(): - params = utils.ColumnSelectionParameters(method="random", n_trials=10) - assert params.method == utils.ColumnSelectionMethod.random + params = random_tree_models.params.ColumnSelectionParameters( + method="random", n_trials=10 + ) + assert params.method == random_tree_models.params.ColumnSelectionMethod.random assert params.n_trials == 10 class TestTreeGrowthParameters: def test_expected_okay(self): - params = utils.TreeGrowthParameters( + params = random_tree_models.params.TreeGrowthParameters( max_depth=10, min_improvement=0.0, lam=0.0, frac_subsamples=1.0, frac_features=1.0, random_state=0, - threshold_params=utils.ThresholdSelectionParameters( + threshold_params=random_tree_models.params.ThresholdSelectionParameters( method="quantile", quantile=0.1, random_state=0, n_thresholds=100, ), - column_params=utils.ColumnSelectionParameters(method="random", n_trials=10), + column_params=random_tree_models.params.ColumnSelectionParameters( + method="random", n_trials=10 + ), ) assert params.max_depth == 10 assert params.min_improvement == 0.0 @@ -140,8 +153,13 @@ def test_expected_okay(self): assert params.frac_subsamples == 1.0 assert params.frac_features == 1.0 assert params.random_state == 0 - assert isinstance(params.threshold_params, utils.ThresholdSelectionParameters) - assert isinstance(params.column_params, utils.ColumnSelectionParameters) + assert isinstance( + params.threshold_params, + random_tree_models.params.ThresholdSelectionParameters, + ) + assert isinstance( + params.column_params, random_tree_models.params.ColumnSelectionParameters + ) @pytest.mark.parametrize( "frac_subsamples,fail", @@ -155,7 +173,7 @@ def test_expected_okay(self): ) def test_frac_subsamples(self, frac_subsamples: float, fail: bool): try: - _ = utils.TreeGrowthParameters( + _ = random_tree_models.params.TreeGrowthParameters( max_depth=10, frac_subsamples=frac_subsamples, ) @@ -180,7 +198,7 @@ def test_frac_subsamples(self, frac_subsamples: float, fail: bool): ) def test_frac_features(self, frac_features: float, fail: bool): try: - _ = utils.TreeGrowthParameters( + _ = random_tree_models.params.TreeGrowthParameters( max_depth=10, frac_features=frac_features, ) @@ -195,7 +213,7 @@ def test_frac_features(self, frac_features: float, fail: bool): def test_fail_if_max_depth_missing(self): with pytest.raises(ValidationError): - _ = utils.TreeGrowthParameters() # type: ignore + _ = random_tree_models.params.TreeGrowthParameters() # type: ignore def test_get_logger(): diff --git a/uv.lock b/uv.lock index b5eb83e..c61c2f2 100644 --- a/uv.lock +++ b/uv.lock @@ -699,6 +699,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/07/6c/aa3f2f849e01cb6a001cd8554a88d4c77c5c1a31c95bdf1cf9301e6d9ef4/defusedxml-0.7.1-py2.py3-none-any.whl", hash = "sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61", size = 25604, upload-time = "2021-03-08T10:59:24.45Z" }, ] +[[package]] +name = "dirty-equals" +version = "0.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b0/99/133892f401ced5a27e641a473c547d5fbdb39af8f85dac8a9d633ea3e7a7/dirty_equals-0.9.0.tar.gz", hash = "sha256:17f515970b04ed7900b733c95fd8091f4f85e52f1fb5f268757f25c858eb1f7b", size = 50412, upload-time = "2025-01-11T23:23:40.491Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/77/0c/03cc99bf3b6328604b10829de3460f2b2ad3373200c45665c38508e550c6/dirty_equals-0.9.0-py3-none-any.whl", hash = "sha256:ff4d027f5cfa1b69573af00f7ba9043ea652dbdce3fe5cbe828e478c7346db9c", size = 28226, upload-time = "2025-01-11T23:23:37.489Z" }, +] + [[package]] name = "distlib" version = "0.4.0" @@ -877,6 +886,22 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2c/e1/e6716421ea10d38022b952c159d5161ca1193197fb744506875fbb87ea7b/iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760", size = 6050, upload-time = "2025-03-19T20:10:01.071Z" }, ] +[[package]] +name = "inline-snapshot" +version = "0.27.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "asttokens" }, + { name = "executing" }, + { name = "pytest" }, + { name = "rich" }, + { name = "tomli", marker = "python_full_version < '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b9/93/3caece250cdf267fcb39e6a82ada0e7e8e8fb37207331309dbf6865d7497/inline_snapshot-0.27.2.tar.gz", hash = "sha256:5ecc7ccfdcbf8d9273d3fa9fb55b829720680ef51bb1db12795fd1b0f4a3783c", size = 347133, upload-time = "2025-08-11T07:49:55.134Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8f/7f/9e41fd793827af8cbe812fff625d62b3b47603d62145b718307ef4e381eb/inline_snapshot-0.27.2-py3-none-any.whl", hash = "sha256:7c11f78ad560669bccd38d6d3aa3ef33d6a8618d53bd959019dca3a452272b7e", size = 68004, upload-time = "2025-08-11T07:49:53.904Z" }, +] + [[package]] name = "ipykernel" version = "6.30.1" @@ -2717,6 +2742,8 @@ dependencies = [ [package.dev-dependencies] dev = [ + { name = "dirty-equals" }, + { name = "inline-snapshot" }, { name = "ipywidgets" }, { name = "jupyter-contrib-nbextensions" }, { name = "jupyterlab" }, @@ -2732,6 +2759,8 @@ nb = [ { name = "jupyterlab" }, ] test = [ + { name = "dirty-equals" }, + { name = "inline-snapshot" }, { name = "pytest" }, ] @@ -2746,6 +2775,8 @@ requires-dist = [ [package.metadata.requires-dev] dev = [ + { name = "dirty-equals", specifier = ">=0.9.0" }, + { name = "inline-snapshot", specifier = ">=0.27.2" }, { name = "ipywidgets", specifier = ">=8.0.6" }, { name = "jupyter-contrib-nbextensions", specifier = ">=0.7.0" }, { name = "jupyterlab", specifier = ">=4.0.0" }, @@ -2760,7 +2791,11 @@ nb = [ { name = "jupyter-contrib-nbextensions", specifier = ">=0.7.0" }, { name = "jupyterlab", specifier = ">=4.0.0" }, ] -test = [{ name = "pytest", specifier = ">=7.3.1" }] +test = [ + { name = "dirty-equals", specifier = ">=0.9.0" }, + { name = "inline-snapshot", specifier = ">=0.27.2" }, + { name = "pytest", specifier = ">=7.3.1" }, +] [[package]] name = "referencing"