Merged
4 changes: 2 additions & 2 deletions nbs/core/decision-tree.ipynb
@@ -63,11 +63,11 @@
"import seaborn as sns\n",
"import sklearn.datasets as sk_datasets\n",
"\n",
"from random_tree_models.decisiontree import (\n",
"from random_tree_models.models.decisiontree import (\n",
" DecisionTreeClassifier,\n",
" DecisionTreeRegressor,\n",
")\n",
"from random_tree_models.decisiontree.visualize import show_tree\n",
"from random_tree_models.models.decisiontree.visualize import show_tree\n",
"from random_tree_models.scoring import MetricNames"
]
},
4 changes: 2 additions & 2 deletions nbs/core/extra-trees.ipynb
@@ -41,10 +41,10 @@
"import seaborn as sns\n",
"import sklearn.datasets as sk_datasets\n",
"\n",
"import random_tree_models.extratrees as et\n",
"import random_tree_models.models.extratrees as et\n",
"from random_tree_models.scoring import MetricNames\n",
"from random_tree_models.params import ThresholdSelectionMethod\n",
"from random_tree_models.decisiontree.visualize import show_tree"
"from random_tree_models.models.decisiontree.visualize import show_tree"
]
},
{
4 changes: 2 additions & 2 deletions nbs/core/gradient-boosted-trees.ipynb
@@ -71,8 +71,8 @@
"import seaborn as sns\n",
"import sklearn.datasets as sk_datasets\n",
"\n",
"from random_tree_models.decisiontree.visualize import show_tree\n",
"import random_tree_models.gradientboostedtrees as gbtree\n",
"from random_tree_models.models.decisiontree.visualize import show_tree\n",
"import random_tree_models.models.gradientboostedtrees as gbtree\n",
"from random_tree_models.scoring import MetricNames"
]
},
4 changes: 2 additions & 2 deletions nbs/core/isolation-forest.ipynb
@@ -56,8 +56,8 @@
"import seaborn as sns\n",
"import sklearn.datasets as sk_datasets\n",
"\n",
"from random_tree_models.decisiontree.visualize import show_tree\n",
"import random_tree_models.isolationforest as iforest\n",
"from random_tree_models.models.decisiontree.visualize import show_tree\n",
"import random_tree_models.models.isolationforest as iforest\n",
"from random_tree_models.params import ColumnSelectionMethod, ThresholdSelectionMethod"
]
},
4 changes: 2 additions & 2 deletions nbs/core/random-forest.ipynb
@@ -41,8 +41,8 @@
"import seaborn as sns\n",
"import sklearn.datasets as sk_datasets\n",
"\n",
"from random_tree_models.decisiontree.visualize import show_tree\n",
"import random_tree_models.randomforest as rf\n",
"from random_tree_models.models.decisiontree.visualize import show_tree\n",
"import random_tree_models.models.randomforest as rf\n",
"from random_tree_models.params import MetricNames"
]
},
4 changes: 2 additions & 2 deletions nbs/core/robust-random-cut-forest.ipynb
@@ -55,8 +55,8 @@
"import seaborn as sns\n",
"import sklearn.datasets as sk_datasets\n",
"\n",
"from random_tree_models.decisiontree.visualize import show_tree\n",
"import random_tree_models.isolationforest as iforest\n",
"from random_tree_models.models.decisiontree.visualize import show_tree\n",
"import random_tree_models.models.isolationforest as iforest\n",
"from random_tree_models.params import ColumnSelectionMethod, ThresholdSelectionMethod"
]
},
4 changes: 2 additions & 2 deletions nbs/core/xgboost.ipynb
@@ -284,8 +284,8 @@
"import sklearn.datasets as sk_datasets\n",
"from scipy import stats\n",
"\n",
"from random_tree_models.decisiontree.visualize import show_tree\n",
"import random_tree_models.xgboost as xgboost\n",
"from random_tree_models.models.decisiontree.visualize import show_tree\n",
"import random_tree_models.models.xgboost as xgboost\n",
"from random_tree_models.params import MetricNames"
]
},
2 changes: 1 addition & 1 deletion nbs/dev/xgboost-profiling-histogramming-yay-or-nay.ipynb
@@ -49,7 +49,7 @@
"import sklearn.datasets as sk_datasets\n",
"from sklearn import metrics\n",
"\n",
"import random_tree_models.xgboost as xgboost"
"import random_tree_models.models.xgboost as xgboost"
]
},
{
src/random_tree_models/models/decisiontree/estimators.py
@@ -2,12 +2,17 @@

 import numpy as np
 import sklearn.base as base
+from sklearn.utils import ClassifierTags  # type: ignore
 from sklearn.utils.multiclass import check_classification_targets, type_of_target
 from sklearn.utils.validation import check_is_fitted, validate_data  # type: ignore
 
-from random_tree_models.decisiontree.node import Node
-from random_tree_models.decisiontree.predict import predict_with_tree
-from random_tree_models.decisiontree.train import grow_tree
+from random_tree_models.models.decisiontree.node import Node
+from random_tree_models.models.decisiontree.predict import predict_with_tree
+from random_tree_models.models.decisiontree.random import (
+    get_random_feature_ids,
+    get_random_sample_ids,
+)
+from random_tree_models.models.decisiontree.train import grow_tree
 from random_tree_models.params import (
     ColumnSelectionMethod,
     ColumnSelectionParameters,
@@ -55,6 +60,7 @@ def __init__(
         random_state: int = 42,
         ensure_all_finite: bool = True,
     ) -> None:
+        # scikit-learn requires storing __init__ arguments verbatim, instead of directly assigning a TreeGrowthParameters (see the sketch below)
         self.max_depth = max_depth
         self.measure_name = measure_name
         self.min_improvement = min_improvement
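A minimal sketch of why that convention matters, assuming standard scikit-learn behavior (`clone` and `get_params` are scikit-learn API; the estimator class is the one from this PR):

```python
# Hedged sketch, not part of this PR: sklearn.base.clone() rebuilds an
# estimator via type(est)(**est.get_params()), and the default get_params()
# reads back attributes named exactly like the __init__ arguments. Storing a
# derived TreeGrowthParameters object instead would break this round trip.
from sklearn.base import clone

from random_tree_models.models.decisiontree import DecisionTreeRegressor

tree = DecisionTreeRegressor(max_depth=3)
tree_copy = clone(tree)  # effectively type(tree)(**tree.get_params())
assert tree_copy.max_depth == 3
```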
@@ -70,49 +76,43 @@ def __init__(
         self.ensure_all_finite = ensure_all_finite
 
     def _organize_growth_parameters(self):
+        lam = -abs(self.lam)  # normalized to non-positive (sign convention assumed by the split scoring)
+
+        threshold_params = ThresholdSelectionParameters(
+            method=self.threshold_method,
+            quantile=self.threshold_quantile,
+            n_thresholds=self.n_thresholds,
+            random_state=self.random_state,
+        )
+
+        column_params = ColumnSelectionParameters(
+            method=self.column_method,
+            n_trials=self.n_columns_to_try,
+        )
+
         self.growth_params_ = TreeGrowthParameters(
             max_depth=self.max_depth,
             min_improvement=self.min_improvement,
-            lam=-abs(self.lam),
-            frac_subsamples=float(self.frac_subsamples),
-            frac_features=float(self.frac_features),
-            random_state=int(self.random_state),
-            threshold_params=ThresholdSelectionParameters(
-                method=self.threshold_method,
-                quantile=self.threshold_quantile,
-                n_thresholds=self.n_thresholds,
-                random_state=int(self.random_state),
-            ),
-            column_params=ColumnSelectionParameters(
-                method=self.column_method,
-                n_trials=self.n_columns_to_try,
-            ),
+            lam=lam,
+            frac_subsamples=self.frac_subsamples,
+            frac_features=self.frac_features,
+            random_state=self.random_state,
+            threshold_params=threshold_params,
+            column_params=column_params,
         )
 
     def _select_samples_and_features(
         self, X: np.ndarray, y: np.ndarray
     ) -> T.Tuple[np.ndarray, np.ndarray, np.ndarray]:
         "Sub-samples rows and columns from X and y"
 
         if not hasattr(self, "growth_params_"):
             raise ValueError("Try calling `fit` first.")
 
-        ix = np.arange(len(X))
         rng = np.random.RandomState(self.growth_params_.random_state)
-        if self.growth_params_.frac_subsamples < 1.0:
-            n_samples = int(self.growth_params_.frac_subsamples * len(X))
-            ix_samples = rng.choice(ix, size=n_samples, replace=False)
-        else:
-            ix_samples = ix
-
-        if self.frac_features < 1.0:
-            n_columns = int(X.shape[1] * self.frac_features)
-            ix_features = rng.choice(
-                np.arange(X.shape[1]),
-                size=n_columns,
-                replace=False,
-            )
-        else:
-            ix_features = np.arange(X.shape[1])
-
+        ix_samples = get_random_sample_ids(X, rng, self.growth_params_.frac_subsamples)
+        ix_features = get_random_feature_ids(X, rng, self.growth_params_.frac_features)
 
         _X = X[ix_samples, :]
         _X = _X[:, ix_features]
@@ -245,16 +245,9 @@ def __init__(
         )
         self.ensure_all_finite = ensure_all_finite
 
-    def _more_tags(self) -> T.Dict[str, bool]:
-        """Describes to scikit-learn parametrize_with_checks the scope of this class
-
-        Reference: https://scikit-learn.org/stable/developers/develop.html#estimator-tags
-        """
-        return {"binary_only": True}
-
     def __sklearn_tags__(self):
-        # https://scikit-learn.org/stable/developers/develop.html
+        # https://scikit-learn.org/stable/developers/develop.html#estimator-tags
         tags = super().__sklearn_tags__()  # type: ignore
         tags.classifier_tags.multi_class = False
         return tags
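The removed `_more_tags` hook used the old tag-dict API; `__sklearn_tags__` is its new-style replacement. A sketch of how the tag is typically exercised, assuming a pytest setup with scikit-learn's estimator checks (no such test file appears in this diff):

```python
# Hedged sketch, not from this PR: the estimator checks read __sklearn_tags__()
# and skip multiclass scenarios when classifier_tags.multi_class is False.
from sklearn.utils.estimator_checks import parametrize_with_checks

from random_tree_models.models.decisiontree import DecisionTreeClassifier


@parametrize_with_checks([DecisionTreeClassifier()])
def test_sklearn_compatibility(estimator, check):
    check(estimator)
```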
src/random_tree_models/models/decisiontree/node.py
@@ -3,7 +3,7 @@
 from pydantic import StrictInt, StrictStr
 from pydantic.dataclasses import dataclass
 
-from random_tree_models.decisiontree.split_objects import SplitScore
+from random_tree_models.models.decisiontree.split_objects import SplitScore
 
 
 @dataclass
src/random_tree_models/models/decisiontree/predict.py
@@ -1,6 +1,6 @@
 import numpy as np
 
-from random_tree_models.decisiontree.node import Node
+from random_tree_models.models.decisiontree.node import Node
 
 
 def find_leaf_node(node: Node, x: np.ndarray) -> Node:
28 changes: 28 additions & 0 deletions src/random_tree_models/models/decisiontree/random.py
@@ -0,0 +1,28 @@
+import numpy as np
+
+
+def get_random_sample_ids(
+    X: np.ndarray, rng: np.random.RandomState, frac_subsamples: float
+) -> np.ndarray:
+    ix = np.arange(len(X))
+    if frac_subsamples < 1.0:
+        n_samples = int(frac_subsamples * len(X))
+        ix_samples = rng.choice(ix, size=n_samples, replace=False)
+    else:
+        ix_samples = ix
+    return ix_samples
+
+
+def get_random_feature_ids(
+    X: np.ndarray, rng: np.random.RandomState, frac_features: float
+) -> np.ndarray:
+    if frac_features < 1.0:
+        n_columns = int(X.shape[1] * frac_features)
+        ix_features = rng.choice(
+            np.arange(X.shape[1]),
+            size=n_columns,
+            replace=False,
+        )
+    else:
+        ix_features = np.arange(X.shape[1])
+    return ix_features
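A small usage sketch of the two new helpers (toy shapes, not taken from the PR):

```python
# Hedged usage sketch for random.py (toy data): both helpers return index
# arrays, so subsampling is a plain fancy-indexing step.
import numpy as np

from random_tree_models.models.decisiontree.random import (
    get_random_feature_ids,
    get_random_sample_ids,
)

X = np.arange(20, dtype=float).reshape(10, 2)
rng = np.random.RandomState(42)

ix_samples = get_random_sample_ids(X, rng, 0.5)    # 5 of the 10 rows
ix_features = get_random_feature_ids(X, rng, 1.0)  # frac 1.0 keeps all columns

X_sub = X[ix_samples, :][:, ix_features]
assert X_sub.shape == (5, 2)
```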
src/random_tree_models/models/decisiontree/split.py
@@ -3,8 +3,8 @@
 import numpy as np
 
 import random_tree_models.scoring as scoring
-from random_tree_models.decisiontree.node import Node
-from random_tree_models.decisiontree.split_objects import BestSplit
+from random_tree_models.models.decisiontree.node import Node
+from random_tree_models.models.decisiontree.split_objects import BestSplit
 from random_tree_models.params import (
     ColumnSelectionMethod,
     ColumnSelectionParameters,
src/random_tree_models/models/decisiontree/train.py
@@ -5,13 +5,13 @@
 import random_tree_models.leafweights as leafweights
 import random_tree_models.params
 import random_tree_models.scoring as scoring
-from random_tree_models.decisiontree.node import Node
-from random_tree_models.decisiontree.split import (
+from random_tree_models.models.decisiontree.node import Node
+from random_tree_models.models.decisiontree.split import (
     check_if_split_sensible,
     find_best_split,
     select_arrays_for_child_node,
 )
-from random_tree_models.decisiontree.split_objects import SplitScore
+from random_tree_models.models.decisiontree.split_objects import SplitScore
 from random_tree_models.params import MetricNames
 
 
src/random_tree_models/models/decisiontree/visualize.py
@@ -1,8 +1,8 @@
 from rich import print as rprint
 from rich.tree import Tree
 
-from random_tree_models.decisiontree.estimators import DecisionTreeTemplate
-from random_tree_models.decisiontree.node import Node
+from random_tree_models.models.decisiontree.estimators import DecisionTreeTemplate
+from random_tree_models.models.decisiontree.node import Node
 
 
 def walk_tree(
src/random_tree_models/models/extratrees.py
@@ -9,8 +9,11 @@
     validate_data,  # type: ignore
 )
 
-import random_tree_models.decisiontree as dtree
-import random_tree_models.params as utils
+from random_tree_models.models.decisiontree import (
+    DecisionTreeClassifier,
+    DecisionTreeRegressor,
+)
+from random_tree_models.params import MetricNames
 
 
@@ -103,11 +106,11 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> "ExtraTreesRegressor":
     def fit(self, X: np.ndarray, y: np.ndarray) -> "ExtraTreesRegressor":
         X, y = validate_data(self, X, y, ensure_all_finite=False)
 
-        self.trees_: T.List[dtree.DecisionTreeRegressor] = []
+        self.trees_: list[DecisionTreeRegressor] = []
         rng = np.random.RandomState(self.random_state)
         for _ in track(range(self.n_trees), total=self.n_trees, description="tree"):
             # train decision tree to predict differences
-            new_tree = dtree.DecisionTreeRegressor(
+            new_tree = DecisionTreeRegressor(
                 measure_name=self.measure_name,
                 max_depth=self.max_depth,
                 min_improvement=self.min_improvement,
@@ -199,11 +202,11 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> "ExtraTreesClassifier":
             raise ValueError("Cannot train with only one class present")
 
         self.classes_, y = np.unique(y, return_inverse=True)
-        self.trees_: T.List[dtree.DecisionTreeClassifier] = []
+        self.trees_: list[DecisionTreeClassifier] = []
 
         rng = np.random.RandomState(self.random_state)
         for _ in track(range(self.n_trees), description="tree", total=self.n_trees):
-            new_tree = dtree.DecisionTreeClassifier(
+            new_tree = DecisionTreeClassifier(
                 measure_name=self.measure_name,
                 max_depth=self.max_depth,
                 min_improvement=self.min_improvement,
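A usage sketch for the refactored estimator; the constructor argument names (`n_trees`, `max_depth`) are read off this diff, everything else assumes defaults:

```python
# Hedged usage sketch (toy data; constructor arguments assumed from the diff):
import sklearn.datasets as sk_datasets

from random_tree_models.models.extratrees import ExtraTreesRegressor

X, y = sk_datasets.make_regression(n_samples=100, n_features=4, random_state=0)

model = ExtraTreesRegressor(n_trees=5, max_depth=3)
model.fit(X, y)
y_hat = model.predict(X)  # predictions aggregated over model.trees_ (assumed)
```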
src/random_tree_models/models/gradientboostedtrees.py
@@ -13,8 +13,10 @@
     validate_data,  # type: ignore
 )
 
-import random_tree_models.decisiontree as dtree
-from random_tree_models.params import MetricNames
+from random_tree_models.models.decisiontree import (
+    DecisionTreeRegressor,
+)
+from random_tree_models.params import MetricNames, is_greater_zero
 from random_tree_models.utils import bool_to_float
 
 
@@ -33,7 +35,7 @@ def __init__(
         min_improvement: float = 0.0,
         ensure_all_finite: bool = True,
     ) -> None:
-        self.n_trees = n_trees
+        self.n_trees = is_greater_zero(n_trees)
         self.measure_name = measure_name
         self.max_depth = max_depth
         self.min_improvement = min_improvement
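`is_greater_zero` comes from `random_tree_models.params` (imported above); its body is not part of this diff. A sketch of the assumed contract:

```python
# Hedged sketch of the assumed contract of is_greater_zero; the real
# implementation lives in random_tree_models.params and is not shown here.
def is_greater_zero(value: int) -> int:
    if value <= 0:
        raise ValueError(f"expected a value > 0, got {value}")
    return value
```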
@@ -94,7 +96,7 @@ def __init__(
     def fit(self, X: np.ndarray, y: np.ndarray) -> "GradientBoostedTreesRegressor":
         X, y = validate_data(self, X, y, ensure_all_finite=False)
 
-        self.trees_: T.List[dtree.DecisionTreeRegressor] = []
+        self.trees_: list[DecisionTreeRegressor] = []
 
         self.start_estimate_ = np.mean(y)
 
@@ -103,7 +105,7 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> "GradientBoostedTreesRegressor":

         for _ in track(range(self.n_trees), total=self.n_trees, description="tree"):
             # train decision tree to predict differences
-            new_tree = dtree.DecisionTreeRegressor(
+            new_tree = DecisionTreeRegressor(
                 measure_name=self.measure_name,
                 max_depth=self.max_depth,
                 min_improvement=self.min_improvement,
@@ -239,9 +241,8 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> "GradientBoostedTreesClassifier":
         if len(np.unique(y)) == 1:
             raise ValueError("Cannot train with only one class present")
 
-        # self.n_features_in_ = X.shape[1]
         self.classes_, y = np.unique(y, return_inverse=True)
-        self.trees_: T.List[dtree.DecisionTreeRegressor] = []
+        self.trees_: list[DecisionTreeRegressor] = []
         self.gammas_ = []
 
         y = self._bool_to_float(y)
@@ -255,7 +256,7 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> "GradientBoostedTreesClassifier":
                 2 * y / (1 + np.exp(2 * y * yhat))
             )  # dloss/dyhat, g in the xgboost paper
 
-            new_tree = dtree.DecisionTreeRegressor(
+            new_tree = DecisionTreeRegressor(
                 measure_name=self.measure_name,
                 max_depth=self.max_depth,
                 min_improvement=self.min_improvement,
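For reference, the expression in the classifier's boosting loop, `2 * y / (1 + np.exp(2 * y * yhat))`, matches the pseudo-residual of two-class gradient boosting (Friedman, 2001), assuming `_bool_to_float` encodes the two classes as y ∈ {−1, +1}. Under the binomial deviance loss it is the negative gradient that each new regression tree is fitted to:

$$
L(y, \hat{y}) = \log\left(1 + e^{-2y\hat{y}}\right),
\qquad
-\frac{\partial L}{\partial \hat{y}} = \frac{2y}{1 + e^{2y\hat{y}}}
$$

Under this convention the computed quantity is $-\partial L / \partial \hat{y}$; the `g` of the XGBoost paper referenced in the inline comment is the same quantity up to sign.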