From 10d88171ed02ac6edd0e02d253578290e0562661 Mon Sep 17 00:00:00 2001 From: Simon Vinding Brodersen Date: Mon, 26 May 2025 19:27:23 +0200 Subject: [PATCH 01/10] License update --- pyproject.toml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index b1701892..2c195b47 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,7 @@ +[project] +license = "BSD-3-Clause" +license-files = ["LICENSE"] + [build-system] requires = ["setuptools", "Cython", "numpy", "wheel"] From b1f2fb8988b4c646483ecb22d3d3b9a00cb7e687 Mon Sep 17 00:00:00 2001 From: Simon Vinding Brodersen Date: Mon, 26 May 2025 20:04:49 +0200 Subject: [PATCH 02/10] Seems to be a bug in cython 3.1.1 This bug makes it such that we don't build --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 1d98842e..efc2e9e4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ numpy>=1.25.0 matplotlib>=3.7.1 -cython>=3.0.0 \ No newline at end of file +cython==3.1.0 From 53d266dc8332b271144514753741c80fe8943172 Mon Sep 17 00:00:00 2001 From: Simon Vinding Brodersen Date: Mon, 26 May 2025 20:19:45 +0200 Subject: [PATCH 03/10] Fix bug 1 --- src/adaXT/decision_tree/_decision_tree.pyx | 8 +++++--- src/adaXT/predictor/predictor.pxd | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/adaXT/decision_tree/_decision_tree.pyx b/src/adaXT/decision_tree/_decision_tree.pyx index 508fb81b..37287c19 100644 --- a/src/adaXT/decision_tree/_decision_tree.pyx +++ b/src/adaXT/decision_tree/_decision_tree.pyx @@ -44,9 +44,7 @@ cdef class refit_object(Node): def add_idx(self, idx: int) -> None: self.list_idx.append(idx) - -@cython.auto_pickle(True) -cdef class _DecisionTree(): +cdef class _DecisionTree: cdef public: object criteria object splitter @@ -343,6 +341,10 @@ cdef class _DecisionTree(): # Now squash all the DecisionNodes not visited self.__squash_tree() + # Make sure that predictor_instance points to the same root, if we have + # changed it + self.predictor_instance.root = self.root + # From below here, it is the DepthTreeBuilder class queue_obj: diff --git a/src/adaXT/predictor/predictor.pxd b/src/adaXT/predictor/predictor.pxd index 77c18433..82300fec 100644 --- a/src/adaXT/predictor/predictor.pxd +++ b/src/adaXT/predictor/predictor.pxd @@ -7,7 +7,7 @@ cdef class Predictor(): cnp.ndarray X cnp.ndarray Y int n_features - Node root + cdef public Node root cpdef dict predict_leaf(self, double[:, ::1] X) From 9cd34b224dfdae0f17f15b45694d3c4a28440888 Mon Sep 17 00:00:00 2001 From: Simon Vinding Brodersen Date: Mon, 26 May 2025 20:20:58 +0200 Subject: [PATCH 04/10] Formatting random forest --- src/adaXT/random_forest/random_forest.py | 37 +++++++++--------------- 1 file changed, 14 insertions(+), 23 deletions(-) diff --git a/src/adaXT/random_forest/random_forest.py b/src/adaXT/random_forest/random_forest.py index d0483e3b..ba5c8edd 100644 --- a/src/adaXT/random_forest/random_forest.py +++ b/src/adaXT/random_forest/random_forest.py @@ -64,8 +64,7 @@ def get_sample_indices( resample_size0 = sampling_args["size"] resample_size1 = sampling_args["size"] else: - resample_size0 = np.min( - [sampling_args["split"], sampling_args["size"]]) + resample_size0 = np.min([sampling_args["split"], sampling_args["size"]]) resample_size1 = np.min( [X_n_rows - sampling_args["split"], sampling_args["size"]] ) @@ -75,7 +74,7 @@ def get_sample_indices( replace=sampling_args["replace"], ) pred_indices = gen.choice( - indices[sampling_args["split"]:], + indices[sampling_args["split"] :], size=resample_size1, replace=sampling_args["replace"], ) @@ -86,8 +85,7 @@ def get_sample_indices( resample_size0 = sampling_args["size"] resample_size1 = sampling_args["size"] else: - resample_size0 = np.min( - [sampling_args["split"], sampling_args["size"]]) + resample_size0 = np.min([sampling_args["split"], sampling_args["size"]]) resample_size1 = np.min( [X_n_rows - sampling_args["split"], sampling_args["size"]] ) @@ -97,7 +95,7 @@ def get_sample_indices( replace=sampling_args["replace"], ) pred_indices = gen.choice( - indices[sampling_args["split"]:], + indices[sampling_args["split"] :], size=resample_size1, replace=sampling_args["replace"], ) @@ -153,18 +151,11 @@ def build_single_tree( predictor=predictor, splitter=splitter, ) - tree.fit( - X=X, - Y=Y, - sample_indices=fitting_indices, - sample_weight=sample_weight) + tree.fit(X=X, Y=Y, sample_indices=fitting_indices, sample_weight=sample_weight) if honest_tree: tree.refit_leaf_nodes( - X=X, - Y=Y, - sample_weight=sample_weight, - sample_indices=prediction_indices) - + X=X, Y=Y, sample_weight=sample_weight, sample_indices=prediction_indices + ) return tree @@ -349,8 +340,7 @@ def __get_sampling_parameter(self, sampling_args: dict | None) -> dict: if "size" not in sampling_args: sampling_args["size"] = self.X_n_rows elif isinstance(sampling_args["size"], float): - sampling_args["size"] = int( - sampling_args["size"] * self.X_n_rows) + sampling_args["size"] = int(sampling_args["size"] * self.X_n_rows) elif not isinstance(sampling_args["size"], int): raise ValueError( "The provided sampling_args['size'] is not an integer or float as required." @@ -421,7 +411,8 @@ def __build_trees(self) -> None: sampling=self.sampling, ) self.fitting_indices, self.prediction_indices, self.out_of_bag_indices = zip( - *indices) + *indices + ) self.trees = self.parallel.starmap( build_single_tree, map_input=zip(self.fitting_indices, self.prediction_indices), @@ -444,8 +435,9 @@ def __build_trees(self) -> None: n_jobs=self.n_jobs_fit, ) - def fit(self, X: ArrayLike, Y: ArrayLike, - sample_weight: ArrayLike | None = None) -> None: + def fit( + self, X: ArrayLike, Y: ArrayLike, sample_weight: ArrayLike | None = None + ) -> None: """ Fit the random forest with training data (X, Y). @@ -477,8 +469,7 @@ def fit(self, X: ArrayLike, Y: ArrayLike, self.X = shared_numpy_array(X) self.Y = shared_numpy_array(Y) self.X_n_rows, self.n_features = self.X.shape - self.max_features = self._check_max_features( - self.max_features, X.shape[0]) + self.max_features = self._check_max_features(self.max_features, X.shape[0]) self.sample_weight = self._check_sample_weight(sample_weight) self.sampling_args = self.__get_sampling_parameter(self.sampling_args) From fe0935b3ffefed4445d7e47d34ed7fc4987be516 Mon Sep 17 00:00:00 2001 From: Simon Vinding Brodersen Date: Mon, 26 May 2025 20:21:07 +0200 Subject: [PATCH 05/10] License fix --- pyproject.toml | 4 ---- setup.py | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2c195b47..b1701892 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,3 @@ -[project] -license = "BSD-3-Clause" -license-files = ["LICENSE"] - [build-system] requires = ["setuptools", "Cython", "numpy", "wheel"] diff --git a/setup.py b/setup.py index 3358c711..f6a0931d 100644 --- a/setup.py +++ b/setup.py @@ -140,6 +140,7 @@ def run_build(): extensions = cythonize(extensions, **arg_dir) setup( name=NAME, + license="BSD-3-clause", version=VERSION, description=DESCRIPTION, long_description=LONG_DESCRIPTION, @@ -160,7 +161,6 @@ def run_build(): classifiers=[ "Programming Language :: Python :: 3", "Intended Audience :: Science/Research", - "License :: OSI Approved :: BSD License", "Operating System :: OS Independent", ], extras_require=extras, From 9d44b72fecc3c0acb9e70d20b6f01784323dc4fe Mon Sep 17 00:00:00 2001 From: Simon Vinding Brodersen Date: Mon, 26 May 2025 20:36:43 +0200 Subject: [PATCH 06/10] Fix bug 2 --- src/adaXT/random_forest/random_forest.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/adaXT/random_forest/random_forest.py b/src/adaXT/random_forest/random_forest.py index ba5c8edd..1b564e44 100644 --- a/src/adaXT/random_forest/random_forest.py +++ b/src/adaXT/random_forest/random_forest.py @@ -356,11 +356,11 @@ def __get_sampling_parameter(self, sampling_args: dict | None) -> dict: sampling_args["split"] = np.min( [int(0.5 * self.X_n_rows), self.X_n_rows - 1] ) - elif isinstance(sampling_args["size"], float): + elif isinstance(sampling_args["split"], float): sampling_args["split"] = np.min( [int(sampling_args["split"] * self.X_n_rows), self.X_n_rows - 1] ) - elif not isinstance(sampling_args["size"], int): + elif not isinstance(sampling_args["split"], (int, np.integer)): raise ValueError( "The provided sampling_args['split'] is not an integer or float as required." ) @@ -370,7 +370,7 @@ def __get_sampling_parameter(self, sampling_args: dict | None) -> dict: sampling_args["size"] = int( sampling_args["size"] * sampling_args["split"] ) - elif not isinstance(sampling_args["size"], int): + elif not isinstance(sampling_args["size"], (np.integer, int)): raise ValueError( "The provided sampling_args['size'] is not an integer or float as required." ) From 7154f074c075c1dd9790d0f3d14f032d24ccf134 Mon Sep 17 00:00:00 2001 From: Simon Vinding Brodersen Date: Mon, 26 May 2025 20:45:28 +0200 Subject: [PATCH 07/10] Fix issue bug 3 --- src/adaXT/decision_tree/_decision_tree.pyx | 6 +++--- src/adaXT/random_forest/random_forest.py | 7 +++---- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/adaXT/decision_tree/_decision_tree.pyx b/src/adaXT/decision_tree/_decision_tree.pyx index 37287c19..dadc42af 100644 --- a/src/adaXT/decision_tree/_decision_tree.pyx +++ b/src/adaXT/decision_tree/_decision_tree.pyx @@ -3,7 +3,7 @@ import sys cimport numpy as cnp ctypedef cnp.float64_t DOUBLE_t -ctypedef cnp.int64_t LONG_t +ctypedef cnp.int32_t INT32_T from libcpp cimport bool @@ -178,7 +178,7 @@ cdef class _DecisionTree: cdef void __fit_new_leaf_nodes(self, cnp.ndarray[DOUBLE_t, ndim=2] X, cnp.ndarray[DOUBLE_t, ndim=2] Y, cnp.ndarray[DOUBLE_t, ndim=1] sample_weight, - cnp.ndarray[LONG_t, ndim=1] sample_indices): + cnp.ndarray[INT32_T, ndim=1] sample_indices): cdef: int idx, n_objs, depth, cur_split_idx double cur_threshold @@ -326,7 +326,7 @@ cdef class _DecisionTree: cnp.ndarray[DOUBLE_t, ndim=2] X, cnp.ndarray[DOUBLE_t, ndim=2] Y, cnp.ndarray[DOUBLE_t, ndim=1] sample_weight, - cnp.ndarray[LONG_t, ndim=1] sample_indices) -> None: + cnp.ndarray[INT32_T, ndim=1] sample_indices) -> None: if self.root is None: raise ValueError("The tree has not been trained before trying to\ diff --git a/src/adaXT/random_forest/random_forest.py b/src/adaXT/random_forest/random_forest.py index 1b564e44..185ad4f7 100644 --- a/src/adaXT/random_forest/random_forest.py +++ b/src/adaXT/random_forest/random_forest.py @@ -48,17 +48,17 @@ def get_sample_indices( Assumes there has been a previous call to self.__get_sample_indices on the RandomForest. """ + indices = np.arange(0, X_n_rows, dtype=np.int32) if sampling == "resampling": ret = ( gen.choice( - np.arange(0, X_n_rows), + indices, size=sampling_args["size"], replace=sampling_args["replace"], ), None, ) elif sampling == "honest_tree": - indices = np.arange(0, X_n_rows) gen.shuffle(indices) if sampling_args["replace"]: resample_size0 = sampling_args["size"] @@ -80,7 +80,6 @@ def get_sample_indices( ) ret = (fit_indices, pred_indices) elif sampling == "honest_forest": - indices = np.arange(0, X_n_rows) if sampling_args["replace"]: resample_size0 = sampling_args["size"] resample_size1 = sampling_args["size"] @@ -101,7 +100,7 @@ def get_sample_indices( ) ret = (fit_indices, pred_indices) else: - ret = (np.arange(0, X_n_rows), None) + ret = (indices, None) if sampling_args["OOB"]: # Only fitting indices From a00920d7c03388cb0367fc47911c442fd13b0557 Mon Sep 17 00:00:00 2001 From: Simon Vinding Brodersen Date: Mon, 26 May 2025 21:07:07 +0200 Subject: [PATCH 08/10] Fixed linting and set build_system to use cython 3.1.0 not 3.1.1 --- pyproject.toml | 2 +- src/adaXT/decision_tree/_decision_tree.pyx | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b1701892..51c7fe31 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["setuptools", "Cython", "numpy", "wheel"] +requires = ["setuptools", "Cython==3.1.0", "numpy", "wheel"] [tool.cython-lint] max-line-length = 127 diff --git a/src/adaXT/decision_tree/_decision_tree.pyx b/src/adaXT/decision_tree/_decision_tree.pyx index dadc42af..cf86e3e4 100644 --- a/src/adaXT/decision_tree/_decision_tree.pyx +++ b/src/adaXT/decision_tree/_decision_tree.pyx @@ -16,7 +16,6 @@ from .nodes import DecisionNode # for c level definitions -cimport cython from .nodes cimport DecisionNode, Node from ..utils cimport dsum @@ -27,7 +26,7 @@ cdef double EPSILON = np.finfo('double').eps cdef class refit_object(Node): cdef public: list list_idx - bint is_left + bool is_left def __init__( self, From 14449f09266f595cd9f16052d9389102de886cd0 Mon Sep 17 00:00:00 2001 From: Simon Vinding Brodersen Date: Wed, 28 May 2025 08:17:02 +0200 Subject: [PATCH 09/10] Version update --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index f6a0931d..9fc04048 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import os NAME = "adaXT" -VERSION = "1.5.0" +VERSION = "1.5.1" DESCRIPTION = "A Python package for tree-based regression and classification" PROJECT_URLS = { "Documentation": "https://NiklasPfister.github.io/adaXT/", From 30ae70e7f55b85b9cf7912efc3c48eee366bd719 Mon Sep 17 00:00:00 2001 From: Niklas Andreas Pfister Date: Wed, 28 May 2025 06:19:01 +0000 Subject: [PATCH 10/10] Automated autopep8 fixes --- src/adaXT/random_forest/random_forest.py | 36 +++++++++++++++--------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/src/adaXT/random_forest/random_forest.py b/src/adaXT/random_forest/random_forest.py index 185ad4f7..3c40fb6f 100644 --- a/src/adaXT/random_forest/random_forest.py +++ b/src/adaXT/random_forest/random_forest.py @@ -64,7 +64,8 @@ def get_sample_indices( resample_size0 = sampling_args["size"] resample_size1 = sampling_args["size"] else: - resample_size0 = np.min([sampling_args["split"], sampling_args["size"]]) + resample_size0 = np.min( + [sampling_args["split"], sampling_args["size"]]) resample_size1 = np.min( [X_n_rows - sampling_args["split"], sampling_args["size"]] ) @@ -74,7 +75,7 @@ def get_sample_indices( replace=sampling_args["replace"], ) pred_indices = gen.choice( - indices[sampling_args["split"] :], + indices[sampling_args["split"]:], size=resample_size1, replace=sampling_args["replace"], ) @@ -84,7 +85,8 @@ def get_sample_indices( resample_size0 = sampling_args["size"] resample_size1 = sampling_args["size"] else: - resample_size0 = np.min([sampling_args["split"], sampling_args["size"]]) + resample_size0 = np.min( + [sampling_args["split"], sampling_args["size"]]) resample_size1 = np.min( [X_n_rows - sampling_args["split"], sampling_args["size"]] ) @@ -94,7 +96,7 @@ def get_sample_indices( replace=sampling_args["replace"], ) pred_indices = gen.choice( - indices[sampling_args["split"] :], + indices[sampling_args["split"]:], size=resample_size1, replace=sampling_args["replace"], ) @@ -150,11 +152,17 @@ def build_single_tree( predictor=predictor, splitter=splitter, ) - tree.fit(X=X, Y=Y, sample_indices=fitting_indices, sample_weight=sample_weight) + tree.fit( + X=X, + Y=Y, + sample_indices=fitting_indices, + sample_weight=sample_weight) if honest_tree: tree.refit_leaf_nodes( - X=X, Y=Y, sample_weight=sample_weight, sample_indices=prediction_indices - ) + X=X, + Y=Y, + sample_weight=sample_weight, + sample_indices=prediction_indices) return tree @@ -339,7 +347,8 @@ def __get_sampling_parameter(self, sampling_args: dict | None) -> dict: if "size" not in sampling_args: sampling_args["size"] = self.X_n_rows elif isinstance(sampling_args["size"], float): - sampling_args["size"] = int(sampling_args["size"] * self.X_n_rows) + sampling_args["size"] = int( + sampling_args["size"] * self.X_n_rows) elif not isinstance(sampling_args["size"], int): raise ValueError( "The provided sampling_args['size'] is not an integer or float as required." @@ -410,8 +419,7 @@ def __build_trees(self) -> None: sampling=self.sampling, ) self.fitting_indices, self.prediction_indices, self.out_of_bag_indices = zip( - *indices - ) + *indices) self.trees = self.parallel.starmap( build_single_tree, map_input=zip(self.fitting_indices, self.prediction_indices), @@ -434,9 +442,8 @@ def __build_trees(self) -> None: n_jobs=self.n_jobs_fit, ) - def fit( - self, X: ArrayLike, Y: ArrayLike, sample_weight: ArrayLike | None = None - ) -> None: + def fit(self, X: ArrayLike, Y: ArrayLike, + sample_weight: ArrayLike | None = None) -> None: """ Fit the random forest with training data (X, Y). @@ -468,7 +475,8 @@ def fit( self.X = shared_numpy_array(X) self.Y = shared_numpy_array(Y) self.X_n_rows, self.n_features = self.X.shape - self.max_features = self._check_max_features(self.max_features, X.shape[0]) + self.max_features = self._check_max_features( + self.max_features, X.shape[0]) self.sample_weight = self._check_sample_weight(sample_weight) self.sampling_args = self.__get_sampling_parameter(self.sampling_args)