diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 9757219..b1c28aa 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -12,11 +12,11 @@ jobs: strategy: matrix: os: [ubuntu-latest, windows-latest, macos-latest] - python-version: [3.7, 3.8, 3.9] + python-version: ["3.10", "3.11", "3.12"] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Display python version @@ -33,8 +33,9 @@ jobs: - name: Run tests run: | pytest ./tests --cov-config=.coveragerc --cov-report=xml --cov=deepforest deepforest - - name: Publish code coverage - uses: codecov/codecov-action@v1 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: ./coverage.xml + # This step requires CODECOV_TOKEN, which is only available in the upstream repository, so it is commented out here. + # - name: Publish code coverage + # uses: codecov/codecov-action@v5 + # with: + # token: ${{ secrets.CODECOV_TOKEN }} + # file: ./coverage.xml diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml index 14a8ff7..97337a0 100644 --- a/.github/workflows/build-wheels.yml +++ b/.github/workflows/build-wheels.yml @@ -14,25 +14,28 @@ jobs: matrix: os: [ubuntu-latest, windows-latest, macos-latest] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up QEMU if: runner.os == 'Linux' - uses: docker/setup-qemu-action@v1 + uses: docker/setup-qemu-action@v3 with: platforms: all - name: Build wheels - uses: joerick/cibuildwheel@v1.9.0 + uses: pypa/cibuildwheel@v2.16.5 with: output-dir: wheelhouse env: CIBW_ARCHS_LINUX: "x86_64 aarch64" CIBW_ARCHS_WINDOWS: "AMD64" CIBW_ARCHS_MACOS: "x86_64" - CIBW_BUILD: cp3*-macosx_x86_64 cp3*-win_amd64 cp3*-manylinux_x86_64 cp3*-manylinux_aarch64 - CIBW_SKIP: cp35-* cp36-* + CIBW_BUILD: >- + cp310-manylinux_x86_64 
cp310-manylinux_aarch64 cp310-macosx_x86_64 cp310-win_amd64 + cp311-manylinux_x86_64 cp311-manylinux_aarch64 cp311-macosx_x86_64 cp311-win_amd64 + cp312-manylinux_x86_64 cp312-manylinux_aarch64 cp312-macosx_x86_64 cp312-win_amd64 - name: Store artifacts - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v4 with: + name: wheels-${{ matrix.os }} path: ./wheelhouse/*.whl diff --git a/.github/workflows/code-quality.yml b/.github/workflows/code-quality.yml index 3676b41..a71b1eb 100644 --- a/.github/workflows/code-quality.yml +++ b/.github/workflows/code-quality.yml @@ -12,11 +12,11 @@ jobs: strategy: matrix: os: [ubuntu-latest] - python-version: [3.7] + python-version: ["3.10"] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up python - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Display python version diff --git a/.github/workflows/manual-release.yml b/.github/workflows/manual-release.yml new file mode 100644 index 0000000..0d3f886 --- /dev/null +++ b/.github/workflows/manual-release.yml @@ -0,0 +1,84 @@ +name: Manual Release + +on: + workflow_dispatch: + inputs: + version: + description: "Version to release (e.g., 0.1.8 <~ Must be above 0.1.7 as this is the point where we forked DF21)" + required: true + +permissions: + contents: write + +jobs: + release: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.10" + + - name: Install build backend + run: | + python -m pip install --upgrade pip + python -m pip install build toml + + - name: Update version in setup.py + env: + RELEASE_VERSION: "${{ github.event.inputs.version }}" + run: | + python - <<'PY' + import os + import pathlib + import re + + release_version = os.environ["RELEASE_VERSION"].strip() + if not release_version: + raise 
SystemExit("RELEASE_VERSION is required") + + version = pathlib.Path("setup.py") + text = version.read_text() + new_text, count = re.subn( + r'^VERSION = "[^"]+"', f'VERSION = "{release_version}"', text, flags=re.MULTILINE + ) + if count == 0: + raise SystemExit("VERSION assignment not found in setup.py") + if new_text != text: + version.write_text(new_text) + PY + shell: bash + + - name: Commit version bump + env: + RELEASE_VERSION: "${{ github.event.inputs.version }}" + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + git add setup.py + if git diff --cached --quiet; then + echo "No changes to commit" + else + git commit -m "chore: release v${RELEASE_VERSION}" + fi + + - name: Build distributions  # runs before tagging so a failed build leaves no pushed tag + run: python -m build --sdist  # sdist only: PyPI rejects the linux_x86_64 wheel a full build would add + + - name: Create and push tag + env: + RELEASE_VERSION: "${{ github.event.inputs.version }}" + run: | + git tag "v${RELEASE_VERSION}" || true + git push origin HEAD + git push origin "v${RELEASE_VERSION}" + + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/README.rst b/README.rst index bb3db5f..1a7da68 100644 --- a/README.rst +++ b/README.rst @@ -1,6 +1,10 @@ Deep Forest (DF) 21 =================== +.. note:: + + This repository is a community-maintained copy of the original `Deep Forest (DF21) project <https://github.com/LAMDA-NJU/Deep-Forest>`_. We do **not** claim credit for the underlying research or implementation; our goal is to keep the project usable on modern Python versions (e.g., Python 3.10+), as the upstream repository has seen minimal activity for a long time (one recent attempt to move to Python 3.10+ left its CI/CD failing). If the upstream maintainers prioritise these updates, we are happy to contribute everything back via pull request. + |github|_ |readthedocs|_ |codecov|_ |python|_ |pypi|_ |style|_ .. 
|github| image:: https://github.com/LAMDA-NJU/Deep-Forest/workflows/DeepForest-CI/badge.svg diff --git a/deepforest/cascade.py b/deepforest/cascade.py index 686f311..ee1b894 100644 --- a/deepforest/cascade.py +++ b/deepforest/cascade.py @@ -350,10 +350,10 @@ def _build_regressor_predictor( The maximum number of cascade layers in the deep forest. Notice that the actual number of layers can be smaller than ``max_layers`` because of the internal early stopping stage. - criterion : :obj:`{"mse", "mae"}`, default= :obj:`"mse"` - The function to measure the quality of a split. Supported criteria are - ``mse`` for the mean squared error, which is equal to variance reduction - as feature selection criterion, and ``mae`` for the mean absolute error. + criterion : :obj:`{"squared_error", "absolute_error"}`, default= :obj:`"squared_error"` + The function to measure the quality of a split. Supported criteria are + ``squared_error`` for the mean squared error, which is equal to variance reduction + as feature selection criterion, and ``absolute_error`` for the mean absolute error. n_estimators : :obj:`int`, default=2 The number of estimator in each cascade layer. It will be multiplied by 2 internally because each estimator contains a @@ -1553,7 +1553,7 @@ def __init__( bin_subsample=200000, bin_type="percentile", max_layers=20, - criterion="mse", + criterion="squared_error", n_estimators=2, n_trees=100, max_depth=None, diff --git a/deepforest/forest.py b/deepforest/forest.py index a917729..10eb127 100644 --- a/deepforest/forest.py +++ b/deepforest/forest.py @@ -49,6 +49,17 @@ MAX_INT = np.iinfo(np.int32).max +def _normalize_regression_criterion(criterion: str) -> str: + """Align legacy regression criteria names with scikit-learn + expectations.""" + + if criterion == "mse": + return "squared_error" + if criterion == "mae": + return "absolute_error" + return criterion + + def _get_n_samples_bootstrap(n_samples, max_samples): """ Get the number of samples in a bootstrap sample. 
@@ -199,9 +210,7 @@ def _partition_estimators(n_estimators, n_jobs): n_jobs = min(effective_n_jobs(n_jobs), n_estimators) # Partition estimators between jobs - n_estimators_per_job = np.full( - n_jobs, n_estimators // n_jobs, dtype=int - ) + n_estimators_per_job = np.full(n_jobs, n_estimators // n_jobs, dtype=int) n_estimators_per_job[: n_estimators % n_jobs] += 1 starts = np.cumsum(n_estimators_per_job) @@ -840,7 +849,7 @@ def __init__( self, n_estimators=100, *, - criterion="mse", + criterion="squared_error", max_depth=None, min_samples_split=2, min_samples_leaf=1, @@ -853,6 +862,7 @@ def __init__( verbose=0, max_samples=None ): + criterion = _normalize_regression_criterion(criterion) super().__init__( base_estimator=DecisionTreeRegressor(), n_estimators=n_estimators, @@ -889,7 +899,7 @@ def __init__( self, n_estimators=100, *, - criterion="mse", + criterion="squared_error", max_depth=None, min_samples_split=2, min_samples_leaf=1, @@ -902,6 +912,7 @@ def __init__( verbose=0, max_samples=None ): + criterion = _normalize_regression_criterion(criterion) super().__init__( base_estimator=ExtraTreeRegressor(), n_estimators=n_estimators, diff --git a/deepforest/tree/tree.py b/deepforest/tree/tree.py index a73fa12..bdb4b41 100644 --- a/deepforest/tree/tree.py +++ b/deepforest/tree/tree.py @@ -49,7 +49,12 @@ DOUBLE = _tree.DOUBLE CRITERIA_CLF = {"gini": _criterion.Gini, "entropy": _criterion.Entropy} -CRITERIA_REG = {"mse": _criterion.MSE, "mae": _criterion.MAE} +CRITERIA_REG = { + "mse": _criterion.MSE, + "mae": _criterion.MAE, + "squared_error": _criterion.MSE, + "absolute_error": _criterion.MAE, +} DENSE_SPLITTERS = { "best": _splitter.BestSplitter, @@ -181,7 +186,7 @@ def fit( if self.class_weight is not None: y_original = np.copy(y) - y_encoded = np.zeros(y.shape, dtype=np.int) + y_encoded = np.zeros(y.shape, dtype=int) for k in range(self.n_outputs_): classes_k, y_encoded[:, k] = np.unique( y[:, k], return_inverse=True @@ -504,7 +509,7 @@ class 
DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): def __init__( self, *, - criterion="mse", + criterion="squared_error", splitter="best", max_depth=None, min_samples_split=2, @@ -580,7 +585,7 @@ class ExtraTreeRegressor(DecisionTreeRegressor): def __init__( self, *, - criterion="mse", + criterion="squared_error", splitter="random", max_depth=None, min_samples_split=2, diff --git a/pyproject.toml b/pyproject.toml index 636f0fd..9ac5189 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,11 +1,12 @@ [build-system] requires = [ - "setuptools<60.0", + "setuptools>=61,<70", "wheel", + "numpy==1.26.4",  # single NumPy pin; replaces oldest-supported-numpy, whose own exact pins conflict with it "Cython>=0.28.5,<3.0", - "oldest-supported-numpy", "scipy>=1.3.2", ] +build-backend = "setuptools.build_meta" [tool.black] line-length = 79 include = '\.pyi?$' diff --git a/setup.py b/setup.py index 087759f..2019246 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,6 @@ import os import sys -from setuptools import find_packages -from numpy.distutils.core import setup +from setuptools import Extension, find_packages, setup # Project Information @@ -9,35 +8,80 @@ DESCRIPTION = "Deep Forest" with open("README.rst") as f: LONG_DESCRIPTION = f.read() -MAINTAINER = "Yi-Xuan Xu" -MAINTAINER_EMAIL = "xuyx@lamda.nju.edu.cn" +MAINTAINER = "Simon Provost" +MAINTAINER_EMAIL = "simon.gilbert.provost@gmail.com" URL = "https://github.com/LAMDA-NJU/Deep-Forest" VERSION = "0.1.7" -def configuration(parent_package="", top_path=None): - - if os.path.exists("MANIFEST"): - os.remove("MANIFEST") - - from numpy.distutils.misc_util import Configuration - - config = Configuration(None, parent_package, top_path) - config.add_subpackage("deepforest") - - return config - - if __name__ == "__main__": old_path = os.getcwd() - local_path = os.path.dirname(os.path.abspath(sys.argv[0])) + local_path = os.path.dirname(os.path.abspath(__file__)) os.chdir(local_path) sys.path.insert(0, local_path) + libraries = [] + if os.name == "posix": + libraries.append("m") + + from Cython.Build import cythonize + 
import numpy + + extensions = cythonize( + [ + Extension( + "deepforest._forest", + sources=[os.path.join("deepforest", "_forest.pyx")], + include_dirs=[numpy.get_include()], + libraries=libraries, + extra_compile_args=["-O3"], + ), + Extension( + "deepforest._cutils", + sources=[os.path.join("deepforest", "_cutils.pyx")], + include_dirs=[numpy.get_include()], + libraries=libraries, + extra_compile_args=["-O3"], + ), + Extension( + "deepforest.tree._tree", + sources=[os.path.join("deepforest", "tree", "_tree.pyx")], + include_dirs=[numpy.get_include()], + libraries=libraries, + extra_compile_args=["-O3"], + ), + Extension( + "deepforest.tree._splitter", + sources=[os.path.join("deepforest", "tree", "_splitter.pyx")], + include_dirs=[numpy.get_include()], + libraries=libraries, + extra_compile_args=["-O3"], + ), + Extension( + "deepforest.tree._criterion", + sources=[os.path.join("deepforest", "tree", "_criterion.pyx")], + include_dirs=[numpy.get_include()], + libraries=libraries, + extra_compile_args=["-O3"], + ), + Extension( + "deepforest.tree._utils", + sources=[os.path.join("deepforest", "tree", "_utils.pyx")], + include_dirs=[numpy.get_include()], + libraries=libraries, + extra_compile_args=["-O3"], + ), + ] + ) + + for extension in extensions: + extension.sources = [ + os.path.relpath(source, local_path) for source in extension.sources + ] + setup( - configuration=configuration, name=DISTNAME, maintainer=MAINTAINER, maintainer_email=MAINTAINER_EMAIL, @@ -48,6 +92,7 @@ def configuration(parent_package="", top_path=None): version=VERSION, long_description=LONG_DESCRIPTION, zip_safe=False, + ext_modules=extensions, classifiers=[ "Intended Audience :: Science/Research", "Intended Audience :: Developers", @@ -67,5 +112,5 @@ def configuration(parent_package="", top_path=None): "joblib>=0.11", "scikit-learn>=1.0,<1.6", ], - setup_requires=["cython"], + setup_requires=["cython", "numpy>=1.21,<2.0"], ) diff --git a/tests/test_forest.py b/tests/test_forest.py index 
0274980..c3c259e 100644 --- a/tests/test_forest.py +++ b/tests/test_forest.py @@ -8,7 +8,7 @@ # Load utils from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper -from sklearn.datasets import load_iris, load_wine, load_boston +from sklearn.datasets import load_iris, load_wine, load_diabetes from sklearn.ensemble._forest import ( _get_n_samples_bootstrap as sklearn_get_n_samples_bootstrap, ) @@ -81,7 +81,7 @@ def test_forest_classifier_workflow(load_func): model.predict(X_binned) -@pytest.mark.parametrize("load_func", [load_boston]) +@pytest.mark.parametrize("load_func", [load_diabetes]) def test_forest_regressor_workflow(load_func): n_estimators = 100 # to avoid oob warning diff --git a/tests/test_layer_estimator.py b/tests/test_layer_estimator.py index f494fe0..b24d7b7 100644 --- a/tests/test_layer_estimator.py +++ b/tests/test_layer_estimator.py @@ -8,7 +8,7 @@ # Load utils from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper -from sklearn.datasets import load_digits, load_boston +from sklearn.datasets import load_digits, load_diabetes from sklearn.model_selection import train_test_split @@ -50,7 +50,7 @@ regressor_layer_kwargs = { "layer_idx": 0, "n_outputs": 1, - "criterion": "mse", + "criterion": "squared_error", "n_estimators": 1, "n_trees": 10, "max_depth": 3, @@ -64,7 +64,7 @@ regressor_estimator_kwargs = { "name": "rf", - "criterion": "mse", + "criterion": "squared_error", "n_trees": 10, "max_depth": 3, "min_samples_leaf": 10, @@ -99,7 +99,7 @@ def test_classifier_layer_properties_after_fitting(): def test_regressor_layer_properties_after_fitting(): # Load data and binning - X, y = load_boston(return_X_y=True) + X, y = load_diabetes(return_X_y=True) binner = _BinMapper(random_state=142) X_binned = binner.fit_transform(X) diff --git a/tests/test_model_regressor.py b/tests/test_model_regressor.py index 1149019..e6c547a 100644 --- a/tests/test_model_regressor.py +++ b/tests/test_model_regressor.py @@ -3,7 +3,7 @@ import 
shutil import numpy as np from numpy.testing import assert_array_equal -from sklearn.datasets import load_boston +from sklearn.datasets import load_diabetes from sklearn.model_selection import train_test_split import deepforest @@ -14,7 +14,7 @@ save_dir = "./tmp" # Load data -X, y = load_boston(return_X_y=True) +X, y = load_diabetes(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.42, random_state=42 ) @@ -24,7 +24,7 @@ "n_bins": 10, "bin_subsample": 2e5, "max_layers": 10, - "criterion": "mse", + "criterion": "squared_error", "n_estimators": 1, "n_trees": 100, "max_depth": 3, @@ -44,7 +44,7 @@ "n_bins": 255, "bin_subsample": 2e5, "max_layers": 10, - "criterion": "mse", + "criterion": "squared_error", "n_estimators": 2, "n_trees": 100, "max_depth": None, @@ -197,14 +197,14 @@ def test_model_invalid_training_params(param): @pytest.mark.parametrize("predictor", ["forest", "xgboost", "lightgbm"]) def test_regressor_predictor_normal(predictor): deepforest.cascade._build_regressor_predictor( - predictor, criterion="mse", n_estimators=1, n_outputs=2 + predictor, criterion="squared_error", n_estimators=1, n_outputs=2 ) def test_regressor_predictor_unknown(): with pytest.raises(NotImplementedError) as excinfo: deepforest.cascade._build_regressor_predictor( - "unknown", criterion="mse", n_estimators=1, n_outputs=2 + "unknown", criterion="squared_error", n_estimators=1, n_outputs=2 ) assert "name of the predictor should be one of" in str(excinfo.value) diff --git a/tests/test_set_custom_estimator.py b/tests/test_set_custom_estimator.py index 6ec5d55..fb21c97 100644 --- a/tests/test_set_custom_estimator.py +++ b/tests/test_set_custom_estimator.py @@ -2,7 +2,7 @@ import shutil import numpy as np from numpy.testing import assert_array_equal -from sklearn.datasets import load_iris, load_boston +from sklearn.datasets import load_iris, load_diabetes from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor from 
sklearn.model_selection import train_test_split @@ -17,7 +17,7 @@ X, y, test_size=0.42, random_state=42 ) -X, y = load_boston(return_X_y=True) +X, y = load_diabetes(return_X_y=True) X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split( X, y, test_size=0.42, random_state=42 ) diff --git a/tests/test_tree regressor.py b/tests/test_tree regressor.py index 2b43180..709fd12 100644 --- a/tests/test_tree regressor.py +++ b/tests/test_tree regressor.py @@ -1,11 +1,11 @@ import pytest -from sklearn.datasets import load_boston +from sklearn.datasets import load_diabetes from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper from deepforest import DecisionTreeRegressor -X, y = load_boston(return_X_y=True) +X, y = load_diabetes(return_X_y=True) # Data binning binner = _BinMapper(random_state=42) diff --git a/tests/test_tree_same.py b/tests/test_tree_same.py index d0e0335..f2ba36e 100644 --- a/tests/test_tree_same.py +++ b/tests/test_tree_same.py @@ -21,7 +21,7 @@ from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper # Toy datasets -from sklearn.datasets import load_iris, load_wine, load_boston +from sklearn.datasets import load_iris, load_wine, load_diabetes from deepforest import DecisionTreeClassifier from deepforest import ExtraTreeClassifier @@ -89,7 +89,7 @@ def test_extra_tree_classifier_proba(load_func): assert_array_equal(actual_proba, expected_proba) -@pytest.mark.parametrize("load_func", [load_boston]) +@pytest.mark.parametrize("load_func", [load_diabetes]) def test_tree_regressor_pred(load_func): X, y = load_func(return_X_y=True) @@ -115,7 +115,7 @@ def test_tree_regressor_pred(load_func): assert_array_equal(actual_pred, expected_pred) -@pytest.mark.parametrize("load_func", [load_boston]) +@pytest.mark.parametrize("load_func", [load_diabetes]) def test_extra_tree_regressor_pred(load_func): X, y = load_func(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split( @@ -140,7 +140,7 @@ def 
test_extra_tree_regressor_pred(load_func): assert_array_equal(actual_pred, expected_pred) -@pytest.mark.parametrize("load_func", [load_boston]) +@pytest.mark.parametrize("load_func", [load_diabetes]) def test_tree_regressor_multi_output_pred(load_func): X, y = load_func(return_X_y=True) @@ -171,7 +171,7 @@ def test_tree_regressor_multi_output_pred(load_func): assert_array_equal(actual_pred, expected_pred) -@pytest.mark.parametrize("load_func", [load_boston]) +@pytest.mark.parametrize("load_func", [load_diabetes]) def test_extra_tree_regressor_multi_output_pred(load_func): X, y = load_func(return_X_y=True)