From 001b3613ae6747fb446b93601c86999c36f97281 Mon Sep 17 00:00:00 2001 From: jiaruixu Date: Mon, 7 Jul 2025 00:38:32 -0700 Subject: [PATCH 1/4] Introduce CDFSplineCalibrator and restructure estimators module - Add CDFSplineCalibrator implementation for multi-class probability calibration - Move LinearSplineLogisticRegression to estimators submodule - Create comprehensive documentation with Sphinx - Add installation, quickstart, API reference, and examples pages - Configure Read the Docs support with .readthedocs.yaml - Add documentation requirements and build configuration - Update examples to use new CDFSplineCalibrator - Add DOCUMENTATION_GUIDE.md with instructions for free hosting options --- .readthedocs.yaml | 26 ++ DOCUMENTATION_GUIDE.md | 208 +++++++++++++ docs/Makefile | 4 +- docs/api.rst | 52 ++++ docs/conf.py | 53 ++++ docs/examples.rst | 123 ++++++++ docs/index.rst | 59 ++++ docs/installation.rst | 50 ++++ docs/make.bat | 35 --- docs/quickstart.rst | 79 +++++ docs/requirements.txt | 3 + docs/source/api.rst | 81 ----- docs/source/conf.py | 29 -- docs/source/examples.rst | 10 - docs/source/index.rst | 21 -- examples/calibration_comparison.py | 23 +- examples/qq.ipynb | 10 + examples/spline_model_comparison.ipynb | 281 +++++++++++++----- src/splinator/__init__.py | 7 + src/splinator/estimators/__init__.py | 7 + .../estimators/cdf_spline_calibrator.py | 270 +++++++++++++++++ .../estimators/ks_spline_calibrator.py | 270 +++++++++++++++++ .../linear_spline_logistic_regression.py} | 6 +- 23 files changed, 1440 insertions(+), 267 deletions(-) create mode 100644 .readthedocs.yaml create mode 100644 DOCUMENTATION_GUIDE.md create mode 100644 docs/api.rst create mode 100644 docs/conf.py create mode 100644 docs/examples.rst create mode 100644 docs/index.rst create mode 100644 docs/installation.rst delete mode 100644 docs/make.bat create mode 100644 docs/quickstart.rst create mode 100644 docs/requirements.txt delete mode 100644 docs/source/api.rst delete mode 100644 docs/source/conf.py delete mode 100644 docs/source/examples.rst delete mode 100644 docs/source/index.rst create mode 100644 examples/qq.ipynb create mode 100644 src/splinator/estimators/__init__.py create mode 100644 src/splinator/estimators/cdf_spline_calibrator.py create mode 100644 src/splinator/estimators/ks_spline_calibrator.py rename src/splinator/{estimators.py => estimators/linear_spline_logistic_regression.py} (98%) diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 0000000..02712aa --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,26 @@ +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Set the version of Python and other tools you might need +build: + os: ubuntu-22.04 + tools: + python: "3.11" + +# Build documentation in the docs/ directory with Sphinx +sphinx: + configuration: docs/conf.py + fail_on_warning: false + +# Optionally build your docs in additional formats such as PDF and ePub +formats: + - pdf + - epub + +# Install documentation dependencies +python: + install: + - requirements: docs/requirements.txt \ No newline at end of file diff --git a/DOCUMENTATION_GUIDE.md b/DOCUMENTATION_GUIDE.md new file mode 100644 index 0000000..f3a9094 --- /dev/null +++ b/DOCUMENTATION_GUIDE.md @@ -0,0 +1,208 @@ +# Documentation Guide for Splinator + +This guide explains how to create and host documentation for your Python package using free tools. + +## Documentation Setup + +We've set up Sphinx documentation with the following features: +- **Automatic API documentation** from your Python docstrings +- **Read the Docs theme** for a professional look +- **Type hints support** in documentation +- **Examples and tutorials** sections + +## Building Documentation Locally + +1. **Install documentation dependencies:** + ```bash + pip install sphinx sphinx-rtd-theme sphinx-autodoc-typehints + ``` + +2. **Build the documentation:** + ```bash + cd docs + make html + ``` + +3. **View the documentation:** + Open `docs/_build/html/index.html` in your browser + +## Free Hosting Options + +### Option 1: Read the Docs (Recommended) + +Read the Docs provides free hosting for open source projects with automatic builds. + +**Setup steps:** + +1. Go to [readthedocs.org](https://readthedocs.org/) and sign up +2. Connect your GitHub/GitLab account +3. Import your repository +4. Read the Docs will automatically detect `.readthedocs.yaml` and build your docs +5. Your documentation will be available at `https://splinator.readthedocs.io/` + +**Features:** +- Automatic builds on every push +- Version management (stable, latest, tags) +- Search functionality +- PDF and ePub downloads +- Custom domains (splinator.readthedocs.io) + +### Option 2: GitHub Pages + +Host documentation directly from your GitHub repository. + +**Setup steps:** + +1. Build documentation locally: + ```bash + cd docs + make html + ``` + +2. Create a `gh-pages` branch: + ```bash + git checkout --orphan gh-pages + git rm -rf . + cp -r docs/_build/html/* . + touch .nojekyll # Important for Sphinx + git add . + git commit -m "Initial documentation" + git push origin gh-pages + ``` + +3. Enable GitHub Pages in repository settings: + - Go to Settings → Pages + - Select source: "Deploy from a branch" + - Select branch: `gh-pages` and folder: `/ (root)` + +4. Documentation will be available at `https://yourusername.github.io/splinator/` + +**Automated GitHub Pages with Actions:** + +Create `.github/workflows/docs.yml`: + +```yaml +name: Build and Deploy Documentation + +on: + push: + branches: [ main ] + +jobs: + docs: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.11' + + - name: Install dependencies + run: | + pip install -r docs/requirements.txt + + - name: Build documentation + run: | + cd docs + make html + + - name: Deploy to GitHub Pages + uses: peaceiris/actions-gh-pages@v3 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_dir: ./docs/_build/html +``` + +### Option 3: GitLab Pages + +Similar to GitHub Pages but for GitLab repositories. + +Create `.gitlab-ci.yml`: + +```yaml +pages: + stage: deploy + image: python:3.11 + script: + - pip install -r docs/requirements.txt + - cd docs + - make html + - mv _build/html ../public + artifacts: + paths: + - public + only: + - main +``` + +## Documentation Best Practices + +1. **Write good docstrings:** + ```python + def calibrate(self, probabilities: np.ndarray, labels: np.ndarray) -> np.ndarray: + """Calibrate probability predictions. + + Parameters + ---------- + probabilities : np.ndarray + Uncalibrated probability predictions + labels : np.ndarray + True binary labels + + Returns + ------- + np.ndarray + Calibrated probabilities + """ + ``` + +2. **Keep documentation updated:** + - Update docstrings when changing functions + - Add examples for new features + - Update version numbers + +3. **Use type hints:** + They automatically appear in documentation with `sphinx-autodoc-typehints` + +4. **Add examples:** + - In docstrings: Use the `Examples` section + - In documentation: Create example notebooks + +## Current Documentation Structure + +``` +docs/ +├── conf.py # Sphinx configuration +├── index.rst # Main page +├── installation.rst # Installation guide +├── quickstart.rst # Quick start guide +├── api.rst # API reference (auto-generated) +├── examples.rst # Examples and tutorials +└── requirements.txt # Documentation dependencies +``` + +## Troubleshooting + +**Issue: Import errors when building docs** +- Solution: We use `autodoc_mock_imports` in `conf.py` to mock external dependencies + +**Issue: Documentation not updating on Read the Docs** +- Check build logs on Read the Docs dashboard +- Ensure `.readthedocs.yaml` is in the repository root + +**Issue: Styles not loading on GitHub Pages** +- Add `.nojekyll` file to the documentation root +- Ensure `_static` folder is included + +## Next Steps + +1. Choose your hosting platform (Read the Docs recommended) +2. Set up the hosting following the steps above +3. Add a documentation badge to your README: + ```markdown + [![Documentation Status](https://readthedocs.org/projects/splinator/badge/?version=latest)](https://splinator.readthedocs.io/en/latest/?badge=latest) + ``` + +Your documentation is now ready to be built and hosted for free! \ No newline at end of file diff --git a/docs/Makefile b/docs/Makefile index d0c3cbf..d4bb2cb 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -5,8 +5,8 @@ # from the environment for the first two. SPHINXOPTS ?= SPHINXBUILD ?= sphinx-build -SOURCEDIR = source -BUILDDIR = build +SOURCEDIR = . +BUILDDIR = _build # Put it first so that "make" without argument is like "make help". help: diff --git a/docs/api.rst b/docs/api.rst new file mode 100644 index 0000000..613f98e --- /dev/null +++ b/docs/api.rst @@ -0,0 +1,52 @@ +API Reference +============= + +This page contains the full API reference for all public classes and functions in splinator. + +Main Module +----------- + +.. automodule:: splinator + :members: + :undoc-members: + :show-inheritance: + +Estimators +---------- + +.. automodule:: splinator.estimators + :members: + :undoc-members: + :show-inheritance: + +Linear Spline Logistic Regression +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. automodule:: splinator.estimators.linear_spline_logistic_regression + :members: + :undoc-members: + :show-inheritance: + +KS Spline Calibrator +~~~~~~~~~~~~~~~~~~~~ + +.. automodule:: splinator.estimators.ks_spline_calibrator + :members: + :undoc-members: + :show-inheritance: + +CDF Spline Calibrator +~~~~~~~~~~~~~~~~~~~~~ + +.. automodule:: splinator.estimators.cdf_spline_calibrator + :members: + :undoc-members: + :show-inheritance: + +Metrics +------- + +.. automodule:: splinator.metrics + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..0dc78c6 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,53 @@ +# Configuration file for the Sphinx documentation builder. +# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +import os +import sys +sys.path.insert(0, os.path.abspath('../src')) + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +project = 'splinator' +copyright = '2025, Jiarui Xu' +author = 'Jiarui Xu' + +version = '0.2.0' +release = '0.2.0' + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.viewcode', + 'sphinx.ext.napoleon', # Support for Google/NumPy style docstrings + 'sphinx_autodoc_typehints', # Automatic type hints in docs +] + +templates_path = ['_templates'] +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# -- Autodoc configuration --------------------------------------------------- +autodoc_default_options = { + 'members': True, + 'member-order': 'bysource', + 'special-members': '__init__', + 'undoc-members': True, + 'exclude-members': '__weakref__' +} + +# Mock imports for external dependencies +autodoc_mock_imports = ['numpy', 'scipy', 'sklearn', 'pandas', 'matplotlib'] + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +html_theme = 'sphinx_rtd_theme' +html_static_path = ['_static'] + +# -- Options for autodoc-typehints ------------------------------------------- +always_document_param_types = True +typehints_use_rtype = True diff --git a/docs/examples.rst b/docs/examples.rst new file mode 100644 index 0000000..3db60a4 --- /dev/null +++ b/docs/examples.rst @@ -0,0 +1,123 @@ +Examples +======== + +This section provides detailed examples demonstrating various features of splinator. + +Example Notebooks +----------------- + +The `examples/` directory contains several Jupyter notebooks with detailed examples: + +* **calibration_comparison.py** - Compares different calibration methods +* **calibrator_model_comparison.ipynb** - Comparison of calibrator models +* **fit_on_splines.ipynb** - Fitting models on spline features +* **metrics.ipynb** - Using splinator metrics for evaluation +* **spline_model_comparison.ipynb** - Comparing different spline models +* **sklearn_check.ipynb** - Verifying scikit-learn compatibility + +Calibration Example +------------------- + +Here's a complete example of using splinator for probability calibration: + +.. code-block:: python + + import numpy as np + from sklearn.datasets import make_classification + from sklearn.model_selection import train_test_split + from sklearn.linear_model import LogisticRegression + from splinator.estimators import CDFSplineCalibrator + from splinator.metrics import expected_calibration_error + + # Generate synthetic data + X, y = make_classification(n_samples=2000, n_features=20, + n_informative=15, n_classes=3, + n_clusters_per_class=1, random_state=42) + + # Split data + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.5, random_state=42 + ) + X_cal, X_test, y_cal, y_test = train_test_split( + X_test, y_test, test_size=0.5, random_state=42 + ) + + # Train base model + lr = LogisticRegression(random_state=42, max_iter=1000) + lr.fit(X_train, y_train) + + # Get uncalibrated probabilities (all classes) + proba_uncal = lr.predict_proba(X_test) + + # Calculate ECE before calibration (for binary case, use positive class) + # For multi-class, you might want to compute ECE per class + # ece_before = expected_calibration_error(y_test, proba_uncal[:, 1]) + + # Calibrate using CDF Spline + calibrator = CDFSplineCalibrator(num_knots=6) + calibrator.fit(lr.predict_proba(X_cal), y_cal) + + # Get calibrated probabilities + proba_cal = calibrator.transform(proba_uncal) + + # Calculate ECE after calibration + # ece_after = expected_calibration_error(y_test, proba_cal[:, 1]) + + print(f"Calibrated probabilities shape: {proba_cal.shape}") + print(f"Sum of probabilities per sample (should be 1): {proba_cal.sum(axis=1)[:5]}") + +Linear Spline Logistic Regression Example +----------------------------------------- + +Example showing how to use the LinearSplineLogisticRegression model: + +.. code-block:: python + + import numpy as np + import matplotlib.pyplot as plt + from splinator.estimators import LinearSplineLogisticRegression + from sklearn.datasets import make_classification + + # Generate non-linear classification data + np.random.seed(42) + X = np.random.randn(500, 1) * 2 + y = (np.sin(X.squeeze()) + np.random.randn(500) * 0.3 > 0).astype(int) + + # Fit models with different numbers of knots + fig, axes = plt.subplots(1, 3, figsize=(12, 4)) + + for ax, n_knots in zip(axes, [2, 5, 10]): + model = LinearSplineLogisticRegression(n_knots=n_knots) + model.fit(X, y) + + # Plot decision boundary + X_plot = np.linspace(-4, 4, 200).reshape(-1, 1) + y_proba = model.predict_proba(X_plot)[:, 1] + + ax.scatter(X, y, alpha=0.5, c=y, cmap='viridis') + ax.plot(X_plot, y_proba, 'r-', linewidth=2, label=f'P(y=1|x)') + ax.axhline(y=0.5, color='k', linestyle='--', alpha=0.5) + ax.set_xlabel('x') + ax.set_ylabel('Probability') + ax.legend() + ax.set_title(f'{n_knots} knots') + + plt.tight_layout() + plt.show() + +Running the Examples +-------------------- + +To run the example notebooks: + +1. Install splinator with development dependencies:: + + pip install -e ".[dev]" + +2. Start Jupyter:: + + jupyter notebook + +3. Navigate to the `examples/` directory and open any notebook + +For more examples, check the `examples/` directory in the repository. \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..955246f --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,59 @@ +.. splinator documentation master file, created by + sphinx-quickstart on Mon Jan 20 01:28:16 2025. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to splinator's documentation! +===================================== + +**splinator** is a Python library for fitting linear-spline based logistic regression for calibration. + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + installation + quickstart + api + examples + +Features +-------- + +* Linear spline-based logistic regression models +* CDF-based spline calibration for probability predictions +* scikit-learn compatible API +* Support for multi-class classification + +Installation +------------ + +Install splinator using pip:: + + pip install splinator + +For development installation:: + + pip install -e ".[dev]" + +Quick Example +------------- + +.. code-block:: python + + from splinator.estimators import LinearSplineLogisticRegression + + # Create and fit model + model = LinearSplineLogisticRegression(n_knots=5) + model.fit(X_train, y_train) + + # Make predictions + predictions = model.predict_proba(X_test) + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` + diff --git a/docs/installation.rst b/docs/installation.rst new file mode 100644 index 0000000..5adeae6 --- /dev/null +++ b/docs/installation.rst @@ -0,0 +1,50 @@ +Installation +============ + +Requirements +------------ + +* Python 3.7.1 or later +* NumPy >= 1.19.0 +* SciPy >= 1.6.0 +* scikit-learn >= 1.0.0 +* pandas >= 1.3.0 + +Install from PyPI +----------------- + +The easiest way to install splinator is via pip:: + + pip install splinator + +Install from Source +------------------- + +To install from source, clone the repository and install using pip:: + + git clone https://github.com/yourusername/splinator.git + cd splinator + pip install -e . + +Development Installation +------------------------ + +For development, install with the extra dependencies:: + + pip install -e ".[dev]" + +This will install additional packages needed for development: + +* pytest for testing +* matplotlib for plotting +* mypy for type checking +* jupyter for notebooks + +Verify Installation +------------------- + +You can verify the installation by importing splinator:: + + python -c "import splinator; print(splinator.__version__)" + +This should print the version number without any errors. \ No newline at end of file diff --git a/docs/make.bat b/docs/make.bat deleted file mode 100644 index 747ffb7..0000000 --- a/docs/make.bat +++ /dev/null @@ -1,35 +0,0 @@ -@ECHO OFF - -pushd %~dp0 - -REM Command file for Sphinx documentation - -if "%SPHINXBUILD%" == "" ( - set SPHINXBUILD=sphinx-build -) -set SOURCEDIR=source -set BUILDDIR=build - -%SPHINXBUILD% >NUL 2>NUL -if errorlevel 9009 ( - echo. - echo.The 'sphinx-build' command was not found. Make sure you have Sphinx - echo.installed, then set the SPHINXBUILD environment variable to point - echo.to the full path of the 'sphinx-build' executable. Alternatively you - echo.may add the Sphinx directory to PATH. - echo. - echo.If you don't have Sphinx installed, grab it from - echo.https://www.sphinx-doc.org/ - exit /b 1 -) - -if "%1" == "" goto help - -%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% -goto end - -:help -%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% - -:end -popd diff --git a/docs/quickstart.rst b/docs/quickstart.rst new file mode 100644 index 0000000..e30675d --- /dev/null +++ b/docs/quickstart.rst @@ -0,0 +1,79 @@ +Quick Start Guide +================= + +This guide will help you get started with splinator quickly. + +Basic Usage +----------- + +Linear Spline Logistic Regression +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The main estimator in splinator is the LinearSplineLogisticRegression, which fits a logistic regression model using linear splines: + +.. code-block:: python + + import numpy as np + from splinator.estimators import LinearSplineLogisticRegression + + # Generate sample data + np.random.seed(42) + X = np.random.randn(1000, 1) + y = (X.squeeze() + np.random.randn(1000) * 0.5 > 0).astype(int) + + # Create and fit the model + model = LinearSplineLogisticRegression(n_knots=5) + model.fit(X, y) + + # Make predictions + y_pred = model.predict(X) + y_proba = model.predict_proba(X) + +Calibration with Splines +~~~~~~~~~~~~~~~~~~~~~~~~ + +Splinator provides calibrators that use splines to calibrate probability predictions: + +.. code-block:: python + + from splinator.estimators import CDFSplineCalibrator + from sklearn.model_selection import train_test_split + + # Split data + X_train, X_cal, y_train, y_cal = train_test_split(X, y, test_size=0.3) + + # Train a base model + base_model = LinearSplineLogisticRegression(n_knots=3) + base_model.fit(X_train, y_train) + + # Get uncalibrated probabilities (all classes) + proba_uncalibrated = base_model.predict_proba(X_cal) + + # Calibrate using CDF Spline Calibrator + cdf_calibrator = CDFSplineCalibrator(num_knots=6) + cdf_calibrator.fit(proba_uncalibrated, y_cal) + + # Transform test probabilities + proba_test = base_model.predict_proba(X_test) + proba_calibrated = cdf_calibrator.transform(proba_test) + +Integration with scikit-learn +----------------------------- + +All estimators in splinator follow the scikit-learn API and can be used in pipelines: + +.. code-block:: python + + from sklearn.pipeline import Pipeline + from sklearn.preprocessing import StandardScaler + from sklearn.model_selection import cross_val_score + + # Create pipeline + pipeline = Pipeline([ + ('scaler', StandardScaler()), + ('classifier', LinearSplineLogisticRegression(n_knots=5)) + ]) + + # Cross-validation + scores = cross_val_score(pipeline, X, y, cv=5) + print(f"Cross-validation scores: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})") \ No newline at end of file diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000..429f19d --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,3 @@ +sphinx>=7.0.0 +sphinx-rtd-theme>=3.0.0 +sphinx-autodoc-typehints>=2.0.0 \ No newline at end of file diff --git a/docs/source/api.rst b/docs/source/api.rst deleted file mode 100644 index 73594c7..0000000 --- a/docs/source/api.rst +++ /dev/null @@ -1,81 +0,0 @@ -.. py:class:: LinearSplineLogisticRegression - - Piecewise Logistic Regression with Linear Splines - - For more information regarding how to build your own estimator, read more - in the :ref:`User Guide `. - - :param demo_param: str, default='demo_param' - A parameter used for demonstration of how to pass and store parameters. - - :ivar coef_: ndarray of shape (n_features,) - Coefficients of the linear spline logistic regression model. - - :ivar intercept_: float - Intercept of the linear spline logistic regression model. - - :ivar knots_: ndarray of shape (n_knots,) - The knot locations used in the linear spline logistic regression. - - :ivar monotonicity_: str, default='none' - Whether the function is monotonically increasing or decreasing. - - :ivar n_features_in_: int - Number of input features. - - :ivar two_stage_fitting_: bool - Whether two-stage fitting is used. - - :ivar verbose_: bool - Verbosity flag. - - :param input_score_column_index: int, default=0 - The index of the column containing input scores. - - :param n_knots: int, optional, default=100 - Number of knots used in the linear spline logistic regression. - - :param knots: list of float or np.ndarray, optional, default=None - The knot locations used in the linear spline logistic regression. - - :param monotonicity: str, default='none' - Whether to enforce that the function is monotonically increasing or decreasing. - Valid values are 'none', 'increasing', and 'decreasing'. - - :param intercept: bool, default=True - If True, allows the function value at x=0 to be nonzero. - - :param method: str, default='slsqp' - The method named passed to scipy minimize. Supported values are 'slsqp' and 'trust-constr'. - For more details, see the documentation for `scipy.optimize.minimize`. - - :param minimizer_options: dict, optional, default=None - Some scipy minimizer methods have their special options. - For example: {'disp': True} will display a termination report. - For options, see the documentation for `scipy.optimize.minimize`. - - :param C: int, default=100 - Inverse of regularization strength; must be a positive float. - Smaller values specify stronger regularization. - - :param two_stage_fitting_initial_size: int, optional, default=None - Subsample size of training data for first fitting. - If two-stage fitting is not used, this should be None. - - :param random_state: int, default=31 - Random seed number. - - :raises ValueError: - If both `knots` and `n_knots` are non-null during fitting. - - :raises ValueError: - If `monotonicity` has an invalid value. - - :raises ValueError: - If `input_score_column_index` is negative. - - :raises ValueError: - If `method` has an invalid value. - - :raises ValueError: - diff --git a/docs/source/conf.py b/docs/source/conf.py deleted file mode 100644 index 40f6be7..0000000 --- a/docs/source/conf.py +++ /dev/null @@ -1,29 +0,0 @@ -# Configuration file for the Sphinx documentation builder. -# -# For the full list of built-in configuration values, see the documentation: -# https://www.sphinx-doc.org/en/master/usage/configuration.html - -# -- Project information ----------------------------------------------------- -# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information - -project = 'splinator' -copyright = '2023, Jiarui Xu' -author = 'Jiarui Xu' -release = '0.1.1' - -# -- General configuration --------------------------------------------------- -# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration - -extensions = [] - -templates_path = ['_templates'] -exclude_patterns = [] - - - -# -- Options for HTML output ------------------------------------------------- -# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output - -html_theme = 'sphinx_rtd_theme' # Replace 'sphinx_rtd_theme' with your chosen theme name -html_static_path = ['_static'] - diff --git a/docs/source/examples.rst b/docs/source/examples.rst deleted file mode 100644 index c07e9b4..0000000 --- a/docs/source/examples.rst +++ /dev/null @@ -1,10 +0,0 @@ -Examples -=================================== - -LSLR vs Isontonic Regression vs Sigmoid Calibration -=============== - - -This is `model comparison notebook `__ -that compares LSLR (Linear Spline Logistic Regression) against different calibration algorithms offered by sklearn. The -notebook is modified from scikit-learn `Calibration section `__ diff --git a/docs/source/index.rst b/docs/source/index.rst deleted file mode 100644 index b688db1..0000000 --- a/docs/source/index.rst +++ /dev/null @@ -1,21 +0,0 @@ -Welcome to splinator's documentation! -=================================== - -**Splinator** is a Python library for probabilistic calibration - -Check out the :doc:`usage` section for further information, including -how to :ref:`installation` the project. - -.. note:: - - This project is under active development. - -Contents --------- - -.. toctree:: - - Home - usage - examples - api diff --git a/examples/calibration_comparison.py b/examples/calibration_comparison.py index cf5edae..f681ed1 100644 --- a/examples/calibration_comparison.py +++ b/examples/calibration_comparison.py @@ -10,7 +10,7 @@ from sklearn.isotonic import IsotonicRegression from sklearn.calibration import calibration_curve from sklearn.metrics import brier_score_loss, log_loss -from splinator.estimators import LinearSplineLogisticRegression +from splinator.estimators import LinearSplineLogisticRegression, CDFSplineCalibrator from splinator.metrics import expected_calibration_error, spiegelhalters_z_statistic import time import warnings @@ -49,8 +49,10 @@ def run_comparison(n_samples=50000, n_features=20, n_knots=50): clf.fit(X_train, y_train) # Get uncalibrated predictions - pred_cal = clf.predict_proba(X_cal)[:, 1] - pred_test = clf.predict_proba(X_test)[:, 1] + probas_cal = clf.predict_proba(X_cal) + probas_test = clf.predict_proba(X_test) + pred_cal = probas_cal[:, 1] + pred_test = probas_test[:, 1] results = [] @@ -77,7 +79,18 @@ def run_comparison(n_samples=50000, n_features=20, n_knots=50): result['fit_time'] = isotonic_time results.append(result) - # 3. Linear Spline Logistic Regression (Splinator) + # 3. CDF Spline Calibrator (Splinator) + start = time.time() + cdf_cal = CDFSplineCalibrator() # num_knots=6 by default + cdf_cal.fit(probas_cal, y_cal) + cdf_calibrated_probas = cdf_cal.transform(probas_test) + cdf_calibrated = cdf_calibrated_probas[:, 1] + cdf_time = time.time() - start + result = evaluate_calibrator(y_test, cdf_calibrated, 'CDFSplineCalibrator') + result['fit_time'] = cdf_time + results.append(result) + + # 4. Linear Spline Logistic Regression (Splinator) start = time.time() lslr = LinearSplineLogisticRegression( n_knots=n_knots, @@ -140,7 +153,7 @@ def run_comparison(n_samples=50000, n_features=20, n_knots=50): print("="*80) uncalibrated_scores = all_results[all_results['method'] == 'Uncalibrated'][['brier_score', 'log_loss', 'ece', 'spiegelhalter_z']].mean() -for method in ['Sigmoid (sklearn)', 'Isotonic (sklearn)', 'LSLR (n_knots=50)']: +for method in ['Sigmoid (sklearn)', 'Isotonic (sklearn)', 'CDFSplineCalibrator', 'LSLR (n_knots=50)']: method_scores = all_results[all_results['method'] == method][['brier_score', 'log_loss', 'ece', 'spiegelhalter_z']].mean() improvement = (uncalibrated_scores - method_scores) / uncalibrated_scores * 100 print(f"\n{method}:") diff --git a/examples/qq.ipynb b/examples/qq.ipynb new file mode 100644 index 0000000..c55a287 --- /dev/null +++ b/examples/qq.ipynb @@ -0,0 +1,10 @@ +{ + "cells": [], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/spline_model_comparison.ipynb b/examples/spline_model_comparison.ipynb index f3685c9..1307689 100644 --- a/examples/spline_model_comparison.ipynb +++ b/examples/spline_model_comparison.ipynb @@ -2,19 +2,10 @@ "cells": [ { "cell_type": "code", - "execution_count": 23, + "execution_count": 1, "id": "e78896aa-cc54-43ec-9383-305384cdf0b5", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The autoreload extension is already loaded. To reload it, use:\n", - " %reload_ext autoreload\n" - ] - } - ], + "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2" @@ -22,54 +13,67 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 2, + "id": "be2926e7", + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, "id": "a340130b-0102-44f0-a6b9-0dfbe3662025", "metadata": {}, "outputs": [], "source": [ - "from splinator.estimators import LinearSplineLogisticRegression\n", + "from splinator.estimators import LinearSplineLogisticRegression, CDFSplineCalibrator\n", "\n", "from sklearn.datasets import make_blobs\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.calibration import CalibratedClassifierCV\n", - "from sklearn.metrics import brier_score_loss\n", + "from sklearn.metrics import brier_score_loss, log_loss\n", "from sklearn.svm import SVC\n", "from sklearn.pipeline import make_pipeline\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.neighbors import (NeighborhoodComponentsAnalysis, KNeighborsClassifier)\n", "\n", - "\n", "from sklearn.ensemble import GradientBoostingClassifier\n", "from sklearn.datasets import make_classification\n", "from sklearn.linear_model import LogisticRegression \n", "from sklearn.metrics import roc_auc_score\n", - "from sklearn.isotonic import IsotonicRegression" + "from sklearn.isotonic import IsotonicRegression\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "from splinator.metrics import expected_calibration_error, spiegelhalters_z_statistic" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 4, "id": "2b82661c-7b25-4222-ba79-11e1f948167e", "metadata": {}, "outputs": [], "source": [ - "from splinator.monotonic_spline import _get_design_matrix\n", - "from splinator.estimators import LinearSplineLogisticRegression" + "# Remove redundant imports - already imported above" ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 5, "id": "a034bd3d-3a2a-4dd5-a9c0-f95c761d7630", "metadata": {}, "outputs": [], "source": [ - "from sklearn.datasets import make_classification" + "# Remove redundant import - already imported above" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 6, "id": "57b144f0-73c1-4c54-8e02-5468a7a294dc", "metadata": {}, "outputs": [], @@ -88,7 +92,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 7, "id": "b552efac-8fbd-45c5-ad13-86e4d4c618d4", "metadata": {}, "outputs": [], @@ -103,17 +107,17 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 8, "id": "8b8c065e-8890-4e58-a4cf-4b9c204e4a63", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0.8958574486358871" + "0.893854084279575" ] }, - "execution_count": 29, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -124,7 +128,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 9, "id": "983f189a-a9ba-4348-8bb9-193db84e3e44", "metadata": {}, "outputs": [ @@ -132,100 +136,109 @@ "name": "stdout", "output_type": "stream", "text": [ - "Optimization terminated successfully (Exit mode 0)\n", - " Current function value: 811.3527861616368\n", - " Iterations: 44\n", - " Function evaluations: 56\n", - " Gradient evaluations: 44\n", - "Optimization terminated successfully (Exit mode 0)\n", - " Current function value: 1778.2245280845434\n", - " Iterations: 41\n", - " Function evaluations: 48\n", - " Gradient evaluations: 41\n" + "Fitting LinearSplineLogisticRegression...\n" + ] + }, + { + "ename": "AttributeError", + "evalue": "'LinearSplineLogisticRegression' object has no attribute 'predict_proba'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[9], line 11\u001b[0m\n\u001b[1;32m 3\u001b[0m lslr \u001b[38;5;241m=\u001b[39m LinearSplineLogisticRegression(\n\u001b[1;32m 4\u001b[0m n_knots\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m20\u001b[39m, \n\u001b[1;32m 5\u001b[0m monotonicity\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnone\u001b[39m\u001b[38;5;124m\"\u001b[39m, \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 8\u001b[0m two_stage_fitting_initial_size\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m2000\u001b[39m\n\u001b[1;32m 9\u001b[0m )\n\u001b[1;32m 10\u001b[0m lslr\u001b[38;5;241m.\u001b[39mfit(clf_pred_dev\u001b[38;5;241m.\u001b[39mreshape(\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m1\u001b[39m), y_dev)\n\u001b[0;32m---> 11\u001b[0m lslr_pred \u001b[38;5;241m=\u001b[39m \u001b[43mlslr\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpredict_proba\u001b[49m(clf_pred_test\u001b[38;5;241m.\u001b[39mreshape(\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m1\u001b[39m))[:, \u001b[38;5;241m1\u001b[39m]\n\u001b[1;32m 13\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFitting KSSplineCalibrator...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 14\u001b[0m ks_cal \u001b[38;5;241m=\u001b[39m KSSplineCalibrator(n_knots\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m6\u001b[39m)\n", + "\u001b[0;31mAttributeError\u001b[0m: 'LinearSplineLogisticRegression' object has no attribute 'predict_proba'" ] } ], "source": [ + "# Fit all spline-based calibrators\n", + "print(\"Fitting LinearSplineLogisticRegression...\")\n", "lslr = LinearSplineLogisticRegression(\n", " n_knots=20, \n", " monotonicity=\"none\", \n", - " minimizer_options={'disp': True}, \n", + " minimizer_options={'disp': False}, \n", " method='SLSQP', \n", " two_stage_fitting_initial_size=2000\n", " )\n", - "lslr.fit( clf_pred_dev.reshape( -1, 1 ), y_dev )\n", - "lslr_pred = lslr.predict( clf_pred_test.reshape( -1, 1 ))" + "lslr.fit(clf_pred_dev.reshape(-1, 1), y_dev)\n", + "lslr_pred = lslr.predict_proba(clf_pred_test.reshape(-1, 1))[:, 1]\n", + "\n", + "print(\"Fitting CDFSplineCalibrator...\")\n", + "cdf_cal = CDFSplineCalibrator(num_knots=6)\n", + "cdf_cal.fit(clf_pred_dev.reshape(-1, 1), y_dev)\n", + "cdf_pred = cdf_cal.predict_proba(clf_pred_test.reshape(-1, 1))[:, 1]" ] }, { "cell_type": "code", - "execution_count": 37, + "execution_count": null, "id": "e623a437-ea10-47f1-b9ec-e68ad29663f7", "metadata": {}, "outputs": [], "source": [ + "print(\"Fitting PyGAM...\")\n", "from pygam import GAM, s, te\n", "\n", "gam_model = GAM(s(0, n_splines=20, spline_order=1), distribution='binomial', link='logit', fit_intercept=True)\n", - "\n", - "gam_model.fit( clf_pred_dev.reshape( -1, 1 ), y_dev )\n", - "gam_model_pred = gam_model.predict( clf_pred_test.reshape( -1, 1 ))" + "gam_model.fit(clf_pred_dev.reshape(-1, 1), y_dev)\n", + "gam_model_pred = gam_model.predict_proba(clf_pred_test.reshape(-1, 1))" ] }, { "cell_type": "code", - "execution_count": 38, + "execution_count": null, "id": "7647e9cc-3f8d-4369-be53-1f4c0699ff56", "metadata": {}, "outputs": [], "source": [ - "import pandas as pd\n", - "%matplotlib inline\n", - "\n", - "from matplotlib import pyplot as plt" + "# Already imported at the top" ] }, { "cell_type": "code", - "execution_count": 39, + "execution_count": null, "id": "903fd313-8c25-435e-b3ce-9a2c7c4f07e0", "metadata": {}, "outputs": [], "source": [ + "# Create comparison dataframe\n", "df = pd.DataFrame({\n", - " 'x': clf_pred_test,\n", - " 'y': y_test,\n", - " 'lslr_pred': lslr_pred,\n", - " 'pygam_pred': gam_model_pred,\n", - "})\n", - "\n", - "def plot(df):\n", - " sub_df = df.sample(500).sort_values(['x'])\n", - " # plt.scatter(to_plot['x'], to_plot['y'], color='red')\n", - " \n", - " plt.plot(sub_df['x'], sub_df['lslr_pred'], color='red')\n", - " plt.plot(sub_df['x'], sub_df['pygam_pred'], color='blue')" + " 'uncalibrated': clf_pred_test,\n", + " 'y_true': y_test,\n", + " 'lslr': lslr_pred,\n", + " 'cdf_spline': cdf_pred,\n", + " 'pygam': gam_model_pred,\n", + "})" ] }, { "cell_type": "code", - "execution_count": 40, + "execution_count": null, "id": "b6a1f150-7a08-49e7-aa1e-d70ea4241c4e", "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAiMAAAGdCAYAAADAAnMpAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/NK7nSAAAACXBIWXMAAA9hAAAPYQGoP6dpAABGM0lEQVR4nO3de3zO9f/H8cc2OzhtznNopfyig1BkjkUtRMghk0IKKZWoHMqhKIeS9BXJMZWQY05NTMphRRMdiJxy3GxoY9jp+vz+eLexbOyabZ9t1/N+u12363Nd1+dz7XV13ep69j66WZZlISIiImITd7sLEBEREdemMCIiIiK2UhgRERERWymMiIiIiK0URkRERMRWCiMiIiJiK4URERERsZXCiIiIiNiqkN0FZIbD4eD48eMUL14cNzc3u8sRERGRTLAsi7Nnz1KxYkXc3TNu/8gXYeT48eMEBATYXYaIiIhkwZEjR7jhhhsyfD1fhJHixYsD5sP4+vraXI2IiIhkRmxsLAEBAam/4xnJF2EkpWvG19dXYURERCSfudYQCw1gFREREVspjIiIiIitFEZERETEVgojIiIiYiuFEREREbGVwoiIiIjYSmFEREREbKUwIiIiIrZSGBERERFbKYyIiIiIrRRGRERExFYKIyIiImIrhREREREXZVmwYAG0bQvJyfbVoTAiIiLigrZtgwYNoHNnWL4c5s61rxaFERERERdy6hQ8+ywEBlr8+CMUdT/PSI+36Ng40raaFEZERERcgGXBzJlQrZrFtGlgWW505TP+clRhmPs7FNm+ybbaCtn2l0VERCRXHD4MPZ9xsHadO+BGdX5jCs/TuPhO6NMH+vWDSpVsq09hREREpICyLJjxSTKv9E/m7EUvfLjA2wzlJf+v8Oz/Ajy7AkqUsLtMhREREZGCKOqkRffmEXyzowLgQQM2M7vUq1QdFgzP7oXChe0uMZXCiIiISAHz/dyjdHnGh+PxFfDhAu8UHU2/4SXw6LsOiha1u7wrKIyIiIgUEMmJDka3/ZE3vwnEgQe3u+3mqz7fUf3dQVCsmN3lZUhhREREpAA4/etRHm9ygm/PNADgqQpr+Cjk/yha43mbK7s2Te0VERHJzyyLP95ZRt1aCXx75l6KEMec7uuZfawZRWtUsbu6TFHLiIiISH7kcMCKFaweuIHgvSM5R3Eqex/n60VJ1HjkAburc4rCiIiISH6SlGQ2lBkzhhl/1KMPU0mmEE1u+ZuFW26gjL+H3RU6TWFERETkOq1eDStXwq23wr33Qq1aOTBe9OJFmDMHxo3DOniQUQxjBCMBeKrzBaZ9dhOentn8N3OJwoiIiMh1CAuDVq3SPufuDrffDnXqXLrVrJmFpT2SkswfWL3aBJETJ0jCg74+c5h2sRsAb7wBo0YVxs0tez6PHdwsy7LsLuJaYmNj8fPzIyYmBl9fX7vLERERAUxWqF0bfv0VqlUzAeTnn+Ho0SvP9fCA6tXTBpS77gJv7/+cePIkfPONCSDffgv//JP6UsINt9Cl9BoW7/w/3Nzgo4/g+Tw8WSazv99qGREREcmiqVNNEClVCjZtgjJlzPMnTkB4uAkmP/8M27aZjLFzp7nNnGnO8/KCGjUs7r3pJIFJW6i3fy63/r4Edy5rJyhdGlq0IKFFGzp91ZGvV7jj5QXz50O7drn/mXOCWkZERESy4MwZ+L//g9On4eOPzX5zGbEsOHbsUjj5eZuDbWFJnD7rdcW5JThDoO9uAmvGU69dBeo+cSvF/Dzo2NGMS/H2hmXLoEWLnPts2SWzv98KIyIiIlnwyiswYQLceSfs2AGFMtvXcPQotG+PtW0bh6jMNu5lq1djfioexM+xt3Ix8co3KlXKhB4fH1i+HB56KFs/So5RN42IiEgO2bcPJk0yx++/70QQ2bgROnaEkydxK1GCm7u14eZ27ejUsCF4epKYCL/9Bj/+aG4//QR795ogUrgwrFgBDz6YYx/LNgojIiIiTho0CBITTVdJ8+aZuMCyYMoUePllM+q1Rg3T13LzzWlO8/SEe+4xt5SBqadPm66dm24yg2QLIoURERERJ2zbBkuWmOm748dn4oKEBHjuOZg1yzwODjYjWDO5e26pUtCsWdbrzQ8URkRERJzw1lvmvmtXM17kqs6cgfbtYcMGk17GjoVXXyVfLwqSAxRGREREMmnbNli1yqwZMnToNU4+cMCshvbnn2Y51q++gocfzpU68xuFERERkUxKaRV54gkzrTdDP/8MLVtCVBTccINJMDVq5EqN+ZG73QWIiIjkBz//bDKFu/s1WkXWr4emTU0QueceMyVGQeSqFEZEREQyYaTZk44nnzQb4qVryRLTFXPunJmDu2EDVKyYWyXmWwojIiIi17B7t1njw83NbEyXrpkz4bHHzOyZ9u1NM0rx4rlaZ36lMCIiInINH35o7tu0gapV0zlh4kTo2RMcDnP/1Vfp7IAnGVEYERERuYpTp+Czz8xx//7pnPDBB5deGDgQpk0z020k0xRGRERErmLaNLhwAWrVgvvu+8+LEybAgAHmeNgws46I1hBxmsKIiIhIBhIS4KOPzHH//v/JGRMmmN3ywASRt95SEMkihREREZEMLFwIx49D+fLQufNlLyiIZCuFERERkQykDFzt2xe8vP59cvp0BZFspjAiIiKSjl9/Ncu/e3rCs8/+++SiRdCnjzkeNEhBJJsojIiIiKRj9mxz37o1lC0LrF0LXbqY6bu9e8OYMQoi2URhRERE5D8SEuCLL8xxjx7A1q3Qrh0kJkLHjjBlioJINtJGeSIiIv+xahVER5uBqy1u2g1NHoa4OAgKMilF64hkK7WMiIiI/EdKF023DnEUeqQFnD4NdevC0qVaWTUHKIyIiIhcJiICVq82xz02Pg2HD5ud8VavhmLF7C2ugFIYERERucwXX0ByMtQrvZfbfv0KSpY0/TalS9tdWoGlMCIiIvIvy4JZs8xxj1PjoVAhWLLEtIxIjlEYERER+dfWrbB7NxTmPMEsgE8+gSZN7C6rwFMYERER+dfsMREAdGAxfgP7wNNP21yRa1AYERERAS7sOcz85YUB6NHwL7OomeQKhREREZELF1j58GRiLD9u8jpOk28Ggbt+InOL/kmLiIhrsyzo25dFB+8BoPPTRXEvXtTmolyLwoiIiLi2adO4MHseq2gFQMdn/GwuyPUojIiIiOv66Sd48UXW0Jw4inHTTVC7tt1FuR6FERERcU2nTsFjj0FiIosC+gPQoYP2v7ODwoiIiLgeyzLTdo8cIb7KHayIuQ8wYURyn8KIiIi4no8+guXLwcuLdf1WEBvrRsWKUK+e3YW5JoURERFxLdu3w6uvmuP332fxL7cA0L69ZvPapZDdBYiIiOSas2ehc2dISIC2bUns3Zdl5c1LHTvaW5orUwYUERHX0bcv/PUXBATArFls+N6NM2egXDlo1Mju4lyXwoiIiLiGzz6Dzz83fTFffgmlSrFokXmpXTvw8LC3PFemMCIiIgVbfDwMHw7PPGMev/UWNGpEcjIsW2ae0iwae2UpjEyePJnKlSvj4+NDYGAgW7duver5EydOpFq1ahQuXJiAgAD69+/PxYsXs1SwiIhIpm3fDnXqwKhRkJQEjz8OQ4YAsG0bnDwJfn7QpIm9Zbo6p8PIggULGDBgACNGjGD79u3UrFmT5s2bc/LkyXTP//LLLxk8eDAjRoxg9+7dzJw5kwULFvD6669fd/EiIiLpSkyEkSMhMBB+/x3KloWFC2Hu3NT+mJUrzanNm4Onp421ivNhZMKECfTq1YsePXpwxx13MHXqVIoUKcKsWbPSPX/Lli00bNiQLl26ULlyZZo1a8bjjz9+zdYUERGRLNm1C+rXhxEjTGtIhw7wxx9musxly6uuWmXuH3nEpjollVNhJCEhgfDwcIKCgi69gbs7QUFBhIWFpXtNgwYNCA8PTw0fBw4cYPXq1bRs2TLDvxMfH09sbGyam4iIyFUlJ8P48XDPPRAeDiVLmoGqCxealpHLHD0KO3aYbNKihT3lyiVOrTMSHR1NcnIy/v7+aZ739/fnzz//TPeaLl26EB0dTaNGjbAsi6SkJPr06XPVbpoxY8bw1ltvOVOaiIi4sn374KmnYPNm87hlS5g+HSpWTPf01avNfWDgFTlFbJDjs2k2bNjA6NGjmTJlCtu3b2fJkiWsWrWKUaNGZXjNkCFDiImJSb0dOXIkp8sUEZH8yOGAyZOhZk0TRIoXhxkzzICQDIIIXOqiadUql+qUq3KqZaRMmTJ4eHgQGRmZ5vnIyEjKly+f7jXDhg2ja9eu9OzZE4C77rqLuLg4evfuzRtvvIF7Omvvent74+3t7UxpIiLiag4fNpvdhYaax02bwuzZcNNNV73s4kVYt84ca7xI3uBUy4iXlxe1a9cmNOWLBxwOB6GhodSvXz/da86fP39F4PD4dySzZVnO1isiIq7OskzouOsuE0QKF4b//c8kjGsEEYDvv4fz56FSJdOgIvZzem+aAQMG0L17d+rUqUPdunWZOHEicXFx9OjRA4Bu3bpRqVIlxowZA0Dr1q2ZMGECd999N4GBgezbt49hw4bRunXr1FAiIiKSKSdPQs+esGKFeVyvHsyZA1WrZvotvv3W3DdvnmZyjdjI6TASHBxMVFQUw4cPJyIiglq1ahESEpI6qPXw4cNpWkKGDh2Km5sbQ4cO5dixY5QtW5bWrVvzzjvvZN+nEBGRgm/NGujeHSIjwcvLLGT2yitOr+OeEkaaNcuBGiVL3Kx80FcSGxuLn58fMTEx+Pr62l2OiIjkpvh4eP11mDDBPK5e3UzZvesup9/qxAkzrtXNzTSylCmTzbVKGpn9/dbeNCIikiccOGDW/kjzv8h79pgFzFKCSN++sHVrloIIwNq15v6eexRE8hKnu2lERESy044d8PbbsHixeRwQAB3aW3T0WUn9/z2O+4U4KF0aZs2CNm2u62+lhBF10eQt6qYRERFbJCWZFdvHjLnUGlKkiJnpkqIix+hww090nNiYho+WdXZ4SBqWBRUqmCEn69ebmcCSs9RNIyIiedbx4xAUBKNHm5DQqZPZzy46dCfLyvfhST7HlxiOU4lJR9tzf8eyVKoEzz9vgkRSkvN/87ffTBApUgQaNMj+zyRZpzAiIiK5au1aqFXLrPdRvDjMnw8L5lvcGTaDwk0CaRvxCZ/fOJSTP+xh5UqzynuJEiZIfPwxPPigGYT63HOwYYPZkiYzUmbR3H8/aF3NvEVhREREckVysumWad4coqKgRg34+WcIfiTOTNnt1cvMnGnZErZvx7txXVq1MuubRUZCSIhZYqR0aXP91Kmmq+WGG+DFF2HjRrM6fEY0XiTv0pgRERHJcVFR8Pjjl1Zu79ULPvwQCh/aDR07wq5d4O4O77wDAwea4wwkJsJ338GCBbB0KZw5c+m1ihXhsccgONhsgpfyNkePwq23mqXgf/8d7rwzBz+spMrs77fCiIiI5Kht26BDBzhyxIzX+OQTePJJzFohvXtDXJwZWTpvnulDcUJCglkF/quvYNkyiIm59FpAgBmL0qkTDB9u1kwLDISwMK28mlsURkRExHYzZ5pBpwkJpmVi6VK4s8pFePllk0oAHnjABJN/V/LOqvh4My5kwQL4+ms4dy7t6z4+8MsvcNtt1/VnxAmaTSMiIraJjzeNHj17miDStq1pIbmz8AFo2NAEETc3GDbMJIjrDCJgBqW2bg1ffGFWV12yBDp3hqJFzevjximI5FVa9ExERLLVkSOmW2bbNpM33n4bBg8G9xVfm4GqMTFm+dMvvjCjWXNA4cLQrp25nT9vaqpWLUf+lGQDhREREck269ebwaPR0VCqlOl9af6QA95802xsB2aRjwULzDSYXFCkiIJIXqduGhERuW6WBe+9Bw89ZILI3XebabvN68WYPpqUINKvn1kcJJeCiOQPahkREZHrEhcHTz9tZrSAWaRsyhQo/PefEPio2ezO2xumT4euXe0sVfIohREREcmyv/+GRx81m915esL//gfPPgtuK5ab+btnz5pWkKVLoU4du8uVPErdNCIikiUbN8K995ogUq6cGS/Sp7cDt5Fvma6Zs2fhvvsgPFxBRK5KYURERJw2fbrZIyYqyowP2bYNGtWIhfbtzWBVgBdeMCuSlStna62S9ymMiIhIpiUnm31gevc2y7J36gSbNsGNF/ea5U2//tqMD5k9GyZNMn03ItegMSMiIpIp589Dly4mb4DZRmbIEHBb+61JJTExUKmSGR9y7732Fiv5isKIiIhc06lTZnXTsDDT8DF3LnRob8FHH0H//qbJpEEDs+xpNqymKq5FYURERK7q0CFo0cLM0C1RApYvh8b1EuG5Fy/tL9O9uzn29razVMmnFEZERCRDO3fCww/DiRNmhm5ICNxZ/hQ062gWL3Nzg3ffhVde0Va4kmUKIyIikq5t26BZM/jnH6heHb75Bm648BfUbwV//QXFisG8efDII3aXKvmcZtOIiMgVwsIgKMgEkQYNzJoiNxzcCPXqmSBy442wZYuCiGQLhREREUlj0ybTIhIba9YsW7MGSqz8wqST06fNTJmffoK77rK7VCkgFEZERCTVhg3QvDmcOwcPPACrV1kUG/+m2VMmIQE6dDAnlS9vc6VSkCiMiIgIYBZLbdnSrCfSrBmsWBRP0ee6wVtvmRMGDjS74RUpYm+hUuBoAKuIiLBmjdnw7uJFE0gWfxKNT5t2ps/GwwM+/hh69bK7TCmgFEZERFzcypWm9yUhAdq0sfiq2yq87+8HBw6Ary8sXmzGi4jkEIUREREXtmyZWck9MRHa3xfFvMgOeHXcaF686SZYtQruvNPWGqXg05gREREXtXgxPPbYvxvelf+B+T9UxOunjVC4MAweDDt2KIhIrlDLiIiIC1qyBIKDLZKT3ejCXOZEdKdQITfo9RwMGwYVKthdorgQtYyIiLiYZZOOENwxieRkN57kcz6jG4Ue7wS7d8OUKQoikuvUMiIi4ip27mT58yF02tKfJArRhbl82nIhHu+EQ61adlcnLkxhRESkoNu2Dd5+m9XLE+nIMhLxovMNG5mz+HY86i63uzoRddOIiBRYmzdDixZQty6hy8/RniUk4kWn5jF8frAxhereY3eFIoBaRkREChbLgu++g1GjzLLtwGb3xrRxX018kjdt28IXC/0opP/6Sx6ilhERkYLAsiAkBBo1ggcfNEHE05Ntj77Nw0U2cD7Jm+bNYcEC8PS0u1iRtJSNRUTyM8uCb7+FESPMTroA3t7Qqxe/tn6D5p3Lc/YcNGlipvN6e9tarUi6FEZERPIjy4K1a+HNNyEszDxXuDA89xy8+ip/xlQg6D44cwbq14fly7W/neRdCiMiIvmJZUFoqGkJ2bLFPOfjY0LIwIFQvjz795uemqgouOceWL0aihe3t2yRq1EYERHJDywL1q83LSGbNpnnfHygTx8YNAjKlwfg8GF44AE4fhyqVzc9OCVK2Fa1SKYojIiI5GUps2PefBM2/ruBnbf3pRBy2Wqpx4+bIHL4MFStanpxSpe2p2wRZ2g2jYhIXrVhgxl5+uCDJoh4e8OLL8KBAzBxYmoQsSyz+26jRrB/P9x8s+nJ+bexRCTPU8uIiEhe8/33piXk33VC8PKC3r3NTrqVKqU59ccf4bXXLvXc3HijCSI33JCrFYtcF7WMiIjkFRs3mn6WJk1MEPHyguefN80dkyalCSL790OnTmamzKZNZiLNG2/Ab7+ZlhGR/EQtIyIidgsLg2HDTJMGmFXJevaEIUMgICDNqadOmcVVp0yBxERwc4OnnoKRI9UaIvmXWkZERP4VEgJFi8L998MXX8CFCzn8B3fsgEcegQYNTBDx9IRnn4V9+0zauCyIXLwI774LVarAhx+aINKihXmLWbMURCR/UxgREfnX22/D+fPwww/QtavpFXnpJdP1ka127zZ9LHffDatWgYcHPPMM/PUXTJ1qBn78y+EwwahaNTN5JiYGatUyM2W++QZq1Mjm2kRsoDAiIgL8+afZ5NbDA15/HW66yaxeOmmS+cGvVw9mzoRz567jjxw4AN27mwVAFi40fSyPPw67dsGMGeaPXiY0FOrUMcHo8GHT+jFnDoSHQ1DQ9X1ekbxEYUREBBM0AFq2hHfeMQNEQ0KgQwcoVMhs+9Kzp5lN++yz8PPPZkptphw7ZlZIrVYNPvvMNHc8+ijs3AlffmkWBbnM77+bOoKC4JdfwNcXxoyBvXuhWzdw13+5pYBxs6xM/+tkm9jYWPz8/IiJicHX19fuckSkgElIMK0OUVHw9dfQpk3a1yMjTYvEjBmmJyVFrVrQqxc88QT4+aXzxlFRMHasGf9x8aJ5rlkz0x90771XnH78OAwfDrNnm7xSqJDJMMOGQdmy2fZxRXJNZn+/FUZExOUtXQrt25tWj8OHTQhIj2WZ8STTp8OiRRAfb54vXNgMAenZExo2BLeYf2D8eLMwWVycOalRI9Pkct99V7zv2bPw3nvw/vtmzApAx44wejTcemu2f1yRXJPZ32819omIy1u40Nx36ZJxEAEzxCNlps3x42ZWS/XqZtbNnDnQuDHc6R/NhArvEf3OVBNEatc2/T0//HBFEDl8GIYONTNkRo0yQaRBA7P/3cKFCiLiOtQyIiIuLSHBdIHExpoBrA0aOHe9ZcFPGxOYPnAv83+6mfMUBcDLLYF2DU7S681KNH3ALXWch8NhNq+bMsVMpHE4zPO33mp6dNq1M6FHpCBQN42ISCaEhMDDD5t9XI4dy8Lg0PBwM6p01y5iKc68sv2YXuxlwg9e2qGuShUzc9fDAz75xEyqSfHAA2aR1TZtzDIjIgVJZn+/tQKriLi0pUvNfdu2WQgiY8eafpbkZPD3x/ftt3m2e3ee9fTkl1/M2JK5c83MnNdfv3SZnx/06GE23q1WLds+iki+pZYREXFZyclmYbPISFizxkx0ybSpU81UF4DgYJg8GUqXvuK08+fN+I85c8yqqT16QOfOUKRI9nwGkbxM3TQiItewebOZ5OLnBydPmn3pMiU0FJo3N2lm5Egz91ZErqDZNCIi1/D11+a+VSsngsjevWbebXKyWWBk6NAcq0/EVSiMiIjLWr7c3D/6aCYvOHPGbGz3zz9Qv75ZBU1TX0Sum8KIiLikPXvMzdPT9LhcU2IiPPaYWYL1xhvNyFcfnxyvU8QVKIyIiEtascLcN21q9n65Kssy2/eGhkLRouZif/8cr1HEVSiMiIhLSumi+e8+NOn66CMze8bNDebNM9v4iki2URgREZcTHW1m0gC0bn2Nk0NC4OWXzfG772biAhFxlsKIiLic1avNMuy1apnhHxnatcusIeJwmAVCXnklt0oUcSkKIyLicjLVRRMdbVpBYmPNDngp3TQiku2yFEYmT55M5cqV8fHxITAwkK1bt171/H/++Ye+fftSoUIFvL29qVq1KqtXr85SwSIi1+PiRdPzAlcJIwkJ0KGD2UTm5pthyRInFiIREWc5vTfNggULGDBgAFOnTiUwMJCJEyfSvHlz9uzZQ7ly5a44PyEhgYceeohy5cqxaNEiKlWqxN9//02JEiWyo34REads2ABxcVCxItxzTzonWJZZ5v2HH8w0m5UroUyZ3C5TxKU4HUYmTJhAr1696NGjBwBTp05l1apVzJo1i8GDB19x/qxZszh9+jRbtmzB898tKStXrnx9VYuIZFFKF03r1hn0ukyYALNmmV3zFiyAO+7I1fpEXJFT3TQJCQmEh4cTFBR06Q3c3QkKCiIsLCzda5YvX079+vXp27cv/v7+VK9endGjR5OcnHx9lYuIOMmyrjFeZNUqeO01c/zBB9CiRa7VJuLKnGoZiY6OJjk5Gf//LPbj7+/Pn3/+me41Bw4cYP369TzxxBOsXr2affv28fzzz5OYmMiIESPSvSY+Pp74+PjUx7Gxsc6UKSKSrl9+gWPHzI65Dzzwnxf37oUuXUxi6d0bXnzRlhpFXFGOz6ZxOByUK1eOadOmUbt2bYKDg3njjTeYOnVqhteMGTMGPz+/1FtAQEBOlykiLiClVaR58/+s5H72LLRrZ2bONGwIkyZp5oxILnIqjJQpUwYPDw8iIyPTPB8ZGUn58uXTvaZChQpUrVoVDw+P1Oduv/12IiIiSEhISPeaIUOGEBMTk3o7cuSIM2WKiKQr3S4ay4KnnjJrilSsCIsWaeaMSC5zKox4eXlRu3ZtQkNDU59zOByEhoZSv379dK9p2LAh+/btw+FwpD63d+9eKlSogFcG/8J7e3vj6+ub5iYicj2OHDHdNG5u0KrVZS+MGXNp6u7ixZDB/1iJSM5xuptmwIABTJ8+nTlz5rB7926ee+454uLiUmfXdOvWjSFDhqSe/9xzz3H69Gn69evH3r17WbVqFaNHj6Zv377Z9ylERK4hZWO8Bg2gbNl/n/zmGxg61BxPngz16tlSm4irc3pqb3BwMFFRUQwfPpyIiAhq1apFSEhI6qDWw4cP4+5+KeMEBASwZs0a+vfvT40aNahUqRL9+vVj0KBB2fcpRESu4Youmn37Lg1YffZZ6NnTttpEXJ2bZVmW3UVcS2xsLH5+fsTExKjLRkScFhtr1i1LTITdu+G2G86ZVpA//jBNJd99p3EiIjkgs7/fTreMiIjkN99+a4LIrbdCtaoWBPcwQaRCBQ1YFckDtFGeiBR4l3fRuL33rgkgnp7mvkIFe4sTEbWMiEjBlpRkFlYFaFN+Kwz8d4D9pEmmi0ZEbKeWEREp0LZsgdOnoVSJZBq83dIMWO3VywxaFZE8QWFERAq0lC6aVqymUMwpCAw0rSIikmcojIhIgbZypZkw2Pqfz8Df3yxs5u1tc1UicjmNGRGRAmv/ftizx41CJNLMYz0s+hoqVbK7LBH5D7WMiEiBtep9s5t4YzbiN+ltaNTI5opEJD0KIyJSMB08yMrpJwBodW8U9Oljc0EikhGFEREpeM6f51ybLnyfZKbutpr+qNkhT0TyJIURESlYLAt69mTd7/4k4M0tNyZRrYYGrIrkZQojIlKwfPABzJvHKrfWALRqW0iNIiJ5nMKIiBQcoaHw2mtYwGrfYABatbK3JBG5NoURESkYDh2C4GBwONjxyDCOxxSjSBG4/367CxORa1EYEZH87/x5aNcOTp2COnVYVXsYAEFB4ONjc20ick0KIyKSv1kW9O4NO3ZA2bKwZAmr1ngC8Mgj9pYmIpmjMCIi+duHH8LcueDhAV99RZRPAD/9ZF5q2dLe0kQkcxRGRCT/+u47ePVVc/z++9CkCd98YxpLatXSyu8i+YXCiIjkT4cPQ6dOkJwMXbvCSy8BsGqVeVmzaETyD4UREcl/LlwwA1ajo+Huu+GTT8DNjcREWLPGnKIwIpJ/KIyISP5iWWafme3boUwZWLoUChcGYMsWiIkxT9eta3OdIpJpCiMikr989BF89lnqgFVuuin1pZQumhYtzMsikj8ojIhI/vH999C/vzl+7z1o2jTNyxovIpI/KYyISP5w5Ag89pgZsNqlC7z8cpqXDx2CXbtMi0jz5rZUKCJZpDAiInnfxYvQvj1ERZk5u9On89/d71JaRRo0gJIlc79EEck6hRERydssC557Dn7+GUqVMgNWixS54rSUMKJVV0XyH4UREcnbpkyBTz8Fd3dYsAAqV77ilLg4WL/eHGu8iEj+ozAiInnXxo2XxoaMG2d2vkvH+vUQH28m1txxR+6VJyLZQ2FERPKmo0ehY0dISoLOneGVVzI89fJZNP8ZSiIi+YDCiIjkPfHx0KEDnDwJNWrAjBkZpgzL0pRekfxOYURE8hbLgr59YetWMy1m6VIoWjTD03/7zTSiFC58xbIjIpJPKIyISN7yyScwc6YZsDp/Ptxyy1VPT2kVeeCB1FXhRSSfURgRkbxj8+bU3XcZPRqaNbvmJeqiEcn/FEZEJG84ftwMWE1MNCutDhx4zUtOnYKwMHOsMCKSfymMiIj9UgasRkRA9eowa1ampsWsWQMOB9x1F9x4Yy7UKSI5QmFEROz30kvw449QogQsWwbFimXqMnXRiBQMCiMiYq9p08zNzQ3mzYMqVTJ1WVISfPONOVYYEcnfFEZExD5hYfDCC+b4nXegRYtMX/rjj3DmjJn9W69eDtUnIrlCYURE7HHihBknkpho7gcPdurylC6aFi2gUKEcqE9Eco3CiIjkvoQEM3PmxAmzmczs2U6v467xIiIFh8KIiOS+fv1gyxbw8zMDVosXd+ryw4fNyqvu7k717IhIHqUwIiK5a8YMmDrVtITMnQu33ur0W6xebe7r1YPSpbO5PhHJdQojIpJ7fvrJ7DsDMHJklvtY1EUjUrAojIhI7oiIMANVExLg0Ufh9dez9DYXLkBoqDlWGBEpGBRGRCTnJSSYJd6PHYPbb4fPPjMDPrIgNNQEkhtugBo1srlOEbGFwoiIZKvk5HSeHDAANm0CX19YutTpAauX+/prc9+2rdMTcEQkj9LsfBG5LklJZu2y5cvNbe9e8PY2K7oXKwZliKb83y0oTy3Kt7iP8murUv43KF/+0i2Tq7/jcMCKFea4bduc+0wikrvcLMuy7C7iWmJjY/Hz8yMmJgZfX1+7yxFxeWfPwrffmvCxapXZPfd6FCmSNpykd/P3h4MHoUkT08ASFQVeXtnycUQkh2T291stIyKSKUePmlaJ5cth/XozDCRFqVJmMGmbNtCwoVlU9dyhaGI7Pk10lIOImi2I6PA8EZHuRESQejtxAs6fN7cDB8wtM1q2VBARKUgURkQkXZYFO3de6n4JD0/7epUqpqukbVto0OA/S7JfvAhPtoeojVCtGnzfFfzSH6J27pwJJpGRpAkq6d2Sksw1PXrkzGcWEXsojIhIqoQE2LDhUgA5cuTSa25uUL++af1o0wZuuy2DAaQpS71v3GgGqi5bZlZazUCxYvB//2duV+NwmI3xHA4oWzYrn05E8iqFEREXd/o0fPONmaUSEmLGg6QoXBiaNTOtH61aQbly13izpCR4/HEzkMTHxySa227Lljrd3bXaqkhBpTAi4oISE+GTT2DxYtOAcfl03PLloXVr0/rx4IMmkGTK2bPQrZtpCfHyMummSZMcqF5EChqFEREX9NJLZnuYFHfddan7pU6dLKxHtns3tG8Pf/4Jnp6waJFpUhERyQSFEREX89VXl/ape+cdCA6GW265jjdcuBCeftqMRK1UyQSRevWyrV4RKfgURkRcyP790LOnOR4yxNyyLCkJBg+G9983j5s2hfnzMzGwREQkLS0HL+Ii4uOhUycztKNxY3jrret4s4gIM6AkJYgMHGhWQVMQEZEsUMuIiIt47TXYvt3MSPnyy/+sC+KMzZvNpncnTpipu7Nnm914RUSySC0jIi7gjz9g0iRz/PnnZsdbp1kW/O9/ZobMiRNwxx2wbZuCiIhcN4URERfw4YfmvkMHePjhLLxBXBw88QT062fGigQHw08/mdVVRUSuk7ppRAq46GjTGgLQv38W3mDvXpNifv8dPDxg/HgTStJdflVExHkKIyIF3CefmK1i6tQxe8g4Zdky6N4dYmPNamhffWVGv4qIZCN104gUYAkJMHmyOX75ZScaM5KSzLzfdu1MEGnUyIx+VRARkRyglhGRAmzhQjPWtEIFMwEmU6KizP4yoaHm8csvw7vvmpVVRURygMKISAFlWTBxojnu29dsF3NNW7ea8SFHj0KRIjBzJnTunJNlioiom0akoNqyBX7+2Wye27v3NU62LLNGfOPGJohUrWqCiYKIiOQChRGRAiqlVeTJJ6Fs2auceOEC9OgBzz1nBpm0a2fWD7nzztwoU0REYUSkIPr7b1iyxBz363eVE/fvN1Ns5swxW/WOGweLF4Ovb67UKSICWQwjkydPpnLlyvj4+BAYGMjWrVszdd38+fNxc3Pj0UcfzcqfFZFM+ugjcDggKAiqV8/gpCVL4J57YMcO03Sydq3ZY0brh4hILnM6jCxYsIABAwYwYsQItm/fTs2aNWnevDknT5686nWHDh3i1VdfpbGmBorkqHPnYPp0c/zyy+mckJAAAwaYgaqxsaZlZPt2eOCB3CxTRCSV02FkwoQJ9OrVix49enDHHXcwdepUihQpwqxZszK8Jjk5mSeeeIK33nqLW2655boKFpGrmzMHYmLg1lvTWfr9yBGzt8wHH5jHr7wCGzZkcbMaEZHs4VQYSUhIIDw8nKCgoEtv4O5OUFAQYWFhGV43cuRIypUrxzPPPJOpvxMfH09sbGyam4hcm8NxaR+afv3MMJBUISFw990QFgZ+frB0qVnaXeuHiIjNnAoj0dHRJCcn4+/vn+Z5f39/IiIi0r1m06ZNzJw5k+kp7caZMGbMGPz8/FJvAQEBzpQp4rK++Qb++stkje7d/30yORmGDYOWLeHUKTNOZPt20NgtEckjcnQ2zdmzZ+natSvTp0+nTJkymb5uyJAhxMTEpN6OHDmSg1WKFBwp03l79YJixYDISGjWDN5+26wl0qcPbN4M6i4VkTzEqRVYy5Qpg4eHB5GRkWmej4yMpHz58lecv3//fg4dOkTr1q1Tn3M4HOYPFyrEnj17qFKlyhXXeXt74+3t7UxpIi7v999h3TrTNfPCC8APP5hFy06cgKJFYdo06NLF7jJFRK7gVMuIl5cXtWvXJjRlzwpMuAgNDaV+/fpXnH/bbbfx22+/sWPHjtRbmzZtaNq0KTt27FD3i0g2Shkr0r6dxU3zxkLTpiaI3HGHWcRMQURE8iin96YZMGAA3bt3p06dOtStW5eJEycSFxdHjx49AOjWrRuVKlVizJgx+Pj4UP0/ixyUKFEC4IrnRSTroqLg88/N8csnBsGQ98yDrl3h449Ny4iISB7ldBgJDg4mKiqK4cOHExERQa1atQgJCUkd1Hr48GHc3bWwq0humjYN4uPhXq8dNNjyHnh7m5XPnnlGi5iJSJ7nZlmWZXcR1xIbG4ufnx8xMTH4aplqkTQS4i0qlzvPidiizKULXapshUWLoFYtu0sTEReX2d9vNWGI5GcxMSy8/yNOxBalIsfo2M4B4eEKIiKSryiMiORX27Zh3X0PH/xkBo/3bfU3XovnmUVGRETyEYURkfzGssxy7g0b8t3BmwinDoW9k+n9aQONDxGRfMnpAawiYqNTp6BHD1ixAoB3/SdAJDzTywMn1hUUEclT1DIikl9s2mTGgqxYAd7e7Hx9AWsia+HhYTbhFRHJrxRGRPI6hwNGjza77R49ClWrwo8/8u6hTgB06gQ332xviSIi10NhRCQvi4yEFi3gjTfMhndPPgk//8yhErVYsMCc8tpr9pYoInK9FEZE8qp166BmTVi7FooUgdmz4bPPoHhxPvjAZJOHHoK777a7UBGR66MwIpLXJCXBsGFmt93ISKhe3ewt89RT4ObGqVMwY4Y5deBAWysVEckWmk0jkpf8/bfpitm0yTzu3RsmToTChVNPmTwZzp83LSIPPmhPmSIi2UktIyJ5xfz5pltm0yYoXhzmzYNPPkkTRM6fh0mTzPGgQVpWREQKBrWMiNjt7Fl48UWYM8c8rlcP5s6FW2654tRPP4XoaDN7pkOH3C1TRCSnqGVExE5bt5r+ljlzwN3djBXZuDHdIJKUBO+/b45feQUK6X8lRKSA0H/OROyQnAzvvgvDh5uUceON8MUX0LhxhpcsXgwHDkDp0mYRVhGRgkJhRCS3HTkCXbvC99+bx506mbEhJUpkeIllmewCpkenSJGcL1NEJLeom0YkNy1ebAapfv89FC1q1g6ZP/+qQQRg/XrYvt2EkBdeyJ1SRURyi1pGRHJDXBy8/PKlBULq1IEvv4Rbb83U5SmtIs88Y7ppREQKErWMiOS07dvhnntMEHFzg8GDYfPmTAeRHTvg22/RhngiUmCpZUQkpzgcMGECvP46JCZCpUrw+efQtKlTb5PSKtKpE1SunP1liojYTWFEJCccPw7du5v9ZQDat4dp05zuYzl0CL76yhxrQzwRKajUTSOS3UJCzCDVdevMiNNp02DRoiwN9nj3XW2IJyIFn8KISHZJTIQhQ+Dhh80yqbVqQXg49OqVpXXbjx+HmTPN8RtvZG+pIiJ5ibppRLLDkSPQuTNs2WIe9+0L48eDj0+W33L8eEhIgEaN4L77sqlOEZE8SGFE5HqtXGnGh5w+Db6+pjmjY8fresuoKLMOGsDQodoQT0QKNnXTiGRVYiK8+iq0bm2CSJ068Msv1x1EACZONDv01q4NzZpdf6kiInmZWkZEsuLQIdMt89NP5nG/fjBuHHh7X/dbnzkDkyaZY7WKiIgrUBgRcdayZWanun/+Mcu4z54Njz6aqUuTk02DSmKiGQ9y+X3K8bhxcPYsVK8Obdrk4OcQEckjFEZEMiniSCKbn/uCQ6t+5xwvca7CrZx7oA3nFvtybg6cO2ducXHm/sKFKwOHw5H5v/fGG+CujlQRcQEKIyLpsCz46y/YtMncNm5IYt9BT6DHpZNOAHOv/295eYGn56V7T09o0sSsuCoi4goURkSApCSzB8ymTbBxo7k/efLyMwrhhoMa7r9TvVFJfO8MoFgxs/FusWJX3ooWhcKF0waM9EKHh4fGhIiIKIyIS4qLM2NPU8JHWJh57nLe3lD3xggaH5hDo+QNNPi/KPxWfAG3BdhTtIhIAaUwIi4hKspslJvS6rF9u2kNuVyJEtCwITRuDI0CE6kz7xW8p/07raVtW/hsvVlHREREspXCiBRIUVGwevWllo89e64854YbTPBo3Niscnrnnf8OGD1xwqwVsmWL6UN56y2NJhURyUEKI1Lg/PUXNGhgtoe53J13XgoejRvDjTemc/HmzfDYYyaQ+PnB3LnQqlWu1C0i4qoURqRAiY6Gli3N/f/9H7Rvb8JHw4ZQqtRVLrQs+OADGDjQLAZyxx1mPZFbb82t0kVEXJbCiBQYyckmfOzbB5Urmy4af/9MXBgTA08/DUuWmMedO8P06WZajIiI5DiFESkwJk0y40N8fc14kUwFkZ07zfiQffvMXNsPPoDnn9d8WxGRXKQwIgXCwYNmjCnAe+/B7bdn4qJPP4XnnoOLF80AkoULoW7dnCxTRETSoekBku9ZFvTpY3a5vf9+6NnzGhdcuAC9epn9ZS5ehBYtzFxfBREREVsojEi+9/nn8O23ZpGyadOuMQN3/34z1WbGDNMVM3IkrFoFpUvnWr0iIpKWumkkXzt5Evr3N8dvvglVq17l5GXL4KmnzIDVMmVg3jwICsr5IkVE5KrUMiL52ksvwenTUKsWvPJKBiclJpopu+3amSDSoAH88ouCiIhIHqGWEcm3VqyABQtMt8yMGWYyzBUOHYLHH4cffzSP+/eHceMyOFlEROygMCL50j//mEGrYFpEatdO56RFi8xo1pgYs5rqjBlmGq+IiOQp6qaRfGnAADh+3CyQ+uab/3nxwgWTVB57zASRevVgxw4FERGRPEphRPKdb76B2bPNZJhZs6BIkcte/OMPuPde+OQTc8KQIfDDD2ZJVhERyZPUTSP5SkyMWSIEoF8/s+8MYBYbmT7dPHnxIpQvb+b8apCqiEiepzAi+crLL8OxY2YTvHfe+ffJf/6B3r3NCqoAzZvDZ59BuXI2VSkiIs5QN43kG8uWmRXc3dxMN02RIkBYmJnXu3AhFCpk1oJfvVpBREQkH1EYkXwhMvJS98zAgdCofjKMHg2NG8Pff8Mtt8DmzfDqq9dYglVERPIaddNInmdZZoZudDTUqAFvPXUQ7u9qwgeYdUSmTjXb9YqISL6jMCJ53syZsHIleHlZfNHxa7zv7QrnzkHx4jBpEnTrZvpuREQkX1IYkTxt/34zaBXg7aqfc9fw7uZBo0Zmtoym7IqI5HvqXJc8KzkZuneHuDi4zzOMAb/3MMu4jxkDGzYoiIiIFBBqGZE86713Eti82YvixDIn8XE8bq8GX3wB99xjd2kiIpKN1DIiedK2uXsYPsKMA/mQflR+sQ2EhyuIiIgUQAojkrdYFjFjphD8pBeJeNLeZxVPfdMZ/vc/KFzY7upERCQHqJtG8o7ISKynetA7pDsHuZnKRSKZ+Ws93KqUtrsyERHJQWoZkbxhzRqoWZNpIQF8RTCF3JOZH1qOEgoiIiIFnsKI2Cs+Hl55BVq04NfIcrzs9iEAY8Z5EFhPa4eIiLgCddOIfQ4ehA4d4JdfiKMIwSW/5eIZHx5+GAYMsLs4ERHJLWoZEXuEhUFgIPzyC5QuzQsP7ubPM+WpWBHmzNH2MiIirkT/yZfcN28eNG0KUVFw9918Pmwvn4beiLs7fPkllC1rd4EiIpKbFEYk91gWjBwJXbqYsSJt27Jnxkaee6MUAMOHw/3321yjiIjkOo0Zkdxx8aLZenfuXPP41Ve5+OZYght6EBcHTZrA0KG2VigiIjZRGJGcFxUFjz4KW7ZAoUIwZQoXu/aiZ0/YudN0y8ydCx4edhcqIiJ2UBiRnLVrFzzyiJk54+cHixcTXuJButU2L7m5me1mKla0u1AREbFLlsaMTJ48mcqVK+Pj40NgYCBbt27N8Nzp06fTuHFjSpYsScmSJQkKCrrq+VKArFgB9eqZIHLLLSR8H8bw7x8kMNAEEX9/+PpraNbM7kJFRMROToeRBQsWMGDAAEaMGMH27dupWbMmzZs35+TJk+mev2HDBh5//HG+++47wsLCCAgIoFmzZhw7duy6i5c8yuGAN9+ENm3g7Fm4/35+nbmNwKduZ9QoSE6G4GD4/Xdo3druYkVExG5ulmVZzlwQGBjIvffey0cffQSAw+EgICCAF198kcGDB1/z+uTkZEqWLMlHH31Et27dMvU3Y2Nj8fPzIyYmBl9fX2fKldz2zz/QtSusXAlAQp+XeK/C+7z1diESE6F0aZgyBTp1srdMERHJeZn9/XZqzEhCQgLh4eEMGTIk9Tl3d3eCgoIICwvL1HucP3+exMRESpUqleE58fHxxMfHpz6OjY11pkyxyx9/QLt28NdfXPDyY2anNYxbGcjRo+bltm3hk09M94yIiEgKp7ppoqOjSU5Oxv8/vyb+/v5ERERk6j0GDRpExYoVCQoKyvCcMWPG4Ofnl3oLCAhwpkyxw6JFEBhI3F/HeL/EKG4pHsWLX5ggUqECfPYZLF2qICIiIlfK1UXPxo4dy/z581m6dCk+Pj4ZnjdkyBBiYmJSb0eOHMnFKsUpyckweDAxjz3D6LiXqOx5jFf/GUrEKU9uvBEmT4YDB0zPjZv2vRMRkXQ41U1TpkwZPDw8iIyMTPN8ZGQk5cuXv+q148ePZ+zYsaxbt44aNWpc9Vxvb2+8vb2dKU3scOoUEe2f58MfajGFw8TiB4lQpQq8/jo8+SR4edldpIiI5HVOtYx4eXlRu3ZtQkNDU59zOByEhoZSv379DK979913GTVqFCEhIdSpUyfr1UqesW/xTvpUDqHyD3MYyxBi8eOOO8yaIX/+CU8/rSAiIiKZ4/SiZwMGDKB79+7UqVOHunXrMnHiROLi4ujRowcA3bp1o1KlSowZMwaAcePGMXz4cL788ksqV66cOrakWLFiFCtWLBs/iuSG7eEW43r9xaJfquOgJgANasUxeGRRWrXSbrsiIuI8p8NIcHAwUVFRDB8+nIiICGrVqkVISEjqoNbDhw/jftkv0scff0xCQgIdO3ZM8z4jRozgzTffvL7qJVdYFnz3HYx9O5G133kCVQFoVf5nBs++jUYtFCpFRCTrnF5nxA5aZ8QelgWrVsFbb8HPP5vnPEjicbcFDBwId43polGpIiKSoRxZZ0Rcxx9/QP/+sHateVyY8/RkBgMqfUXlJROgbl17CxQRkQJDPfxyhbffhpo1TRDxcktgIOP4m5v4X4cfqPzbCgURERHJVmoZkTTGj4dhw8xxe6+VvJfwErcUiYQPP4RnnlG3jIiIZDu1jEiq2bPhtdfM8VgGsTihNbfU8oPwcOjZU0FERERyhMKIALBsGfTsacYyv8a7DOJdM2jkxx/httvsLU5ERAo0ddMIGzZA52AHDoc7TzOTcWXfh8++gRYt7C5NRERcgMKIi9u+Hdo8kkx8ggePspRPqryH27qfoHJlu0sTEREXoTDiwnbuhIeaJHA2zosmfMe8muMotOZ7ba0rIiK5SmHERf36KzzY6CKnz/kQyI983fh9fFasAT8/u0sTEREXowGsLuj33ywerH+eU+d8qMtPrOkwHd+1ixVERETEFgojLuaPX5N5IPAc0eeLUIdtrHlxFX5fTQdvb7tLExERF6Uw4kJ2bb/IA3XPEnWhOPcQzrejwynxv5HaaldERGylMSMuYtfmMzzQ1MHJxNLUctvB2plHKdmjj91liYiIqGXEFfy8YD/33WcRmViamu6/se7r85Tq0dbuskRERACFkQLv+zFbeKBzWU45SnGv905CN3pRunUDu8sSERFJpW6agsqyWPn0Eh77tCUXKUzTEtv5+pebKF65tN2ViYiIpKEwUhBdvMiXD82m+6aeJOFJm5t2suDX6vj4etldmYiIyBXUTVPQ/P03k6v9jyc3PUsSnjxx7x4W7a2hICIiInmWwkgB4vhmDa/dtpwXDg/Ewp2+bY7w2Y/V8PRys7s0ERGRDCmMFAQOBxeHj6ZzyxjGX3wRgFEDzjBpWYCWEBERkTxPY0byu+hoTj3+Am3XvcBmGuHpnsSsGQ6e7FHS7spEREQyRWEkP1u4kF3PfsijZ2bxF1XxK5zA0lVeNG1qd2EiIiKZpzCSHx07Bv36sWgxPEUIcRTjxvIJrF7nxZ132l2ciIiIczSiID+5eBHGjOFC1ZoMXlyHx1hEHMVoer+DbTsVREREJH9SGMkPLAuWL+fkbffx5uvx3HR+F+MYDMCrr8K369wpV87mGkVERLJI3TR53R9/sOvZD5mwuS5f8APx+ABw440W48e78dhjNtcnIiJynRRG8ijr2HHW9ZzPhJA7CGFa6vN1ayfzykAP2rd3o5C+PRERKQBc+ufso4/MWNCgIGjYEHx87K4I4k7E8mWv75i0ugq/WQMAcMNBu2bneWVEMerX98BNa5iJiEgB4mZZlmV3EdcSGxuLn58fMTEx+Pr6Ztv71qwJv/5qjgsXhsaNTTB56CGoUYNcXTBsz4YTfPzKPj7dfhcxlACgqPt5nunwDy+NqUiVKrlXi4iISHbI7O+3y4YRy4K5c2HtWnM7cSLt62XLwoMPmmDy0EMQEJAtfzaNpCRYMXE/U94/z7qIu1Kf/z/Pv3muSwxPf3AXJUqqGURERPInhREnWBbs2gXr1plgsmEDxMWlPadqVRNKgoKgaVPw88v63zt62MGnr+9l2qKSHIn3B0xXzCOlw+j7UiEeev1e3AtpopOIiORvCiOZERYGZ8+Ct7e5eXmBtzcJbt78tKs4a7cUZd0mH7Zu9yA5+VILhYcH1K1rWk6aNIH69aFIkav/qcR4Bysm/MXMGQ5CDlTFgQcAZYii5x1hPDvuFio/Uj37PpuIiIjNFEYyo2FD2LLlmqfF4MsGmrCWZqx1e4i9VtU0r3uSSGDR37m/xE6aBOynfrXTFK1UAvz9+TOqNDOXl+Wz3+/mpKNs6jX3eWymZ/MjPDaxIT635kAfkIiIiM0URjKje3fYuRPi480tIeHK43QcJoB1BPEdTfmOphzjhjSve5LAvWwDYAsNU58v7xZB99u28vSznlR9pjEUK5Z9n0VERCSPURjJDpYFiYnph5R/j62L8Rw45M734cXY8IsvG34tzZEzl0KGO8m0unkXzzyZQMuB1fEs5p179YuIiNhIYcQmlgWHDplBsOfOQYcOULGi3VWJiIjkvsz+frv0omc5wc0Nbr7Z3EREROTaNH9UREREbKUwIiIiIrZSGBERERFbKYyIiIiIrRRGRERExFYKIyIiImIrhRERERGxlcKIiIiI2EphRERERGylMCIiIiK2UhgRERERWymMiIiIiK0URkRERMRW+WLXXsuyALMVsYiIiOQPKb/bKb/jGckXYeTs2bMABAQE2FyJiIiIOOvs2bP4+fll+Lqbda24kgc4HA6OHz9O8eLFcXNzs7sclxYbG0tAQABHjhzB19fX7nJcnr6PvEXfR96j78RelmVx9uxZKlasiLt7xiND8kXLiLu7OzfccIPdZchlfH199S92HqLvI2/R95H36Duxz9VaRFJoAKuIiIjYSmFEREREbKUwIk7x9vZmxIgReHt7212KoO8jr9H3kffoO8kf8sUAVhERESm41DIiIiIitlIYEREREVspjIiIiIitFEZERETEVgojcoXJkydTuXJlfHx8CAwMZOvWrRmeO336dBo3bkzJkiUpWbIkQUFBVz1fnOfM93G5+fPn4+bmxqOPPpqzBboYZ7+Pf/75h759+1KhQgW8vb2pWrUqq1evzqVqCz5nv4+JEydSrVo1ChcuTEBAAP379+fixYu5VK1kyBK5zPz58y0vLy9r1qxZ1h9//GH16tXLKlGihBUZGZnu+V26dLEmT55s/fLLL9bu3butp556yvLz87OOHj2ay5UXTM5+HykOHjxoVapUyWrcuLHVtm3b3CnWBTj7fcTHx1t16tSxWrZsaW3atMk6ePCgtWHDBmvHjh25XHnB5Oz3MXfuXMvb29uaO3eudfDgQWvNmjVWhQoVrP79++dy5fJfCiOSRt26da2+ffumPk5OTrYqVqxojRkzJlPXJyUlWcWLF7fmzJmTUyW6lKx8H0lJSVaDBg2sGTNmWN27d1cYyUbOfh8ff/yxdcstt1gJCQm5VaJLcfb76Nu3r/XAAw+keW7AgAFWw4YNc7ROuTZ100iqhIQEwsPDCQoKSn3O3d2doKAgwsLCMvUe58+fJzExkVKlSuVUmS4jq9/HyJEjKVeuHM8880xulOkysvJ9LF++nPr169O3b1/8/f2pXr06o0ePJjk5ObfKLrCy8n00aNCA8PDw1K6cAwcOsHr1alq2bJkrNUvG8sVGeZI7oqOjSU5Oxt/fP83z/v7+/Pnnn5l6j0GDBlGxYsU0/4GQrMnK97Fp0yZmzpzJjh07cqFC15KV7+PAgQOsX7+eJ554gtWrV7Nv3z6ef/55EhMTGTFiRG6UXWBl5fvo0qUL0dHRNGrUCMuySEpKok+fPrz++uu5UbJchVpGJNuMHTuW+fPns3TpUnx8fOwux+WcPXuWrl27Mn36dMqUKWN3OQI4HA7KlSvHtGnTqF27NsHBwbzxxhtMnTrV7tJc0oYNGxg9ejRTpkxh+/btLFmyhFWrVjFq1Ci7S3N5ahmRVGXKlMHDw4PIyMg0z0dGRlK+fPmrXjt+/HjGjh3LunXrqFGjRk6W6TKc/T7279/PoUOHaN26depzDocDgEKFCrFnzx6qVKmSs0UXYFn596NChQp4enri4eGR+tztt99OREQECQkJeHl55WjNBVlWvo9hw4bRtWtXevbsCcBdd91FXFwcvXv35o033sDdXf9/bhf9k5dUXl5e1K5dm9DQ0NTnHA4HoaGh1K9fP8Pr3n33XUaNGkVISAh16tTJjVJdgrPfx2233cZvv/3Gjh07Um9t2rShadOm7Nixg4CAgNwsv8DJyr8fDRs2ZN++famhEGDv3r1UqFBBQeQ6ZeX7OH/+/BWBIyUoWtqmzV52j6CVvGX+/PmWt7e39emnn1q7du2yevfubZUoUcKKiIiwLMuyunbtag0ePDj1/LFjx1peXl7WokWLrBMnTqTezp49a9dHKFCc/T7+S7Npspez38fhw4et4sWLWy+88IK1Z88ea+XKlVa5cuWst99+266PUKA4+32MGDHCKl68uDVv3jzrwIED1rfffmtVqVLF6tSpk10fQf6lbhpJIzg4mKioKIYPH05ERAS1atUiJCQkdZDY4cOH0/yfxccff0xCQgIdO3ZM8z4jRozgzTffzM3SCyRnvw/JWc5+HwEBAaxZs4b+/ftTo0YNKlWqRL9+/Rg0aJBdH6FAcfb7GDp0KG5ubgwdOpRjx45RtmxZWrduzTvvvGPXR5B/uVmW2qZERETEPvpfKhEREbGVwoiIiIjYSmFEREREbKUwIiIiIrZSGBERERFbKYyIiIiIrRRGRERExFYKIyIiImIrhRERERGxlcKIiIiI2EphRERERGylMCIiIiK2+n9jS+whZsCf0AAAAABJRU5ErkJggg==\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ - "plot(df)" + "# Calculate metrics for all methods\n", + "methods = ['uncalibrated', 'lslr', 'cdf_spline', 'pygam']\n", + "results = []\n", + "\n", + "for method in methods:\n", + " brier = brier_score_loss(y_test, df[method])\n", + " logloss = log_loss(y_test, df[method])\n", + " ece = expected_calibration_error(y_test, df[method])\n", + " z_stat = spiegelhalters_z_statistic(y_test, df[method])\n", + " \n", + " results.append({\n", + " 'Method': method,\n", + " 'Brier Score': brier,\n", + " 'Log Loss': logloss,\n", + " 'ECE': ece,\n", + " 'Spiegelhalter Z': z_stat\n", + " })\n", + "\n", + "results_df = pd.DataFrame(results)\n", + "print(results_df.to_string(index=False))" ] }, { @@ -234,7 +247,35 @@ "id": "b261a896-7445-4b63-81e5-6cd6594da4e6", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# Plot calibration curves for all methods\n", + "plt.figure(figsize=(12, 8))\n", + "\n", + "# Sample data for cleaner visualization\n", + "sample_size = 1000\n", + "sample_idx = np.random.choice(len(df), sample_size, replace=False)\n", + "sample_df = df.iloc[sample_idx].sort_values('uncalibrated')\n", + "\n", + "# Plot each method\n", + "plt.plot(sample_df['uncalibrated'], sample_df['lslr'], \n", + " label='LinearSplineLogisticRegression', linewidth=2, alpha=0.8)\n", + "plt.plot(sample_df['uncalibrated'], sample_df['cdf_spline'], \n", + " label='CDFSplineCalibrator', linewidth=2, alpha=0.8)\n", + "plt.plot(sample_df['uncalibrated'], sample_df['pygam'], \n", + " label='PyGAM', linewidth=2, alpha=0.8)\n", + "\n", + "# Add diagonal reference line\n", + "plt.plot([0, 1], [0, 1], 'k--', label='Perfect calibration', alpha=0.5)\n", + "\n", + "plt.xlabel('Uncalibrated Probability', fontsize=12)\n", + "plt.ylabel('Calibrated Probability', fontsize=12)\n", + "plt.title('Calibration Curves Comparison', fontsize=14)\n", + "plt.legend(loc='best')\n", + "plt.grid(True, alpha=0.3)\n", + "plt.xlim(0, 1)\n", + "plt.ylim(0, 1)\n", + "plt.show()\n" + ] }, { "cell_type": "code", @@ -250,7 +291,42 @@ "id": "8f596414-3296-4b78-8cbb-778a12dd0858", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# Compare the spline transformations more closely\n", + "fig, axes = plt.subplots(1, 3, figsize=(15, 5))\n", + "\n", + "# Sort by uncalibrated scores for better visualization\n", + "sorted_idx = np.argsort(clf_pred_test)\n", + "x_sorted = clf_pred_test[sorted_idx]\n", + "\n", + "# Plot each spline method's transformation\n", + "spline_methods = [\n", + " (lslr_pred[sorted_idx], 'LinearSplineLogisticRegression', axes[0]),\n", + " (cdf_pred[sorted_idx], 'CDFSplineCalibrator', axes[1]),\n", + " (gam_model_pred[sorted_idx], 'PyGAM', axes[2])\n", + "]\n", + "\n", + "for pred, title, ax in spline_methods:\n", + " # Plot transformation\n", + " ax.plot(x_sorted, pred, linewidth=2, label='Calibrated', color='blue')\n", + " ax.plot([0, 1], [0, 1], 'k--', alpha=0.5, label='No change')\n", + " \n", + " # Highlight regions where calibration changes most\n", + " diff = np.abs(pred - x_sorted)\n", + " ax.fill_between(x_sorted, x_sorted, pred, where=(diff > 0.05), \n", + " alpha=0.3, color='red', label='Large adjustment (>0.05)')\n", + " \n", + " ax.set_xlabel('Uncalibrated Probability', fontsize=11)\n", + " ax.set_ylabel('Calibrated Probability', fontsize=11)\n", + " ax.set_title(title, fontsize=12)\n", + " ax.legend(loc='best', fontsize=9)\n", + " ax.grid(True, alpha=0.3)\n", + " ax.set_xlim(0, 1)\n", + " ax.set_ylim(0, 1)\n", + "\n", + "plt.tight_layout()\n", + "plt.show()\n" + ] }, { "cell_type": "code", @@ -258,12 +334,55 @@ "id": "108c6a69-0735-476c-af67-df945699a76d", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# Create a summary comparison plot\n", + "fig, ax = plt.subplots(1, 1, figsize=(10, 6))\n", + "\n", + "# Normalize metrics for better visualization\n", + "metrics_normalized = results_df.copy()\n", + "metrics_normalized['Brier Score'] = 1 - metrics_normalized['Brier Score'] / metrics_normalized.loc[0, 'Brier Score']\n", + "metrics_normalized['Log Loss'] = 1 - metrics_normalized['Log Loss'] / metrics_normalized.loc[0, 'Log Loss']\n", + "metrics_normalized['ECE'] = 1 - metrics_normalized['ECE'] / metrics_normalized.loc[0, 'ECE']\n", + "metrics_normalized['Spiegelhalter Z'] = 1 - np.abs(metrics_normalized['Spiegelhalter Z']) / np.abs(metrics_normalized.loc[0, 'Spiegelhalter Z'])\n", + "\n", + "# Plot bars\n", + "x = np.arange(len(methods))\n", + "width = 0.2\n", + "\n", + "metrics_to_plot = ['Brier Score', 'Log Loss', 'ECE', 'Spiegelhalter Z']\n", + "colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']\n", + "\n", + "for i, metric in enumerate(metrics_to_plot):\n", + " offset = (i - 1.5) * width\n", + " ax.bar(x + offset, metrics_normalized[metric], width, \n", + " label=f'{metric} improvement', color=colors[i], alpha=0.8)\n", + "\n", + "ax.set_xlabel('Method', fontsize=12)\n", + "ax.set_ylabel('Relative Improvement (higher is better)', fontsize=12)\n", + "ax.set_title('Calibration Performance Comparison', fontsize=14)\n", + "ax.set_xticks(x)\n", + "ax.set_xticklabels(methods, rotation=45, ha='right')\n", + "ax.legend(loc='upper left')\n", + "ax.grid(True, alpha=0.3, axis='y')\n", + "ax.axhline(y=0, color='black', linestyle='-', linewidth=0.5)\n", + "\n", + "plt.tight_layout()\n", + "plt.show()\n", + "\n", + "# Print percentage improvements\n", + "print(\"\\nPercentage improvements over uncalibrated:\")\n", + "for i in range(1, len(results_df)):\n", + " print(f\"\\n{results_df.loc[i, 'Method']}:\")\n", + " print(f\" Brier Score: {(1 - results_df.loc[i, 'Brier Score']/results_df.loc[0, 'Brier Score']) * 100:.1f}%\")\n", + " print(f\" Log Loss: {(1 - results_df.loc[i, 'Log Loss']/results_df.loc[0, 'Log Loss']) * 100:.1f}%\")\n", + " print(f\" ECE: {(1 - results_df.loc[i, 'ECE']/results_df.loc[0, 'ECE']) * 100:.1f}%\")\n", + " print(f\" |Spiegelhalter Z|: {(1 - abs(results_df.loc[i, 'Spiegelhalter Z'])/abs(results_df.loc[0, 'Spiegelhalter Z'])) * 100:.1f}%\")\n" + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": ".venv", "language": "python", "name": "python3" }, @@ -277,7 +396,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.13" + "version": "3.10.16" } }, "nbformat": 4, diff --git a/src/splinator/__init__.py b/src/splinator/__init__.py index 8dee4bf..aedb2a2 100644 --- a/src/splinator/__init__.py +++ b/src/splinator/__init__.py @@ -1 +1,8 @@ from ._version import __version__ +from .estimators import LinearSplineLogisticRegression, CDFSplineCalibrator + +__all__ = [ + '__version__', + 'LinearSplineLogisticRegression', + 'CDFSplineCalibrator', +] diff --git a/src/splinator/estimators/__init__.py b/src/splinator/estimators/__init__.py new file mode 100644 index 0000000..e69a39b --- /dev/null +++ b/src/splinator/estimators/__init__.py @@ -0,0 +1,7 @@ +from .cdf_spline_calibrator import CDFSplineCalibrator +from .linear_spline_logistic_regression import LinearSplineLogisticRegression + +__all__ = [ + "CDFSplineCalibrator", + "LinearSplineLogisticRegression", +] \ No newline at end of file diff --git a/src/splinator/estimators/cdf_spline_calibrator.py b/src/splinator/estimators/cdf_spline_calibrator.py new file mode 100644 index 0000000..4b58d91 --- /dev/null +++ b/src/splinator/estimators/cdf_spline_calibrator.py @@ -0,0 +1,270 @@ +from __future__ import annotations +from bisect import bisect_left +import numpy as np +from scipy.interpolate import CubicSpline +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.utils.validation import check_array, check_consistent_length + + +def _find_fractile(score: float, sorted_calib_scores: np.ndarray) -> float: + """ + Finds the fractile of a score within a sorted array of calibration scores. + This is also known as the percentile rank. + """ + if score < sorted_calib_scores[0]: + return 0.0 + if score > sorted_calib_scores[-1]: + return 1.0 + + # Use binary search to find the insertion point + idx = bisect_left(sorted_calib_scores, score) + + # Handle cases where score is exactly at a calibration point + if idx < len(sorted_calib_scores) and sorted_calib_scores[idx] == score: + # If there are duplicates, find the last occurrence for a better fractile estimate + # searchsorted with side='right' gives insertion point that comes after existing entries + # This will give the fraction of elements <= score + return np.searchsorted(sorted_calib_scores, score, side='right') / len(sorted_calib_scores) + + if idx == 0: + # Should be caught by score < sorted_calib_scores[0] but as a safeguard + return 0.0 + + # Linear interpolation of the fractile + score_before = sorted_calib_scores[idx - 1] + score_after = sorted_calib_scores[idx] + + # Find fractiles corresponding to these scores + fractile_before = np.searchsorted(sorted_calib_scores, score_before, side='right') / len(sorted_calib_scores) + fractile_after = np.searchsorted(sorted_calib_scores, score_after, side='right') / len(sorted_calib_scores) + + if score_after == score_before: + return fractile_after + + t = (score - score_before) / (score_after - score_before) + return fractile_before + t * (fractile_after - fractile_before) + + +class _InterpolatedFunction: + """Helper class to perform linear interpolation.""" + def __init__(self, x: np.ndarray, y: np.ndarray): + # Sort points by x to ensure correct interpolation + sorted_indices = np.argsort(x) + self.x = x[sorted_indices] + self.y = y[sorted_indices] + + def __call__(self, x_new: float) -> float: + if x_new <= self.x[0]: + return self.y[0] + if x_new >= self.x[-1]: + return self.y[-1] + + # Find the insertion point for x_new + idx = bisect_left(self.x, x_new) + + # Handle cases where x_new is exactly one of the points + if self.x[idx] == x_new: + return self.y[idx] + + x_before, y_before = self.x[idx - 1], self.y[idx - 1] + x_after, y_after = self.x[idx], self.y[idx] + + # Perform linear interpolation + # Special case for vertical line segments to avoid division by zero + if x_after == x_before: + return y_after + + t = (x_new - x_before) / (x_after - x_before) + return y_before + t * (y_after - y_before) + + +class CDFSplineCalibrator(BaseEstimator, TransformerMixin): + """ + Calibrates classifier scores using a spline-based method on cumulative distributions. + + This method is described in "Calibration of Neural Networks Using Splines" by + C. Gupta, A. G. Wilson, and A. Veit. It is a binning-free calibration method + that fits a cubic spline to the empirical cumulative distribution of class + accuracies. The derivative of this spline serves as the recalibration function. + + The method works by computing the cumulative distribution functions (CDFs) of + both the predicted scores and the actual outcomes, then using splines to model + their relationship. + + Parameters + ---------- + num_knots : int, default=6 + The number of knots to use for the cubic spline. This parameter is currently + a placeholder for future extension to splines with a fixed number of knots. + The current implementation uses all calibration points as knots. + + Examples + -------- + >>> import numpy as np + >>> from splinator.estimators import CDFSplineCalibrator + >>> # Generate some dummy data (e.g., from a model) + >>> n_samples, n_classes = 100, 3 + >>> calib_scores = np.random.rand(n_samples, n_classes) + >>> calib_labels = np.random.randint(0, n_classes, n_samples) + >>> # Normalize scores to represent probabilities + >>> calib_scores /= calib_scores.sum(axis=1, keepdims=True) + >>> + >>> # Fit the calibrator + >>> calibrator = CDFSplineCalibrator() + >>> calibrator.fit(calib_scores, calib_labels) + >>> + >>> # Recalibrate new scores + >>> test_scores = np.random.rand(50, n_classes) + >>> test_scores /= test_scores.sum(axis=1, keepdims=True) + >>> calibrated_scores = calibrator.transform(test_scores) + """ + + def __init__(self, num_knots: int = 6): + # num_knots is kept for API consistency with the paper, though not used in this impl. + self.num_knots = num_knots + + def fit(self, X: np.ndarray, y: np.ndarray) -> CDFSplineCalibrator: + """ + Fit the spline-based calibration model. + + Parameters + ---------- + X : array-like of shape (n_samples, n_classes) + The predicted scores (probabilities) from a classifier for the calibration set. + y : array-like of shape (n_samples,) + The true class labels for the calibration set. + + Returns + ------- + self : CDFSplineCalibrator + The fitted calibrator instance. + """ + X = check_array(X, ensure_2d=True, accept_sparse=False, dtype=[np.float64, np.float32]) + y = check_array(y, ensure_2d=False, ensure_min_samples=1, dtype=None) + check_consistent_length(X, y) + + if X.shape[1] < 2: + raise ValueError("CDFSplineCalibrator requires at least 2 classes.") + + self.n_classes_ = X.shape[1] + self.splines_ = [] + self.recalibration_functions_ = [] + + for k in range(self.n_classes_): + scores_k = X[:, k] + is_correct_k = (y == k).astype(int) + + sort_indices = np.argsort(scores_k) + sorted_scores = scores_k[sort_indices] + sorted_is_correct = is_correct_k[sort_indices] + + n_calib = len(sorted_scores) + if n_calib == 0 or len(np.unique(sorted_scores)) < 2: + self.recalibration_functions_.append(None) + continue + + integrated_accuracy = np.cumsum(sorted_is_correct) / n_calib + integrated_scores = np.cumsum(sorted_scores) / n_calib + + # The x-coordinates for the spline are percentiles + percentiles = np.linspace(0.0, 1.0, n_calib) + + # The y-coordinates are the difference + y_spline = integrated_accuracy - integrated_scores + + # Subsample to get knots for the spline, as specified in the paper + # This prevents overfitting and creates a smoother calibration function. + knot_indices = np.round(np.linspace(0, n_calib - 1, self.num_knots)).astype(int) + knot_x = percentiles[knot_indices] + knot_y = y_spline[knot_indices] + + # CubicSpline requires unique x coordinates. + unique_knot_x, unique_indices = np.unique(knot_x, return_index=True) + unique_knot_y = knot_y[unique_indices] + + # A cubic spline requires at least k+1 (i.e., 4) points. + if len(unique_knot_x) < 4: + self.recalibration_functions_.append(None) + continue + + # Fit a natural cubic spline on the knots + spline = CubicSpline(unique_knot_x, unique_knot_y, bc_type='natural') + + # The calibrated scores are the derivative of the spline + original scores + calibrated_scores = spline(percentiles, nu=1) + sorted_scores + + # This function maps original scores to calibrated scores + recal_func = _InterpolatedFunction(sorted_scores, calibrated_scores) + self.recalibration_functions_.append(recal_func) + + self.is_fitted_ = True + return self + + def transform(self, X: np.ndarray) -> np.ndarray: + """ + Apply the learned calibration to new scores. + + Parameters + ---------- + X : array-like of shape (n_samples, n_classes) + The scores to recalibrate. + + Returns + ------- + recalibrated_X : ndarray of shape (n_samples, n_classes) + The recalibrated scores (probabilities). + """ + X = check_array(X, ensure_2d=True, accept_sparse=False, dtype=[np.float64, np.float32]) + if not hasattr(self, 'is_fitted_') or not self.is_fitted_: + raise RuntimeError("This CDFSplineCalibrator instance is not fitted yet.") + if X.shape[1] != self.n_classes_: + raise ValueError(f"The number of classes in X ({X.shape[1]}) does not match " + f"the number of classes during fit ({self.n_classes_}).") + + recalibrated_X = np.zeros_like(X, dtype=float) + + for k in range(self.n_classes_): + recal_func = self.recalibration_functions_[k] + + if recal_func is None: + recalibrated_X[:, k] = X[:, k] # Return original scores + continue + + scores_to_recalibrate = X[:, k] + recalibrated_scores = np.array([recal_func(s) for s in scores_to_recalibrate]) + + recalibrated_X[:, k] = np.clip(recalibrated_scores, 0, 1) + + # A small adjustment to the normalization logic for numerical stability. + # If all recalibrated scores for a sample are 0, this prevents NaN results + # by distributing the probability mass uniformly. + row_sums = recalibrated_X.sum(axis=1, keepdims=True) + # Avoid division by zero for rows that sum to 0 + is_zero_sum = row_sums == 0 + # Replace 0s with 1s to avoid division by zero + safe_row_sums = np.where(is_zero_sum, 1, row_sums) + recalibrated_X /= safe_row_sums + # For rows that originally summed to 0, set a uniform probability + recalibrated_X[is_zero_sum.flatten(), :] = 1 / self.n_classes_ + + return recalibrated_X + + def predict(self, X: np.ndarray) -> np.ndarray: + """ + Predicts class labels for samples in X. + + This is a convenience method that first transforms the scores and then + predicts the class with the highest probability. + + Parameters + ---------- + X : array-like of shape (n_samples, n_classes) + The scores to make predictions on. + + Returns + ------- + predictions : ndarray of shape (n_samples,) + The predicted class labels. + """ + recalibrated_probs = self.transform(X) + return np.argmax(recalibrated_probs, axis=1) \ No newline at end of file diff --git a/src/splinator/estimators/ks_spline_calibrator.py b/src/splinator/estimators/ks_spline_calibrator.py new file mode 100644 index 0000000..4b58d91 --- /dev/null +++ b/src/splinator/estimators/ks_spline_calibrator.py @@ -0,0 +1,270 @@ +from __future__ import annotations +from bisect import bisect_left +import numpy as np +from scipy.interpolate import CubicSpline +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.utils.validation import check_array, check_consistent_length + + +def _find_fractile(score: float, sorted_calib_scores: np.ndarray) -> float: + """ + Finds the fractile of a score within a sorted array of calibration scores. + This is also known as the percentile rank. + """ + if score < sorted_calib_scores[0]: + return 0.0 + if score > sorted_calib_scores[-1]: + return 1.0 + + # Use binary search to find the insertion point + idx = bisect_left(sorted_calib_scores, score) + + # Handle cases where score is exactly at a calibration point + if idx < len(sorted_calib_scores) and sorted_calib_scores[idx] == score: + # If there are duplicates, find the last occurrence for a better fractile estimate + # searchsorted with side='right' gives insertion point that comes after existing entries + # This will give the fraction of elements <= score + return np.searchsorted(sorted_calib_scores, score, side='right') / len(sorted_calib_scores) + + if idx == 0: + # Should be caught by score < sorted_calib_scores[0] but as a safeguard + return 0.0 + + # Linear interpolation of the fractile + score_before = sorted_calib_scores[idx - 1] + score_after = sorted_calib_scores[idx] + + # Find fractiles corresponding to these scores + fractile_before = np.searchsorted(sorted_calib_scores, score_before, side='right') / len(sorted_calib_scores) + fractile_after = np.searchsorted(sorted_calib_scores, score_after, side='right') / len(sorted_calib_scores) + + if score_after == score_before: + return fractile_after + + t = (score - score_before) / (score_after - score_before) + return fractile_before + t * (fractile_after - fractile_before) + + +class _InterpolatedFunction: + """Helper class to perform linear interpolation.""" + def __init__(self, x: np.ndarray, y: np.ndarray): + # Sort points by x to ensure correct interpolation + sorted_indices = np.argsort(x) + self.x = x[sorted_indices] + self.y = y[sorted_indices] + + def __call__(self, x_new: float) -> float: + if x_new <= self.x[0]: + return self.y[0] + if x_new >= self.x[-1]: + return self.y[-1] + + # Find the insertion point for x_new + idx = bisect_left(self.x, x_new) + + # Handle cases where x_new is exactly one of the points + if self.x[idx] == x_new: + return self.y[idx] + + x_before, y_before = self.x[idx - 1], self.y[idx - 1] + x_after, y_after = self.x[idx], self.y[idx] + + # Perform linear interpolation + # Special case for vertical line segments to avoid division by zero + if x_after == x_before: + return y_after + + t = (x_new - x_before) / (x_after - x_before) + return y_before + t * (y_after - y_before) + + +class CDFSplineCalibrator(BaseEstimator, TransformerMixin): + """ + Calibrates classifier scores using a spline-based method on cumulative distributions. + + This method is described in "Calibration of Neural Networks Using Splines" by + C. Gupta, A. G. Wilson, and A. Veit. It is a binning-free calibration method + that fits a cubic spline to the empirical cumulative distribution of class + accuracies. The derivative of this spline serves as the recalibration function. + + The method works by computing the cumulative distribution functions (CDFs) of + both the predicted scores and the actual outcomes, then using splines to model + their relationship. + + Parameters + ---------- + num_knots : int, default=6 + The number of knots to use for the cubic spline. This parameter is currently + a placeholder for future extension to splines with a fixed number of knots. + The current implementation uses all calibration points as knots. + + Examples + -------- + >>> import numpy as np + >>> from splinator.estimators import CDFSplineCalibrator + >>> # Generate some dummy data (e.g., from a model) + >>> n_samples, n_classes = 100, 3 + >>> calib_scores = np.random.rand(n_samples, n_classes) + >>> calib_labels = np.random.randint(0, n_classes, n_samples) + >>> # Normalize scores to represent probabilities + >>> calib_scores /= calib_scores.sum(axis=1, keepdims=True) + >>> + >>> # Fit the calibrator + >>> calibrator = CDFSplineCalibrator() + >>> calibrator.fit(calib_scores, calib_labels) + >>> + >>> # Recalibrate new scores + >>> test_scores = np.random.rand(50, n_classes) + >>> test_scores /= test_scores.sum(axis=1, keepdims=True) + >>> calibrated_scores = calibrator.transform(test_scores) + """ + + def __init__(self, num_knots: int = 6): + # num_knots is kept for API consistency with the paper, though not used in this impl. + self.num_knots = num_knots + + def fit(self, X: np.ndarray, y: np.ndarray) -> CDFSplineCalibrator: + """ + Fit the spline-based calibration model. + + Parameters + ---------- + X : array-like of shape (n_samples, n_classes) + The predicted scores (probabilities) from a classifier for the calibration set. + y : array-like of shape (n_samples,) + The true class labels for the calibration set. + + Returns + ------- + self : CDFSplineCalibrator + The fitted calibrator instance. + """ + X = check_array(X, ensure_2d=True, accept_sparse=False, dtype=[np.float64, np.float32]) + y = check_array(y, ensure_2d=False, ensure_min_samples=1, dtype=None) + check_consistent_length(X, y) + + if X.shape[1] < 2: + raise ValueError("CDFSplineCalibrator requires at least 2 classes.") + + self.n_classes_ = X.shape[1] + self.splines_ = [] + self.recalibration_functions_ = [] + + for k in range(self.n_classes_): + scores_k = X[:, k] + is_correct_k = (y == k).astype(int) + + sort_indices = np.argsort(scores_k) + sorted_scores = scores_k[sort_indices] + sorted_is_correct = is_correct_k[sort_indices] + + n_calib = len(sorted_scores) + if n_calib == 0 or len(np.unique(sorted_scores)) < 2: + self.recalibration_functions_.append(None) + continue + + integrated_accuracy = np.cumsum(sorted_is_correct) / n_calib + integrated_scores = np.cumsum(sorted_scores) / n_calib + + # The x-coordinates for the spline are percentiles + percentiles = np.linspace(0.0, 1.0, n_calib) + + # The y-coordinates are the difference + y_spline = integrated_accuracy - integrated_scores + + # Subsample to get knots for the spline, as specified in the paper + # This prevents overfitting and creates a smoother calibration function. + knot_indices = np.round(np.linspace(0, n_calib - 1, self.num_knots)).astype(int) + knot_x = percentiles[knot_indices] + knot_y = y_spline[knot_indices] + + # CubicSpline requires unique x coordinates. + unique_knot_x, unique_indices = np.unique(knot_x, return_index=True) + unique_knot_y = knot_y[unique_indices] + + # A cubic spline requires at least k+1 (i.e., 4) points. + if len(unique_knot_x) < 4: + self.recalibration_functions_.append(None) + continue + + # Fit a natural cubic spline on the knots + spline = CubicSpline(unique_knot_x, unique_knot_y, bc_type='natural') + + # The calibrated scores are the derivative of the spline + original scores + calibrated_scores = spline(percentiles, nu=1) + sorted_scores + + # This function maps original scores to calibrated scores + recal_func = _InterpolatedFunction(sorted_scores, calibrated_scores) + self.recalibration_functions_.append(recal_func) + + self.is_fitted_ = True + return self + + def transform(self, X: np.ndarray) -> np.ndarray: + """ + Apply the learned calibration to new scores. + + Parameters + ---------- + X : array-like of shape (n_samples, n_classes) + The scores to recalibrate. + + Returns + ------- + recalibrated_X : ndarray of shape (n_samples, n_classes) + The recalibrated scores (probabilities). + """ + X = check_array(X, ensure_2d=True, accept_sparse=False, dtype=[np.float64, np.float32]) + if not hasattr(self, 'is_fitted_') or not self.is_fitted_: + raise RuntimeError("This CDFSplineCalibrator instance is not fitted yet.") + if X.shape[1] != self.n_classes_: + raise ValueError(f"The number of classes in X ({X.shape[1]}) does not match " + f"the number of classes during fit ({self.n_classes_}).") + + recalibrated_X = np.zeros_like(X, dtype=float) + + for k in range(self.n_classes_): + recal_func = self.recalibration_functions_[k] + + if recal_func is None: + recalibrated_X[:, k] = X[:, k] # Return original scores + continue + + scores_to_recalibrate = X[:, k] + recalibrated_scores = np.array([recal_func(s) for s in scores_to_recalibrate]) + + recalibrated_X[:, k] = np.clip(recalibrated_scores, 0, 1) + + # A small adjustment to the normalization logic for numerical stability. + # If all recalibrated scores for a sample are 0, this prevents NaN results + # by distributing the probability mass uniformly. + row_sums = recalibrated_X.sum(axis=1, keepdims=True) + # Avoid division by zero for rows that sum to 0 + is_zero_sum = row_sums == 0 + # Replace 0s with 1s to avoid division by zero + safe_row_sums = np.where(is_zero_sum, 1, row_sums) + recalibrated_X /= safe_row_sums + # For rows that originally summed to 0, set a uniform probability + recalibrated_X[is_zero_sum.flatten(), :] = 1 / self.n_classes_ + + return recalibrated_X + + def predict(self, X: np.ndarray) -> np.ndarray: + """ + Predicts class labels for samples in X. + + This is a convenience method that first transforms the scores and then + predicts the class with the highest probability. + + Parameters + ---------- + X : array-like of shape (n_samples, n_classes) + The scores to make predictions on. + + Returns + ------- + predictions : ndarray of shape (n_samples,) + The predicted class labels. + """ + recalibrated_probs = self.transform(X) + return np.argmax(recalibrated_probs, axis=1) \ No newline at end of file diff --git a/src/splinator/estimators.py b/src/splinator/estimators/linear_spline_logistic_regression.py similarity index 98% rename from src/splinator/estimators.py rename to src/splinator/estimators/linear_spline_logistic_regression.py index 5445237..e0f3e00 100644 --- a/src/splinator/estimators.py +++ b/src/splinator/estimators/linear_spline_logistic_regression.py @@ -2,7 +2,7 @@ from scipy.optimize import LinearConstraint, minimize from scipy.special import expit, log_expit from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin -from sklearn.utils.validation import check_array, check_random_state, _check_sample_weight, check_consistent_length +from sklearn.utils.validation import check_array, check_random_state, check_consistent_length from sklearn.exceptions import DataConversionWarning, NotFittedError from warnings import warn from splinator.monotonic_spline import ( @@ -13,7 +13,7 @@ ) from enum import Enum import pandas as pd -from typing import Any, Dict, List, Optional, Union, Iterable, Tuple +from typing import Any, Dict, List, Optional, Union, Tuple class MinimizationMethod(Enum): @@ -312,4 +312,4 @@ def _more_tags(self) -> Dict[str, bool]: ------- tags : dict """ - return {"poor_score": True, "binary_only": True, "requires_y": True} + return {"poor_score": True, "binary_only": True, "requires_y": True} \ No newline at end of file From 26389af60a0fa87bb66cbd756fe324a67ff4aabb Mon Sep 17 00:00:00 2001 From: jiaruixu Date: Mon, 7 Jul 2025 01:01:36 -0700 Subject: [PATCH 2/4] Add citations and improve project documentation - Add proper citations to CDFSplineCalibrator (Gupta et al. 2021) - Include reference to official implementation - Improve project structure with CONTRIBUTING.md, CHANGELOG.md, etc. - Add Makefile for common development tasks - Update README with better structure and badges --- CHANGELOG.md | 37 +++ CONTRIBUTING.md | 80 ++++++ DOCUMENTATION_GUIDE.md | 208 -------------- MANIFEST.in | 9 + Makefile | 53 ++++ README.md | 125 +++++--- docs/api.rst | 9 + docs/examples.rst | 7 + pyproject.toml | 51 +++- setup.cfg | 76 +++++ .../estimators/cdf_spline_calibrator.py | 45 ++- .../estimators/ks_spline_calibrator.py | 270 ------------------ 12 files changed, 441 insertions(+), 529 deletions(-) create mode 100644 CHANGELOG.md create mode 100644 CONTRIBUTING.md delete mode 100644 DOCUMENTATION_GUIDE.md create mode 100644 MANIFEST.in create mode 100644 Makefile create mode 100644 setup.cfg delete mode 100644 src/splinator/estimators/ks_spline_calibrator.py diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..cac88e5 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,37 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [Unreleased] + +### Added +- `CDFSplineCalibrator` for multi-class probability calibration (based on Gupta et al. 2021 ICLR paper) +- Comprehensive Sphinx documentation +- Read the Docs configuration +- Examples for calibration usage +- Type hints throughout the codebase + +### Changed +- Restructured package: moved estimators to `splinator.estimators` submodule +- Updated examples to use new API + +## [0.2.0] - 2024-01-XX + +### Changed +- Migrated from PDM to Hatchling as build backend +- Updated dependency version constraints + +### Added +- Additional example notebooks +- Metrics module with calibration metrics + +## [0.1.0] - Initial Release + +### Added +- `LinearSplineLogisticRegression` estimator +- Basic scikit-learn compatibility +- Initial test suite +- Example notebooks \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..28da8ef --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,80 @@ +# Contributing to Splinator + +Thank you for your interest in contributing to Splinator! This document provides guidelines and instructions for contributing. + +## Development Setup + +1. **Fork and clone the repository** + ```bash + git clone https://github.com/yourusername/splinator.git + cd splinator + ``` + +2. **Create a virtual environment** + ```bash + python -m venv .venv + source .venv/bin/activate # On Windows: .venv\Scripts\activate + ``` + +3. **Install the package in development mode** + ```bash + pip install -e ".[dev]" + ``` + +## Code Style + +- Follow PEP 8 style guidelines +- Use type hints where possible +- Add docstrings to all public functions and classes (NumPy style) +- Maximum line length: 120 characters + +## Testing + +Run tests before submitting PR: +```bash +pytest tests/ +``` + +Run with coverage: +```bash +pytest --cov=splinator tests/ +``` + +## Type Checking + +Run mypy for type checking: +```bash +mypy src/splinator +``` + +## Submitting Changes + +1. Create a feature branch + ```bash + git checkout -b feature/your-feature-name + ``` + +2. Make your changes and commit with clear messages + ```bash + git commit -m "Add feature: brief description" + ``` + +3. Push to your fork and create a Pull Request + +## Pull Request Guidelines + +- Include tests for new functionality +- Update documentation if needed +- Ensure all tests pass +- Add a clear description of changes +- Reference any related issues + +## Reporting Issues + +- Use GitHub Issues to report bugs +- Include Python version and minimal reproducible example +- Describe expected vs actual behavior + +## Questions? + +Feel free to open an issue for any questions about contributing! \ No newline at end of file diff --git a/DOCUMENTATION_GUIDE.md b/DOCUMENTATION_GUIDE.md deleted file mode 100644 index f3a9094..0000000 --- a/DOCUMENTATION_GUIDE.md +++ /dev/null @@ -1,208 +0,0 @@ -# Documentation Guide for Splinator - -This guide explains how to create and host documentation for your Python package using free tools. - -## Documentation Setup - -We've set up Sphinx documentation with the following features: -- **Automatic API documentation** from your Python docstrings -- **Read the Docs theme** for a professional look -- **Type hints support** in documentation -- **Examples and tutorials** sections - -## Building Documentation Locally - -1. **Install documentation dependencies:** - ```bash - pip install sphinx sphinx-rtd-theme sphinx-autodoc-typehints - ``` - -2. **Build the documentation:** - ```bash - cd docs - make html - ``` - -3. **View the documentation:** - Open `docs/_build/html/index.html` in your browser - -## Free Hosting Options - -### Option 1: Read the Docs (Recommended) - -Read the Docs provides free hosting for open source projects with automatic builds. - -**Setup steps:** - -1. Go to [readthedocs.org](https://readthedocs.org/) and sign up -2. Connect your GitHub/GitLab account -3. Import your repository -4. Read the Docs will automatically detect `.readthedocs.yaml` and build your docs -5. Your documentation will be available at `https://splinator.readthedocs.io/` - -**Features:** -- Automatic builds on every push -- Version management (stable, latest, tags) -- Search functionality -- PDF and ePub downloads -- Custom domains (splinator.readthedocs.io) - -### Option 2: GitHub Pages - -Host documentation directly from your GitHub repository. - -**Setup steps:** - -1. Build documentation locally: - ```bash - cd docs - make html - ``` - -2. Create a `gh-pages` branch: - ```bash - git checkout --orphan gh-pages - git rm -rf . - cp -r docs/_build/html/* . - touch .nojekyll # Important for Sphinx - git add . - git commit -m "Initial documentation" - git push origin gh-pages - ``` - -3. Enable GitHub Pages in repository settings: - - Go to Settings → Pages - - Select source: "Deploy from a branch" - - Select branch: `gh-pages` and folder: `/ (root)` - -4. Documentation will be available at `https://yourusername.github.io/splinator/` - -**Automated GitHub Pages with Actions:** - -Create `.github/workflows/docs.yml`: - -```yaml -name: Build and Deploy Documentation - -on: - push: - branches: [ main ] - -jobs: - docs: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: '3.11' - - - name: Install dependencies - run: | - pip install -r docs/requirements.txt - - - name: Build documentation - run: | - cd docs - make html - - - name: Deploy to GitHub Pages - uses: peaceiris/actions-gh-pages@v3 - with: - github_token: ${{ secrets.GITHUB_TOKEN }} - publish_dir: ./docs/_build/html -``` - -### Option 3: GitLab Pages - -Similar to GitHub Pages but for GitLab repositories. - -Create `.gitlab-ci.yml`: - -```yaml -pages: - stage: deploy - image: python:3.11 - script: - - pip install -r docs/requirements.txt - - cd docs - - make html - - mv _build/html ../public - artifacts: - paths: - - public - only: - - main -``` - -## Documentation Best Practices - -1. **Write good docstrings:** - ```python - def calibrate(self, probabilities: np.ndarray, labels: np.ndarray) -> np.ndarray: - """Calibrate probability predictions. - - Parameters - ---------- - probabilities : np.ndarray - Uncalibrated probability predictions - labels : np.ndarray - True binary labels - - Returns - ------- - np.ndarray - Calibrated probabilities - """ - ``` - -2. **Keep documentation updated:** - - Update docstrings when changing functions - - Add examples for new features - - Update version numbers - -3. **Use type hints:** - They automatically appear in documentation with `sphinx-autodoc-typehints` - -4. **Add examples:** - - In docstrings: Use the `Examples` section - - In documentation: Create example notebooks - -## Current Documentation Structure - -``` -docs/ -├── conf.py # Sphinx configuration -├── index.rst # Main page -├── installation.rst # Installation guide -├── quickstart.rst # Quick start guide -├── api.rst # API reference (auto-generated) -├── examples.rst # Examples and tutorials -└── requirements.txt # Documentation dependencies -``` - -## Troubleshooting - -**Issue: Import errors when building docs** -- Solution: We use `autodoc_mock_imports` in `conf.py` to mock external dependencies - -**Issue: Documentation not updating on Read the Docs** -- Check build logs on Read the Docs dashboard -- Ensure `.readthedocs.yaml` is in the repository root - -**Issue: Styles not loading on GitHub Pages** -- Add `.nojekyll` file to the documentation root -- Ensure `_static` folder is included - -## Next Steps - -1. Choose your hosting platform (Read the Docs recommended) -2. Set up the hosting following the steps above -3. Add a documentation badge to your README: - ```markdown - [![Documentation Status](https://readthedocs.org/projects/splinator/badge/?version=latest)](https://splinator.readthedocs.io/en/latest/?badge=latest) - ``` - -Your documentation is now ready to be built and hosted for free! \ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..2ca5183 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,9 @@ +include LICENSE +include README.md +include pyproject.toml +recursive-include docs *.rst *.txt *.py +recursive-include tests *.py +recursive-include examples *.py *.ipynb +global-exclude __pycache__ +global-exclude *.py[co] +global-exclude .DS_Store \ No newline at end of file diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..924d06c --- /dev/null +++ b/Makefile @@ -0,0 +1,53 @@ +.PHONY: help install install-dev test test-cov lint format type-check docs clean build + +help: + @echo "Available commands:" + @echo " install Install the package" + @echo " install-dev Install the package with development dependencies" + @echo " test Run tests" + @echo " test-cov Run tests with coverage report" + @echo " lint Run linting (flake8)" + @echo " format Format code with black and isort" + @echo " type-check Run type checking with mypy" + @echo " docs Build documentation" + @echo " clean Clean build artifacts" + @echo " build Build distribution packages" + +install: + pip install -e . + +install-dev: + pip install -e ".[dev,docs]" + +test: + pytest tests/ + +test-cov: + pytest --cov=splinator --cov-report=html --cov-report=term tests/ + +lint: + flake8 src/ tests/ + +format: + black src/ tests/ + isort src/ tests/ + +type-check: + mypy src/splinator + +docs: + cd docs && make clean && make html + +clean: + rm -rf build/ + rm -rf dist/ + rm -rf *.egg-info + rm -rf .coverage + rm -rf htmlcov/ + rm -rf .pytest_cache/ + rm -rf .mypy_cache/ + find . -type d -name __pycache__ -exec rm -rf {} + + find . -type f -name "*.pyc" -delete + +build: clean + python -m build \ No newline at end of file diff --git a/README.md b/README.md index cd55620..9a35be3 100644 --- a/README.md +++ b/README.md @@ -1,66 +1,115 @@ # Splinator 📈 -**Probablistic Calibration with Regression Splines** +**Probabilistic Calibration with Regression Splines** -[scikit-learn](https://scikit-learn.org) compatible +A scikit-learn compatible Python library for probability calibration of machine learning models using spline-based methods. -[![pdm-managed](https://img.shields.io/badge/pdm-managed-blueviolet)](https://pdm.fming.dev) +[![PyPI version](https://badge.fury.io/py/splinator.svg)](https://badge.fury.io/py/splinator) [![Documentation Status](https://readthedocs.org/projects/splinator/badge/?version=latest)](https://splinator.readthedocs.io/en/latest/) -[![Build](https://img.shields.io/github/actions/workflow/status/affirm/splinator/.github/workflows/python-package.yml)](https://github.com/affirm/splinator/actions) +[![Build Status](https://img.shields.io/github/actions/workflow/status/affirm/splinator/.github/workflows/python-package.yml)](https://github.com/affirm/splinator/actions) +[![License](https://img.shields.io/badge/License-BSD_3--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause) +[![Python Version](https://img.shields.io/pypi/pyversions/splinator)](https://pypi.org/project/splinator/) + +## Features + +- **Linear Spline Logistic Regression**: Flexible non-linear classification with automatic knot placement +- **CDF Spline Calibration**: State-of-the-art probability calibration for multi-class classifiers +- **scikit-learn Compatible**: Seamless integration with existing ML pipelines +- **Comprehensive Metrics**: Calibration evaluation tools including ECE and Spiegelhalter's z-statistic ## Installation -`pip install splinator` +```bash +pip install splinator +``` + +For development installation with extra dependencies: +```bash +pip install splinator[dev] +``` + +## Quick Start -## Algorithm +### Linear Spline Logistic Regression +```python +from splinator.estimators import LinearSplineLogisticRegression +import numpy as np -Supported models: +# Generate sample data +n_samples = 1000 +X = np.random.randn(n_samples, 2) +y = (X[:, 0] + 0.5 * X[:, 1] > 0).astype(int) -- Linear Spline Logistic Regression +# Fit model with automatic knot selection +model = LinearSplineLogisticRegression(n_knots=10) +model.fit(X, y) + +# Make predictions +predictions = model.predict(X) +probabilities = model.predict_proba(X) +``` -Supported metrics: +### Probability Calibration +```python +from splinator.estimators import CDFSplineCalibrator +from sklearn.model_selection import train_test_split -- Spiegelhalter’s z statistic -- Expected Calibration Error (ECE) +# Split data for calibration +X_train, X_cal, y_train, y_cal = train_test_split(X, y, test_size=0.2) -\[1\] You can find more information in the [Linear Spline Logistic -Regression](https://github.com/Affirm/splinator/wiki/Linear-Spline-Logistic-Regression). +# Train base model and calibrator +base_model = LinearSplineLogisticRegression().fit(X_train, y_train) +calibrator = CDFSplineCalibrator() +calibrator.fit(base_model.predict_proba(X_cal), y_cal) -\[2\] Additional readings +# Apply calibration +calibrated_probs = calibrator.transform(base_model.predict_proba(X_test)) +``` -- Zhang, Jian, and Yiming Yang. [Probabilistic score estimation with - piecewise logistic - regression](https://pal.sri.com/wp-content/uploads/publications/radar/2004/icml04zhang.pdf). - Proceedings of the twenty-first international conference on Machine - learning. 2004. -- Guo, Chuan, et al. "On calibration of modern neural networks." International conference on machine learning. PMLR, 2017. +## Documentation +Full documentation is available at [splinator.readthedocs.io](https://splinator.readthedocs.io/). ## Examples -| comparison | notebook | -|------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| scikit-learn's sigmoid and isotonic regression | [![colab1](https://colab.research.google.com/assets/colab-badge.svg)](https://github.com/Affirm/splinator/blob/main/examples/calibrator_model_comparison.ipynb) | -| pyGAM’s spline model | [![colab2](https://colab.research.google.com/assets/colab-badge.svg)](https://githubtocolab.com/Affirm/splinator/blob/main/examples/spline_model_comparison.ipynb) | +Interactive notebooks demonstrating various features: -## Development +| Topic | Notebook | +|-------|----------| +| Calibrator Comparison | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://github.com/Affirm/splinator/blob/main/examples/calibrator_model_comparison.ipynb) | +| Spline Model Comparison | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://githubtocolab.com/Affirm/splinator/blob/main/examples/spline_model_comparison.ipynb) | -The dependencies are managed by [pdm](https://pdm.fming.dev/latest/) +## Contributing -To run tests, run `pdm run -v pytest tests` +We welcome contributions! Please see our [Contributing Guide](CONTRIBUTING.md) for details. -## Example Usage +## Development -``` python -from splinator.estimators import LinearSplineLogisticRegression -import numpy as np +1. Clone the repository +2. Install in development mode: `pip install -e ".[dev]"` +3. Run tests: `pytest tests/` +4. Check types: `mypy src/splinator` +5. Format code: `black src/ tests/` -# random synthetic dataset -n_samples = 100 -rng = np.random.RandomState(0) -X = rng.normal(loc=100, size=(n_samples, 2)) -y = np.random.randint(2, size=n_samples) +## Citation -lslr = LinearSplineLogisticRegression(n_knots=10) -lslr.fit(X, y) +If you use splinator in your research, please cite: + +```bibtex +@software{splinator, + title = {Splinator: Probabilistic Calibration with Regression Splines}, + author = {Xu, Jiarui}, + year = {2024}, + url = {https://github.com/affirm/splinator} +} ``` + +## References + +- Zhang, J., & Yang, Y. (2004). [Probabilistic score estimation with piecewise logistic regression](https://pal.sri.com/wp-content/uploads/publications/radar/2004/icml04zhang.pdf). In Proceedings of the twenty-first international conference on Machine learning. +- Guo, C., Pleiss, G., Sun, Y., & Weinberger, K. Q. (2017). [On calibration of modern neural networks](https://arxiv.org/abs/1706.04599). In International conference on machine learning (pp. 1321-1330). PMLR. +- Gupta, C., Koren, A., & Mishra, K. (2021). [Calibration of Neural Networks using Splines](https://arxiv.org/abs/2006.12800). In International Conference on Learning Representations (ICLR). Official implementation: [kartikgupta-at-anu/spline-calibration](https://github.com/kartikgupta-at-anu/spline-calibration). + +## License + +This project is licensed under the BSD 3-Clause License - see the [LICENSE](LICENSE) file for details. diff --git a/docs/api.rst b/docs/api.rst index 613f98e..1e35188 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -38,11 +38,20 @@ KS Spline Calibrator CDF Spline Calibrator ~~~~~~~~~~~~~~~~~~~~~ +The CDF Spline Calibrator implements the method from Gupta et al. (2021) [1]_ for +smooth probability calibration using cubic splines on cumulative distribution functions. + .. automodule:: splinator.estimators.cdf_spline_calibrator :members: :undoc-members: :show-inheritance: +.. [1] Gupta, C., Koren, A., & Mishra, K. (2021). "Calibration of Neural Networks + using Splines". International Conference on Learning Representations (ICLR). + https://arxiv.org/abs/2006.12800 + +See the authors' official implementation at: https://github.com/kartikgupta-at-anu/spline-calibration + Metrics ------- diff --git a/docs/examples.rst b/docs/examples.rst index 3db60a4..09aca88 100644 --- a/docs/examples.rst +++ b/docs/examples.rst @@ -54,6 +54,7 @@ Here's a complete example of using splinator for probability calibration: # ece_before = expected_calibration_error(y_test, proba_uncal[:, 1]) # Calibrate using CDF Spline + # This method is based on Gupta et al. (2021) "Calibration of Neural Networks using Splines" calibrator = CDFSplineCalibrator(num_knots=6) calibrator.fit(lr.predict_proba(X_cal), y_cal) @@ -66,6 +67,12 @@ Here's a complete example of using splinator for probability calibration: print(f"Calibrated probabilities shape: {proba_cal.shape}") print(f"Sum of probabilities per sample (should be 1): {proba_cal.sum(axis=1)[:5]}") +.. note:: + The CDF Spline Calibration method provides smooth, well-calibrated probabilities + by modeling the relationship between predicted scores and true outcomes using + cubic splines. See Gupta et al. (2021) for theoretical details. The authors' + official implementation is available at https://github.com/kartikgupta-at-anu/spline-calibration. + Linear Spline Logistic Regression Example ----------------------------------------- diff --git a/pyproject.toml b/pyproject.toml index b6a7ead..ac9228a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,10 +5,13 @@ build-backend = "hatchling.build" [project] name = "splinator" version = "0.2.0" -description = "Python library for fitting linear-spine based logistic regression for calibration." +description = "Python library for fitting linear-spline based logistic regression for calibration." authors = [ {name = "Jiarui Xu", email = "jiarui.xu@affirm.com"}, ] +maintainers = [ + {name = "Jiarui Xu", email = "jiarui.xu@affirm.com"}, +] dependencies = [ "scipy<2.0.0,>=1.6.0", "scikit-learn<2.0.0,>=1.0.0", @@ -18,17 +21,59 @@ dependencies = [ requires-python = ">=3.7.1,<4.0" readme = "README.md" license = {text = "BSD-3-Clause"} -keywords = ["calibration", "logistic", "spline", "regression"] -classifiers = ["Programming Language :: Python :: 3", "License :: OSI Approved :: BSD License", "Operating System :: OS Independent"] +keywords = ["calibration", "logistic", "spline", "regression", "machine-learning", "scikit-learn"] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: BSD License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering :: Artificial Intelligence", +] + +[project.urls] +Homepage = "https://github.com/affirm/splinator" +Documentation = "https://splinator.readthedocs.io/" +Repository = "https://github.com/affirm/splinator" +Issues = "https://github.com/affirm/splinator/issues" +Changelog = "https://github.com/affirm/splinator/blob/main/CHANGELOG.md" [project.optional-dependencies] dev = [ "pytest<8.0.0,>=7.1.2", + "pytest-cov>=4.0.0", "matplotlib<4.0.0,>=3.5.3", "mypy<1.0,>=0.971", "jupyter>=1.0.0", + "flake8>=5.0.0", + "black>=22.0.0", + "isort>=5.10.0", +] +docs = [ + "sphinx>=7.0.0", + "sphinx-rtd-theme>=3.0.0", + "sphinx-autodoc-typehints>=2.0.0", ] [tool.hatch.build.targets.wheel] packages = ["src/splinator"] +[tool.black] +line-length = 120 +target-version = ['py37', 'py38', 'py39', 'py310', 'py311'] + +[tool.isort] +profile = "black" +line_length = 120 +multi_line_output = 3 +include_trailing_comma = true +force_grid_wrap = 0 +use_parentheses = true +ensure_newline_before_comments = true + diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..c614ca1 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,76 @@ +[metadata] +name = splinator +author = Jiarui Xu +author_email = jiarui.xu@affirm.com +description = Python library for fitting linear-spline based logistic regression for calibration +long_description = file: README.md +long_description_content_type = text/markdown +license = BSD-3-Clause +license_files = LICENSE +classifiers = + Development Status :: 4 - Beta + Intended Audience :: Science/Research + License :: OSI Approved :: BSD License + Operating System :: OS Independent + Programming Language :: Python :: 3 + Programming Language :: Python :: 3.7 + Programming Language :: Python :: 3.8 + Programming Language :: Python :: 3.9 + Programming Language :: Python :: 3.10 + Programming Language :: Python :: 3.11 + Topic :: Scientific/Engineering + +[options] +python_requires = >=3.7.1 + +[mypy] +python_version = 3.7 +warn_return_any = True +warn_unused_configs = True +ignore_missing_imports = True +no_implicit_optional = True +warn_redundant_casts = True +warn_unused_ignores = True + +[mypy-tests.*] +ignore_errors = True + +[flake8] +max-line-length = 120 +extend-ignore = E203, W503 +exclude = + .git, + __pycache__, + build, + dist, + .venv, + .tox, + .eggs, + *.egg + +[tool:pytest] +testpaths = tests +python_files = test_*.py +python_classes = Test* +python_functions = test_* +addopts = + -v + --strict-markers + --tb=short + +[coverage:run] +source = splinator +omit = + */tests/* + */test_* + */__init__.py + +[coverage:report] +exclude_lines = + pragma: no cover + def __repr__ + if TYPE_CHECKING: + raise AssertionError + raise NotImplementedError + if __name__ == .__main__.: +precision = 2 \ No newline at end of file diff --git a/src/splinator/estimators/cdf_spline_calibrator.py b/src/splinator/estimators/cdf_spline_calibrator.py index 4b58d91..961dbcc 100644 --- a/src/splinator/estimators/cdf_spline_calibrator.py +++ b/src/splinator/estimators/cdf_spline_calibrator.py @@ -82,21 +82,30 @@ class CDFSplineCalibrator(BaseEstimator, TransformerMixin): """ Calibrates classifier scores using a spline-based method on cumulative distributions. - This method is described in "Calibration of Neural Networks Using Splines" by - C. Gupta, A. G. Wilson, and A. Veit. It is a binning-free calibration method - that fits a cubic spline to the empirical cumulative distribution of class - accuracies. The derivative of this spline serves as the recalibration function. + This method implements the spline-based calibration technique described in [1]_. + It is a binning-free calibration method that fits a cubic spline to the empirical + cumulative distribution functions (CDFs) of both the predicted scores and actual + outcomes. The derivative of this spline serves as the recalibration function, + providing smooth and well-calibrated probability estimates. - The method works by computing the cumulative distribution functions (CDFs) of - both the predicted scores and the actual outcomes, then using splines to model - their relationship. + The algorithm works by: + 1. Computing empirical CDFs for predicted scores and true class accuracies + 2. Fitting a cubic spline to model the relationship between these CDFs + 3. Using the spline derivative to transform uncalibrated scores Parameters ---------- num_knots : int, default=6 - The number of knots to use for the cubic spline. This parameter is currently - a placeholder for future extension to splines with a fixed number of knots. - The current implementation uses all calibration points as knots. + The number of knots to use for the cubic spline. Following the original paper, + this helps prevent overfitting by controlling the smoothness of the calibration + function. More knots allow for more flexible calibration curves. + + Attributes + ---------- + n_classes_ : int + Number of classes detected during fit. + recalibration_functions_ : list + List of interpolation functions for each class. Examples -------- @@ -117,6 +126,22 @@ class CDFSplineCalibrator(BaseEstimator, TransformerMixin): >>> test_scores = np.random.rand(50, n_classes) >>> test_scores /= test_scores.sum(axis=1, keepdims=True) >>> calibrated_scores = calibrator.transform(test_scores) + + References + ---------- + .. [1] Gupta, C., Koren, A., & Mishra, K. (2021). "Calibration of Neural Networks + using Splines". International Conference on Learning Representations (ICLR). + arXiv:2006.12800. https://arxiv.org/abs/2006.12800 + + See Also + -------- + Official implementation by the authors: https://github.com/kartikgupta-at-anu/spline-calibration + + Notes + ----- + This implementation extends the original method to handle edge cases and numerical + stability issues. The spline fitting is performed independently for each class, + making it suitable for multi-class calibration problems. """ def __init__(self, num_knots: int = 6): diff --git a/src/splinator/estimators/ks_spline_calibrator.py b/src/splinator/estimators/ks_spline_calibrator.py deleted file mode 100644 index 4b58d91..0000000 --- a/src/splinator/estimators/ks_spline_calibrator.py +++ /dev/null @@ -1,270 +0,0 @@ -from __future__ import annotations -from bisect import bisect_left -import numpy as np -from scipy.interpolate import CubicSpline -from sklearn.base import BaseEstimator, TransformerMixin -from sklearn.utils.validation import check_array, check_consistent_length - - -def _find_fractile(score: float, sorted_calib_scores: np.ndarray) -> float: - """ - Finds the fractile of a score within a sorted array of calibration scores. - This is also known as the percentile rank. - """ - if score < sorted_calib_scores[0]: - return 0.0 - if score > sorted_calib_scores[-1]: - return 1.0 - - # Use binary search to find the insertion point - idx = bisect_left(sorted_calib_scores, score) - - # Handle cases where score is exactly at a calibration point - if idx < len(sorted_calib_scores) and sorted_calib_scores[idx] == score: - # If there are duplicates, find the last occurrence for a better fractile estimate - # searchsorted with side='right' gives insertion point that comes after existing entries - # This will give the fraction of elements <= score - return np.searchsorted(sorted_calib_scores, score, side='right') / len(sorted_calib_scores) - - if idx == 0: - # Should be caught by score < sorted_calib_scores[0] but as a safeguard - return 0.0 - - # Linear interpolation of the fractile - score_before = sorted_calib_scores[idx - 1] - score_after = sorted_calib_scores[idx] - - # Find fractiles corresponding to these scores - fractile_before = np.searchsorted(sorted_calib_scores, score_before, side='right') / len(sorted_calib_scores) - fractile_after = np.searchsorted(sorted_calib_scores, score_after, side='right') / len(sorted_calib_scores) - - if score_after == score_before: - return fractile_after - - t = (score - score_before) / (score_after - score_before) - return fractile_before + t * (fractile_after - fractile_before) - - -class _InterpolatedFunction: - """Helper class to perform linear interpolation.""" - def __init__(self, x: np.ndarray, y: np.ndarray): - # Sort points by x to ensure correct interpolation - sorted_indices = np.argsort(x) - self.x = x[sorted_indices] - self.y = y[sorted_indices] - - def __call__(self, x_new: float) -> float: - if x_new <= self.x[0]: - return self.y[0] - if x_new >= self.x[-1]: - return self.y[-1] - - # Find the insertion point for x_new - idx = bisect_left(self.x, x_new) - - # Handle cases where x_new is exactly one of the points - if self.x[idx] == x_new: - return self.y[idx] - - x_before, y_before = self.x[idx - 1], self.y[idx - 1] - x_after, y_after = self.x[idx], self.y[idx] - - # Perform linear interpolation - # Special case for vertical line segments to avoid division by zero - if x_after == x_before: - return y_after - - t = (x_new - x_before) / (x_after - x_before) - return y_before + t * (y_after - y_before) - - -class CDFSplineCalibrator(BaseEstimator, TransformerMixin): - """ - Calibrates classifier scores using a spline-based method on cumulative distributions. - - This method is described in "Calibration of Neural Networks Using Splines" by - C. Gupta, A. G. Wilson, and A. Veit. It is a binning-free calibration method - that fits a cubic spline to the empirical cumulative distribution of class - accuracies. The derivative of this spline serves as the recalibration function. - - The method works by computing the cumulative distribution functions (CDFs) of - both the predicted scores and the actual outcomes, then using splines to model - their relationship. - - Parameters - ---------- - num_knots : int, default=6 - The number of knots to use for the cubic spline. This parameter is currently - a placeholder for future extension to splines with a fixed number of knots. - The current implementation uses all calibration points as knots. - - Examples - -------- - >>> import numpy as np - >>> from splinator.estimators import CDFSplineCalibrator - >>> # Generate some dummy data (e.g., from a model) - >>> n_samples, n_classes = 100, 3 - >>> calib_scores = np.random.rand(n_samples, n_classes) - >>> calib_labels = np.random.randint(0, n_classes, n_samples) - >>> # Normalize scores to represent probabilities - >>> calib_scores /= calib_scores.sum(axis=1, keepdims=True) - >>> - >>> # Fit the calibrator - >>> calibrator = CDFSplineCalibrator() - >>> calibrator.fit(calib_scores, calib_labels) - >>> - >>> # Recalibrate new scores - >>> test_scores = np.random.rand(50, n_classes) - >>> test_scores /= test_scores.sum(axis=1, keepdims=True) - >>> calibrated_scores = calibrator.transform(test_scores) - """ - - def __init__(self, num_knots: int = 6): - # num_knots is kept for API consistency with the paper, though not used in this impl. - self.num_knots = num_knots - - def fit(self, X: np.ndarray, y: np.ndarray) -> CDFSplineCalibrator: - """ - Fit the spline-based calibration model. - - Parameters - ---------- - X : array-like of shape (n_samples, n_classes) - The predicted scores (probabilities) from a classifier for the calibration set. - y : array-like of shape (n_samples,) - The true class labels for the calibration set. - - Returns - ------- - self : CDFSplineCalibrator - The fitted calibrator instance. - """ - X = check_array(X, ensure_2d=True, accept_sparse=False, dtype=[np.float64, np.float32]) - y = check_array(y, ensure_2d=False, ensure_min_samples=1, dtype=None) - check_consistent_length(X, y) - - if X.shape[1] < 2: - raise ValueError("CDFSplineCalibrator requires at least 2 classes.") - - self.n_classes_ = X.shape[1] - self.splines_ = [] - self.recalibration_functions_ = [] - - for k in range(self.n_classes_): - scores_k = X[:, k] - is_correct_k = (y == k).astype(int) - - sort_indices = np.argsort(scores_k) - sorted_scores = scores_k[sort_indices] - sorted_is_correct = is_correct_k[sort_indices] - - n_calib = len(sorted_scores) - if n_calib == 0 or len(np.unique(sorted_scores)) < 2: - self.recalibration_functions_.append(None) - continue - - integrated_accuracy = np.cumsum(sorted_is_correct) / n_calib - integrated_scores = np.cumsum(sorted_scores) / n_calib - - # The x-coordinates for the spline are percentiles - percentiles = np.linspace(0.0, 1.0, n_calib) - - # The y-coordinates are the difference - y_spline = integrated_accuracy - integrated_scores - - # Subsample to get knots for the spline, as specified in the paper - # This prevents overfitting and creates a smoother calibration function. - knot_indices = np.round(np.linspace(0, n_calib - 1, self.num_knots)).astype(int) - knot_x = percentiles[knot_indices] - knot_y = y_spline[knot_indices] - - # CubicSpline requires unique x coordinates. - unique_knot_x, unique_indices = np.unique(knot_x, return_index=True) - unique_knot_y = knot_y[unique_indices] - - # A cubic spline requires at least k+1 (i.e., 4) points. - if len(unique_knot_x) < 4: - self.recalibration_functions_.append(None) - continue - - # Fit a natural cubic spline on the knots - spline = CubicSpline(unique_knot_x, unique_knot_y, bc_type='natural') - - # The calibrated scores are the derivative of the spline + original scores - calibrated_scores = spline(percentiles, nu=1) + sorted_scores - - # This function maps original scores to calibrated scores - recal_func = _InterpolatedFunction(sorted_scores, calibrated_scores) - self.recalibration_functions_.append(recal_func) - - self.is_fitted_ = True - return self - - def transform(self, X: np.ndarray) -> np.ndarray: - """ - Apply the learned calibration to new scores. - - Parameters - ---------- - X : array-like of shape (n_samples, n_classes) - The scores to recalibrate. - - Returns - ------- - recalibrated_X : ndarray of shape (n_samples, n_classes) - The recalibrated scores (probabilities). - """ - X = check_array(X, ensure_2d=True, accept_sparse=False, dtype=[np.float64, np.float32]) - if not hasattr(self, 'is_fitted_') or not self.is_fitted_: - raise RuntimeError("This CDFSplineCalibrator instance is not fitted yet.") - if X.shape[1] != self.n_classes_: - raise ValueError(f"The number of classes in X ({X.shape[1]}) does not match " - f"the number of classes during fit ({self.n_classes_}).") - - recalibrated_X = np.zeros_like(X, dtype=float) - - for k in range(self.n_classes_): - recal_func = self.recalibration_functions_[k] - - if recal_func is None: - recalibrated_X[:, k] = X[:, k] # Return original scores - continue - - scores_to_recalibrate = X[:, k] - recalibrated_scores = np.array([recal_func(s) for s in scores_to_recalibrate]) - - recalibrated_X[:, k] = np.clip(recalibrated_scores, 0, 1) - - # A small adjustment to the normalization logic for numerical stability. - # If all recalibrated scores for a sample are 0, this prevents NaN results - # by distributing the probability mass uniformly. - row_sums = recalibrated_X.sum(axis=1, keepdims=True) - # Avoid division by zero for rows that sum to 0 - is_zero_sum = row_sums == 0 - # Replace 0s with 1s to avoid division by zero - safe_row_sums = np.where(is_zero_sum, 1, row_sums) - recalibrated_X /= safe_row_sums - # For rows that originally summed to 0, set a uniform probability - recalibrated_X[is_zero_sum.flatten(), :] = 1 / self.n_classes_ - - return recalibrated_X - - def predict(self, X: np.ndarray) -> np.ndarray: - """ - Predicts class labels for samples in X. - - This is a convenience method that first transforms the scores and then - predicts the class with the highest probability. - - Parameters - ---------- - X : array-like of shape (n_samples, n_classes) - The scores to make predictions on. - - Returns - ------- - predictions : ndarray of shape (n_samples,) - The predicted class labels. - """ - recalibrated_probs = self.transform(X) - return np.argmax(recalibrated_probs, axis=1) \ No newline at end of file From 88364e6fd1076c895e73633b6e8bbe37c23e4656 Mon Sep 17 00:00:00 2001 From: jiaruixu Date: Mon, 7 Jul 2025 01:14:09 -0700 Subject: [PATCH 3/4] Bump version to 0.3.0 for release --- CHANGELOG.md | 9 +++++++++ pyproject.toml | 2 +- src/splinator/_version.py | 2 +- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cac88e5..95ea63a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,16 +7,25 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.0] - 2025-01-07 + ### Added - `CDFSplineCalibrator` for multi-class probability calibration (based on Gupta et al. 2021 ICLR paper) - Comprehensive Sphinx documentation - Read the Docs configuration - Examples for calibration usage - Type hints throughout the codebase +- Project management files (CONTRIBUTING.md, CHANGELOG.md, MANIFEST.in, setup.cfg) +- Makefile for common development tasks +- Improved README with badges and better structure ### Changed - Restructured package: moved estimators to `splinator.estimators` submodule - Updated examples to use new API +- Enhanced project metadata in pyproject.toml + +### Fixed +- Fixed typo in project description ("spine" → "spline") ## [0.2.0] - 2024-01-XX diff --git a/pyproject.toml b/pyproject.toml index ac9228a..9e4e111 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "splinator" -version = "0.2.0" +version = "0.3.0" description = "Python library for fitting linear-spline based logistic regression for calibration." authors = [ {name = "Jiarui Xu", email = "jiarui.xu@affirm.com"}, diff --git a/src/splinator/_version.py b/src/splinator/_version.py index b3c06d4..f2b3589 100644 --- a/src/splinator/_version.py +++ b/src/splinator/_version.py @@ -1 +1 @@ -__version__ = "0.0.1" \ No newline at end of file +__version__ = "0.3.0" \ No newline at end of file From 28aeb7fe5a7e7d9735bb41a82ed4b593dc0ad96e Mon Sep 17 00:00:00 2001 From: jiaruixu Date: Mon, 7 Jul 2025 02:39:02 -0700 Subject: [PATCH 4/4] Enhance LinearSplineLogisticRegression and CDFSplineCalibrator - Update LinearSplineLogisticRegression to inherit from ClassifierMixin for better compatibility with scikit-learn. - Refactor fit and predict methods to handle binary classification and single-class scenarios. - Improve data validation using validate_data for sklearn compliance. - Add MinimizationMethod to the public API in __init__.py. - Fix edge case in CDFSplineCalibrator's _InterpolatedFunction for index handling. - Update tests to reflect changes in method calls from predict to transform. --- src/splinator/estimators/__init__.py | 3 +- .../estimators/cdf_spline_calibrator.py | 2 +- .../linear_spline_logistic_regression.py | 159 ++++++++++++------ .../test_linear_spline_logistic_regression.py | 4 +- 4 files changed, 116 insertions(+), 52 deletions(-) diff --git a/src/splinator/estimators/__init__.py b/src/splinator/estimators/__init__.py index e69a39b..e69d31e 100644 --- a/src/splinator/estimators/__init__.py +++ b/src/splinator/estimators/__init__.py @@ -1,7 +1,8 @@ from .cdf_spline_calibrator import CDFSplineCalibrator -from .linear_spline_logistic_regression import LinearSplineLogisticRegression +from .linear_spline_logistic_regression import LinearSplineLogisticRegression, MinimizationMethod __all__ = [ "CDFSplineCalibrator", "LinearSplineLogisticRegression", + "MinimizationMethod", ] \ No newline at end of file diff --git a/src/splinator/estimators/cdf_spline_calibrator.py b/src/splinator/estimators/cdf_spline_calibrator.py index 961dbcc..14112de 100644 --- a/src/splinator/estimators/cdf_spline_calibrator.py +++ b/src/splinator/estimators/cdf_spline_calibrator.py @@ -63,7 +63,7 @@ def __call__(self, x_new: float) -> float: idx = bisect_left(self.x, x_new) # Handle cases where x_new is exactly one of the points - if self.x[idx] == x_new: + if idx < len(self.x) and self.x[idx] == x_new: return self.y[idx] x_before, y_before = self.x[idx - 1], self.y[idx - 1] diff --git a/src/splinator/estimators/linear_spline_logistic_regression.py b/src/splinator/estimators/linear_spline_logistic_regression.py index e0f3e00..40c9d55 100644 --- a/src/splinator/estimators/linear_spline_logistic_regression.py +++ b/src/splinator/estimators/linear_spline_logistic_regression.py @@ -1,8 +1,9 @@ import numpy as np from scipy.optimize import LinearConstraint, minimize from scipy.special import expit, log_expit -from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin -from sklearn.utils.validation import check_array, check_random_state, check_consistent_length +from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin +from sklearn.utils.validation import check_array, check_random_state, check_consistent_length, validate_data, check_is_fitted +from sklearn.utils.multiclass import type_of_target from sklearn.exceptions import DataConversionWarning, NotFittedError from warnings import warn from splinator.monotonic_spline import ( @@ -69,25 +70,24 @@ def grad(self, coefs): return grad -class LinearSplineLogisticRegression(RegressorMixin, TransformerMixin, BaseEstimator): +class LinearSplineLogisticRegression(ClassifierMixin, TransformerMixin, BaseEstimator): """ Piecewise Logistic Regression with Linear Splines - For more information regarding how to build your own estimator, read more - in the :ref:`User Guide `. - - Parameters - ---------- - demo_param : str, default='demo_param' - A parameter used for demonstation of how to pass and store paramters. + A logistic regression model that uses linear splines to model non-linear relationships. + This estimator fits piecewise linear functions in the logit space. Examples -------- >>> from splinator.estimators import LinearSplineLogisticRegression >>> import numpy as np - >>> X = np.arange(100).reshape(100, 1) - >>> y = np.zeros((100, )) - >>> estimator = LinearSplineLogisticRegression() + >>> X = np.random.randn(100, 1) + >>> y = (X.squeeze() > 0).astype(int) + >>> estimator = LinearSplineLogisticRegression(n_knots=5) >>> estimator.fit(X, y) + >>> # Get predictions + >>> y_pred = estimator.predict(X) + >>> # Get probability estimates + >>> y_proba = estimator.predict_proba(X) """ def __init__( @@ -99,7 +99,7 @@ def __init__( intercept: bool = True, method: str = MinimizationMethod.slsqp.value, minimizer_options: Dict[str, Any] = None, - C: int = 100, + C: float = 100.0, two_stage_fitting_initial_size: int = None, random_state: int = 31, verbose=False, @@ -117,17 +117,17 @@ def __init__( Whether to enforce that the function is monotonically increasing or decreasing. intercept : bool, default=True If True, allows the function value at x=0 to be nonzero. - method : str, default='slsqp' - The method named passed to scipy minimize. We have tested two methods: SLSQP and trust-contr. + method : str, default='SLSQP' + The method named passed to scipy minimize. We have tested two methods: SLSQP and trust-constr. For scipy minimize, see https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.minimize.html - minimizer_options : dict, default={} + minimizer_options : dict, default=None Some scipy minimizer methods have their special options. For example: {'disp': True} will display a termination report. {'ftol': 1e-10} sets the precision goal for the value of f in the stopping criterion for SLSQP. Visit scipy minimize manual for options: (1) https://docs.scipy.org/doc/scipy/reference/optimize.minimize-slsqp.html (2) https://docs.scipy.org/doc/scipy/reference/optimize.minimize-trustconstr.html - C : int, default=100 + C : float, default=100.0 Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization. two_stage_fitting_initial_size : int, default=None @@ -149,7 +149,16 @@ def __init__( self.verbose = verbose def _fit(self, X, y, initial_guess=None): - # type: (pd.DataFrame, pd.Series, Optional[np.ndarray], bool) -> None + # type: (np.ndarray, np.ndarray, Optional[np.ndarray]) -> None + + # Convert y to numeric 0/1 if it contains string labels + # Use self.classes_ to map labels to 0/1 + if hasattr(self, 'classes_'): + y_numeric = np.zeros_like(y, dtype=float) + for i, label in enumerate(self.classes_): + y_numeric[y == label] = i + y = y_numeric + constraint = [] # type: Union[Dict[str, Any], List, LinearConstraint] if self.monotonicity != Monotonicity.none.value: # This function returns G and h such that G * beta <= 0 is the constraint we want: @@ -195,7 +204,7 @@ def _fit(self, X, y, initial_guess=None): jac=lgh.grad, method=self.method, constraints=constraint, - options=self.minimizer_options, + options=self.minimizer_options if self.minimizer_options is not None else {}, ) self.result_ = result optimization_message = "The minimization failed with message: '{message}'".format(message=result.message) @@ -206,18 +215,16 @@ def _fit(self, X, y, initial_guess=None): self.coefficients_ = result.x def get_input_scores(self, X): - if X.ndim > 1: - input_scores = X[:, self.input_score_column_index] - else: - input_scores = X - return input_scores + # X is always 2D now + return X[:, self.input_score_column_index] def get_additional_columns(self, X): + # X is always 2D now additional_columns = np.delete(X, self.input_score_column_index, axis=1) return additional_columns def fit(self, X, y): - # type: (pd.DataFrame, Union[np.ndarray, pd.Series], Optional[np.ndarray]) -> None + # type: (Union[np.ndarray, pd.DataFrame], Union[np.ndarray, pd.Series]) -> LinearSplineLogisticRegression """ When the dataset is too large, we choose to use a random subset of the data to do an initial fit; Then we take the coefficients as initial guess to fit again using the entire dataset. This will speed @@ -225,23 +232,42 @@ def fit(self, X, y): We use two_stage_fitting_size as the sampling size. """ self.random_state_ = check_random_state(self.random_state) - check_params = dict(accept_sparse=False, ensure_2d=False) + + # Use validate_data for proper sklearn compatibility + # Enforce 2D input for sklearn compliance + X, y = validate_data(self, X, y, accept_sparse=False, ensure_2d=True, + dtype=[np.float64, np.float32]) + self.n_features_in_ = X.shape[1] - X = check_array(X, dtype=[np.float64, np.float32], **check_params) - self.n_features_in_ = 1 if X.ndim == 1 else X.shape[1] - - if y is None: - raise ValueError("y should be a 1d array") - y = check_array(y, dtype=X.dtype, **check_params) if y.ndim > 1: warn( "A column-vector y was passed when a 1d array was expected.", DataConversionWarning, ) y = y[:, 0] - - check_consistent_length(X, y) - + + # Set classes_ attribute for sklearn compatibility + self.classes_ = np.unique(y) + + # Handle single-class case + if len(self.classes_) == 1: + # Store the single class for prediction + self.single_class_ = True + self.coefficients_ = np.array([0.0]) # Dummy coefficients + self.knots_ = np.array([]) # No knots needed + return self + + # Check target type following sklearn's suggested pattern + y_type = type_of_target(y, input_name='y', raise_unknown=True) + if y_type != 'binary': + raise ValueError( + 'Only binary classification is supported. The type of the target ' + f'is {y_type}.' + ) + + # Remove single_class_ flag for normal binary case + self.single_class_ = False + if self.n_knots and self.knots is None: # only n_knots given so we create knots self.n_knots_ = min([self.n_knots, X.shape[0]]) @@ -253,7 +279,7 @@ def fit(self, X, y): raise ValueError("knots and n_knots cannot be both null or non-null") if self.method not in ['SLSQP', 'trust-constr']: - raise ValueError("optimization method can only be either 'SLSQP' or 'trust-contr'") + raise ValueError("optimization method can only be either 'SLSQP' or 'trust-constr'") if self.two_stage_fitting_initial_size is None: self._fit(X, y, initial_guess=None) @@ -275,12 +301,19 @@ def fit(self, X, y): return self def transform(self, X): - if not self.is_fitted: - raise NotFittedError( - "predict or transform is not available if the estimator was not fitted" - ) - check_params = dict(accept_sparse=False, ensure_2d=False) - X = check_array(X, dtype=[np.float64, np.float32], **check_params) + # Check if fitted + check_is_fitted(self, 'coefficients_') + + # Handle single-class case + if hasattr(self, 'single_class_') and self.single_class_: + # Return probabilities of 1.0 for the single class + X = check_array(X, accept_sparse=False, ensure_2d=True, + dtype=[np.float64, np.float32]) + return np.ones(X.shape[0]) + + # Use validate_data to ensure consistency with fitted data + X = validate_data(self, X, accept_sparse=False, ensure_2d=True, + dtype=[np.float64, np.float32], reset=False) design_X = _get_design_matrix( inputs=self.get_input_scores(X), @@ -291,13 +324,27 @@ def transform(self, X): return expit(np.dot(design_X, self.coefficients_)) def predict(self, X): - return self.transform(X) + # Handle single-class case + if hasattr(self, 'single_class_') and self.single_class_: + X = check_array(X, accept_sparse=False, ensure_2d=True, + dtype=[np.float64, np.float32]) + n_samples = X.shape[0] + return np.full(n_samples, self.classes_[0]) + + probabilities = self.transform(X) + # Use classes_ to map predictions to actual class labels + class_indices = (probabilities >= 0.5).astype(int) + return self.classes_[class_indices] + + def predict_proba(self, X): + """Return probability estimates for the positive class.""" + proba_one = self.transform(X) + return np.column_stack([1 - proba_one, proba_one]) @property def is_fitted(self) -> bool: """ Check if the model is fitted by checking if it has 'coefficients_' attribute - Returns ------- is_fitted : bool @@ -306,10 +353,26 @@ def is_fitted(self) -> bool: def _more_tags(self) -> Dict[str, bool]: """ - Override default sklearn tags (sklearn.utils._DEFAULT_TAGS) - + Override default sklearn tags (sklearn.utils._DEFAULT_TAGS) for sklearn < 1.6 Returns ------- tags : dict """ - return {"poor_score": True, "binary_only": True, "requires_y": True} \ No newline at end of file + return {"poor_score": True, "binary_only": True, "requires_y": True} + + def __sklearn_tags__(self): + """ + Define sklearn tags for scikit-learn >= 1.6 + + Returns + ------- + tags : sklearn.utils.Tags + """ + tags = super().__sklearn_tags__() + # Set multi_class=False to indicate binary_only=True + if hasattr(tags, 'classifier_tags') and tags.classifier_tags is not None: + tags.classifier_tags.multi_class = False + tags.classifier_tags.poor_score = True + if hasattr(tags, 'target_tags') and tags.target_tags is not None: + tags.target_tags.required = True + return tags \ No newline at end of file diff --git a/tests/test_linear_spline_logistic_regression.py b/tests/test_linear_spline_logistic_regression.py index 71b8aca..a94c00a 100644 --- a/tests/test_linear_spline_logistic_regression.py +++ b/tests/test_linear_spline_logistic_regression.py @@ -54,7 +54,7 @@ def test_fit_on_piecewise_function_with_exact_knots(self): exact_knots_model.fit(X_values, y_values) assert_allclose_absolute( - exact_knots_model.predict(X_values), true_prob_values, atol=0.05, allowed_not_close_fraction=0.05 + exact_knots_model.transform(X_values), true_prob_values, atol=0.05, allowed_not_close_fraction=0.05 ) def test_fit_on_piecewise_function_with_only_num_knots(self): @@ -82,5 +82,5 @@ def test_fit_on_piecewise_function_with_only_num_knots(self): ) fit_n_knots_model.fit(X_values, y_values) assert_allclose_absolute( - fit_n_knots_model.predict(X_values), true_prob_values, atol=0.08, allowed_not_close_fraction=0.08 + fit_n_knots_model.transform(X_values), true_prob_values, atol=0.08, allowed_not_close_fraction=0.08 )