From 522de043dad706037ec2480ff33001910ac66c6e Mon Sep 17 00:00:00 2001
From: Boris MUZELLEC <boris.muzellec@owkin.com>
Date: Tue, 13 Feb 2024 16:08:51 +0100
Subject: [PATCH 01/15] refactor: remove statsmodels with custom gamma GLM
 implementation for parametric trend curve in default inference model

---
 pydeseq2/default_inference.py | 42 ++++++++++++++++++++++-------------
 1 file changed, 27 insertions(+), 15 deletions(-)

diff --git a/pydeseq2/default_inference.py b/pydeseq2/default_inference.py
index 29ebb65c..5fff7462 100644
--- a/pydeseq2/default_inference.py
+++ b/pydeseq2/default_inference.py
@@ -1,22 +1,17 @@
-import warnings
 from typing import Literal
 from typing import Optional
 from typing import Tuple
 
 import numpy as np
 import pandas as pd
-import statsmodels.api as sm  # type: ignore
 from joblib import Parallel  # type: ignore
 from joblib import delayed
 from joblib import parallel_backend
-from statsmodels.tools.sm_exceptions import DomainWarning  # type: ignore
+from scipy.optimize import minimize  # type: ignore
 
 from pydeseq2 import inference
 from pydeseq2 import utils
 
-# Ignore DomainWarning raised by statsmodels when fitting a Gamma GLM with identity link.
-warnings.simplefilter("ignore", DomainWarning)
-
 
 class DefaultInference(inference.Inference):
     """Default DESeq2-related inference methods, using scipy/sklearn/numpy.
@@ -207,17 +202,34 @@ def wald_test(  # noqa: D102
     def dispersion_trend_gamma_glm(  # noqa: D102
         self, covariates: pd.Series, targets: pd.Series
     ) -> Tuple[np.ndarray, np.ndarray]:
-        covariates_w_intercept = sm.add_constant(covariates)
-        targets_fit = targets.values
+        covariates_w_intercept = covariates.to_frame()
+        covariates_w_intercept["intercept"] = 1
         covariates_fit = covariates_w_intercept.values
-        glm_gamma = sm.GLM(
-            targets_fit,
-            covariates_fit,
-            family=sm.families.Gamma(link=sm.families.links.identity()),
+        targets_fit = targets.values
+
+        def loss(coeffs):
+            mu = covariates_fit @ coeffs
+            return (targets_fit / mu + np.log(mu)).mean()
+
+        def grad(coeffs):
+            mu = covariates_fit @ coeffs
+            return -(
+                ((targets_fit / mu - 1)[:, None] * covariates_fit) / mu[:, None]
+            ).mean(0)
+
+        res = minimize(
+            loss,
+            x0=np.array([1.0, 1.0]),
+            jac=grad,
+            method="L-BFGS-B",
+            bounds=[(0, np.inf)],
         )
-        res = glm_gamma.fit()
-        coeffs = res.params
-        return (coeffs, covariates_fit @ coeffs)
+
+        if not res.success:
+            raise ValueError("Gamma GLM optimization failed.")
+
+        coeffs = res.x
+        return coeffs, covariates_fit @ coeffs
 
     def lfc_shrink_nbinom_glm(  # noqa: D102
         self,

From 51d422e30d24cc246763739f3b39daebca9c9378 Mon Sep 17 00:00:00 2001
From: Boris MUZELLEC <boris.muzellec@owkin.com>
Date: Tue, 13 Feb 2024 17:02:48 +0100
Subject: [PATCH 02/15] fix: use consistent ordering for trend curve
 coefficients

---
 pydeseq2/default_inference.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pydeseq2/default_inference.py b/pydeseq2/default_inference.py
index 5fff7462..8a3985be 100644
--- a/pydeseq2/default_inference.py
+++ b/pydeseq2/default_inference.py
@@ -203,7 +203,7 @@ def dispersion_trend_gamma_glm(  # noqa: D102
         self, covariates: pd.Series, targets: pd.Series
     ) -> Tuple[np.ndarray, np.ndarray]:
         covariates_w_intercept = covariates.to_frame()
-        covariates_w_intercept["intercept"] = 1
+        covariates_w_intercept.insert(0, "intercept", 1)
         covariates_fit = covariates_w_intercept.values
         targets_fit = targets.values
 

From a72bec936f5422e4323fe5ed2c1c061e87fb4aa0 Mon Sep 17 00:00:00 2001
From: Boris MUZELLEC <boris.muzellec@owkin.com>
Date: Tue, 13 Feb 2024 17:10:15 +0100
Subject: [PATCH 03/15] feat: catch gamma GLM failure and switch to mean fit

---
 pydeseq2/dds.py               | 69 ++++++++++++++++++++++-------------
 pydeseq2/default_inference.py |  2 +-
 2 files changed, 44 insertions(+), 27 deletions(-)

diff --git a/pydeseq2/dds.py b/pydeseq2/dds.py
index 665ef5b9..fed7e145 100644
--- a/pydeseq2/dds.py
+++ b/pydeseq2/dds.py
@@ -343,7 +343,7 @@ def vst_fit(
         if use_design:
             # Check that the dispersion trend curve was fitted. If not, fit it.
             # This will call previous functions in a cascade.
-            if "trend_coeffs" not in self.uns:
+            if "disp_function" not in self.uns:
                 self.fit_dispersion_trend()
         else:
             # Reduce the design matrix to an intercept and reconstruct at the end
@@ -601,39 +601,57 @@ def fit_dispersion_trend(self) -> None:
         # Initialize coefficients
         old_coeffs = pd.Series([0.1, 0.1])
         coeffs = pd.Series([1.0, 1.0])
+        try:
+            while (np.log(np.abs(coeffs / old_coeffs)) ** 2).sum() >= 1e-6:
+                old_coeffs = coeffs
+                coeffs, predictions = self.inference.dispersion_trend_gamma_glm(
+                    covariates, targets
+                )
+                # Filter out genes that are too far away from the curve before refitting
+                pred_ratios = (
+                    self[:, covariates.index].varm["genewise_dispersions"] / predictions
+                )
 
-        while (np.log(np.abs(coeffs / old_coeffs)) ** 2).sum() >= 1e-6:
-            old_coeffs = coeffs
-            coeffs, predictions = self.inference.dispersion_trend_gamma_glm(
-                covariates, targets
-            )
-            # Filter out genes that are too far away from the curve before refitting
-            pred_ratios = (
-                self[:, covariates.index].varm["genewise_dispersions"] / predictions
+                targets.drop(
+                    targets[(pred_ratios < 1e-4) | (pred_ratios >= 15)].index,
+                    inplace=True,
+                )
+                covariates.drop(
+                    covariates[(pred_ratios < 1e-4) | (pred_ratios >= 15)].index,
+                    inplace=True,
+                )
+            self.uns["trend_coeffs"] = pd.Series(coeffs, index=["a0", "a1"])
+
+            self.varm["fitted_dispersions"] = np.full(self.n_vars, np.NaN)
+            self.uns["disp_function"] = lambda x: dispersion_trend(
+                x, self.uns["trend_coeffs"]
             )
+            self.varm["fitted_dispersions"][self.varm["non_zero"]] = self.uns[
+                "disp_function"
+            ](self.varm["_normed_means"][self.varm["non_zero"]])
 
-            targets.drop(
-                targets[(pred_ratios < 1e-4) | (pred_ratios >= 15)].index,
-                inplace=True,
+        except RuntimeError:
+            warnings.warn(
+                "The dispersion trend curve fitting did not converge. "
+                "Switching to a mean-based dispersion trend.",
+                UserWarning,
+                stacklevel=2,
             )
-            covariates.drop(
-                covariates[(pred_ratios < 1e-4) | (pred_ratios >= 15)].index,
-                inplace=True,
+            mean_disp = trim_mean(
+                self.varm["genewise_dispersions"][
+                    self.varm["genewise_dispersions"] > 10 * self.min_disp
+                ],
+                proportiontocut=0.001,
             )
 
+            self.uns["disp_function"] = lambda x: mean_disp
+            self.varm["fitted_dispersions"] = np.full(self.n_vars, mean_disp)
+
         end = time.time()
 
         if not self.quiet:
             print(f"... done in {end - start:.2f} seconds.\n", file=sys.stderr)
 
-        self.uns["trend_coeffs"] = pd.Series(coeffs, index=["a0", "a1"])
-
-        self.varm["fitted_dispersions"] = np.full(self.n_vars, np.NaN)
-        self.varm["fitted_dispersions"][self.varm["non_zero"]] = dispersion_trend(
-            self.varm["_normed_means"][self.varm["non_zero"]],
-            self.uns["trend_coeffs"],
-        )
-
     def fit_dispersion_prior(self) -> None:
         """Fit dispersion variance priors and standard deviation of log-residuals.
 
@@ -1001,11 +1019,10 @@ def _refit_without_outliers(
 
         # Compute trend dispersions.
         # Note: the trend curve is not refitted.
-        sub_dds.uns["trend_coeffs"] = self.uns["trend_coeffs"]
+        sub_dds.uns["disp_function"] = self.uns["disp_function"]
         sub_dds.varm["_normed_means"] = sub_dds.layers["normed_counts"].mean(0)
-        sub_dds.varm["fitted_dispersions"] = dispersion_trend(
+        sub_dds.varm["fitted_dispersions"] = self.uns["disp_function"](
             sub_dds.varm["_normed_means"],
-            sub_dds.uns["trend_coeffs"],
         )
 
         # Estimate MAP dispersions.
diff --git a/pydeseq2/default_inference.py b/pydeseq2/default_inference.py
index 8a3985be..975375cd 100644
--- a/pydeseq2/default_inference.py
+++ b/pydeseq2/default_inference.py
@@ -226,7 +226,7 @@ def grad(coeffs):
         )
 
         if not res.success:
-            raise ValueError("Gamma GLM optimization failed.")
+            raise RuntimeError("Gamma GLM optimization failed.")
 
         coeffs = res.x
         return coeffs, covariates_fit @ coeffs

From ff59e411a02c437ee052f0b929393965189d5d90 Mon Sep 17 00:00:00 2001
From: Boris MUZELLEC <boris.muzellec@owkin.com>
Date: Tue, 13 Feb 2024 17:12:20 +0100
Subject: [PATCH 04/15] build: remove statsmodels dependency

---
 README.md      | 1 -
 pyproject.toml | 4 ----
 setup.py       | 1 -
 3 files changed, 6 deletions(-)

diff --git a/README.md b/README.md
index 9eadcbdf..896f24b6 100644
--- a/README.md
+++ b/README.md
@@ -71,7 +71,6 @@ For reference, the code is being tested in a github workflow (CI) with python
 - pandas 1.4.3
 - scikit-learn 1.1.1
 - scipy 1.8.1
-- statsmodels 0.13.2
 ```
 
 Please don't hesitate to open an issue in case you encounter any issue due to possible deprecations.
diff --git a/pyproject.toml b/pyproject.toml
index 267a8c30..79205547 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -47,9 +47,5 @@ convention = "numpy"
 [tool.pytest.ini_options]
 filterwarnings = [
     "error",
-    "ignore::DeprecationWarning:patsy[.*]", # due to patsy being a dependency of statsmodels and
-    # calling pandas.core.dtypes.common.is_categorical_dtype.
-    "ignore::DeprecationWarning:statsmodels", # ignore Pyarrow deprecation warnings
-    "ignore::statsmodels.tools.sm_exceptions.DomainWarning", # due to fitting the dipersion curve with the identity link
     "ignore::FutureWarning", # Ignore AnnData FutureWarning about implicit conversion
 ]
diff --git a/setup.py b/setup.py
index 5bb3269c..69875617 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,6 @@
         "pandas>=1.4.0",
         "scikit-learn>=1.1.0",
         "scipy>=1.8.0",
-        "statsmodels",
         "matplotlib>=3.6.2",  # not sure why sphinx_gallery does not work without it
     ],  # external packages as dependencies
     extras_require={

From 6edefb34df3030f996fc65f3bb0e44599fbc032b Mon Sep 17 00:00:00 2001
From: Boris MUZELLEC <boris.muzellec@owkin.com>
Date: Tue, 13 Feb 2024 17:18:54 +0100
Subject: [PATCH 05/15] CI: ignore deprecation warning raised by pandas
 regarding Pyarrow.

---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index 79205547..2606c790 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -47,5 +47,6 @@ convention = "numpy"
 [tool.pytest.ini_options]
 filterwarnings = [
     "error",
+    "ignore::DeprecationWarning:pandas", # ignore Pyarrow deprecation warnings
     "ignore::FutureWarning", # Ignore AnnData FutureWarning about implicit conversion
 ]

From baf3e457b6d095c413ab8f0c0bff51cd9e517799 Mon Sep 17 00:00:00 2001
From: Boris MUZELLEC <boris.muzellec@owkin.com>
Date: Tue, 13 Feb 2024 17:28:12 +0100
Subject: [PATCH 06/15] CI: ignore deprecation warning raised by pandas
 regarding Pyarrow.

---
 pyproject.toml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 2606c790..e30e39a8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -47,6 +47,7 @@ convention = "numpy"
 [tool.pytest.ini_options]
 filterwarnings = [
     "error",
-    "ignore::DeprecationWarning:pandas", # ignore Pyarrow deprecation warnings
+    '''ignore:\s*Pyarrow will become a required dependency of pandas:DeprecationWarning''',
+    # ignore Pyarrow deprecation warnings
     "ignore::FutureWarning", # Ignore AnnData FutureWarning about implicit conversion
 ]

From e1814332f561e33382da1587be2ab08b896e72fd Mon Sep 17 00:00:00 2001
From: Boris MUZELLEC <boris.muzellec@owkin.com>
Date: Wed, 14 Feb 2024 10:12:28 +0100
Subject: [PATCH 07/15] fix: switch to mean fit if all coeffs are 0

---
 pydeseq2/dds.py   |  4 ++-
 pydeseq2/ds.py    | 21 +++++-------
 pydeseq2/utils.py | 86 +++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 91 insertions(+), 20 deletions(-)

diff --git a/pydeseq2/dds.py b/pydeseq2/dds.py
index fed7e145..fb2b0e3b 100644
--- a/pydeseq2/dds.py
+++ b/pydeseq2/dds.py
@@ -602,7 +602,9 @@ def fit_dispersion_trend(self) -> None:
         old_coeffs = pd.Series([0.1, 0.1])
         coeffs = pd.Series([1.0, 1.0])
         try:
-            while (np.log(np.abs(coeffs / old_coeffs)) ** 2).sum() >= 1e-6:
+            while (coeffs > 0).all() and (
+                np.log(np.abs(coeffs / old_coeffs)) ** 2
+            ).sum() >= 1e-6:
                 old_coeffs = coeffs
                 coeffs, predictions = self.inference.dispersion_trend_gamma_glm(
                     covariates, targets
diff --git a/pydeseq2/ds.py b/pydeseq2/ds.py
index 9693c318..a86a9844 100644
--- a/pydeseq2/ds.py
+++ b/pydeseq2/ds.py
@@ -7,14 +7,14 @@
 # import anndata as ad
 import numpy as np
 import pandas as pd
-import statsmodels.api as sm  # type: ignore
 from scipy.optimize import root_scalar  # type: ignore
 from scipy.stats import f  # type: ignore
-from statsmodels.stats.multitest import multipletests  # type: ignore
+from scipy.stats import false_discovery_control  # type: ignore
 
 from pydeseq2.dds import DeseqDataSet
 from pydeseq2.default_inference import DefaultInference
 from pydeseq2.inference import Inference
+from pydeseq2.utils import lowess
 from pydeseq2.utils import make_MA_plot
 from pydeseq2.utils import n_or_more_replicates
 
@@ -526,18 +526,15 @@ def _independent_filtering(self) -> None:
             use = (self.base_mean >= cutoff) & (~self.p_values.isna())
             U2 = self.p_values[use]
             if not U2.empty:
-                result.loc[use, i] = multipletests(
-                    U2, alpha=self.alpha, method="fdr_bh"
-                )[1]
-
-        num_rej = (result < self.alpha).sum(0)
-        lowess = sm.nonparametric.lowess(num_rej, theta, frac=1 / 5)
+                result.loc[use, i] = false_discovery_control(U2, method="bh")
+        num_rej = (result < self.alpha).sum(0).values
+        lowess_res = lowess(num_rej, theta, frac=1 / 5)
 
         if num_rej.max() <= 10:
             j = 0
         else:
-            residual = num_rej[num_rej > 0] - lowess[num_rej > 0, 1]
-            thresh = lowess[:, 1].max() - np.sqrt(np.mean(residual**2))
+            residual = num_rej[num_rej > 0] - lowess_res[num_rej > 0, 1]
+            thresh = lowess_res[:, 1].max() - np.sqrt(np.mean(residual**2))
 
             if np.any(num_rej > thresh):
                 j = np.where(num_rej > thresh)[0][0]
@@ -557,8 +554,8 @@ def _p_value_adjustment(self) -> None:
             self.run_wald_test()
 
         self.padj = pd.Series(np.nan, index=self.dds.var_names)
-        self.padj.loc[~self.p_values.isna()] = multipletests(
-            self.p_values.dropna(), alpha=self.alpha, method="fdr_bh"
+        self.padj.loc[~self.p_values.isna()] = false_discovery_control(
+            self.p_values.dropna(), method="bh"
         )[1]
 
     def _cooks_filtering(self) -> None:
diff --git a/pydeseq2/utils.py b/pydeseq2/utils.py
index 0f04e0f5..648ed927 100644
--- a/pydeseq2/utils.py
+++ b/pydeseq2/utils.py
@@ -1,5 +1,6 @@
 import multiprocessing
 import warnings
+from math import ceil
 from math import floor
 from pathlib import Path
 from typing import List
@@ -49,7 +50,8 @@ def load_example_data(
 
     debug : bool
         If true, subsample 10 samples and 100 genes at random.
-        (Note that the "synthetic" dataset is already 10 x 100.) (default: ``False``).
+        (Note that the "synthetic" dataset is already 10 features 100.)
+        (default: ``False``).
 
     debug_seed : int
         Seed for the debug mode. (default: ``42``).
@@ -731,7 +733,7 @@ def trimmed_mean(x, trim: float = 0.1, **kwargs) -> Union[float, np.ndarray]:
 
     Parameters
     ----------
-    x : ndarray
+    features : ndarray
         Data whose mean to compute.
 
     trim : float
@@ -815,7 +817,7 @@ def trimmed_variance(
 
     Parameters
     ----------
-    x : ndarray
+    features : ndarray
         Data whose trimmed variance to compute.
 
     trim : float
@@ -1368,7 +1370,7 @@ def mean_absolute_deviation(x: np.ndarray) -> float:
 
     Parameters
     ----------
-    x : ndarray
+    features : ndarray
         1D array whose MAD to compute.
 
     Returns
@@ -1399,13 +1401,13 @@ def make_scatter(
         List of ndarrays to plot.
 
     legend_labels : list
-        List of strings that correspond to plotted y values for legend.
+        List of strings that correspond to plotted targets values for legend.
 
     x_val : ndarray
         1D array to plot (example: ``dds.varm['_normed_means']``).
 
     log : bool
-        Whether or not to log scale x and y axes (``default=True``).
+        Whether or not to log scale features and targets axes (``default=True``).
 
     save_path : str or None
         The path where to save the plot. If left None, the plot won't be saved
@@ -1475,7 +1477,7 @@ def make_MA_plot(
         P-value threshold to subset scatterplot colors on.
 
     log : bool
-        Whether or not to log scale x and y axes (``default=True``).
+        Whether or not to log scale features and targets axes (``default=True``).
 
     save_path : str or None
         The path where to save the plot. If left None, the plot won't be saved
@@ -1520,3 +1522,73 @@ def make_MA_plot(
 
     if save_path is not None:
         plt.savefig(save_path, bbox_inches="tight")
+
+
+# Authors: Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
+#
+# License: BSD (3-clause)
+
+# Adapted from https://gist.github.com/agramfort/850437
+
+
+def lowess(
+    features: np.ndarray, targets: np.ndarray, frac: float = 2.0 / 3.0, iter: int = 3
+):
+    """Run lowess smoothing: Robust locally weighted regression.
+
+    The lowess function fits a nonparametric regression curve to a scatterplot.
+    The arrays features and targets contain an equal number of elements; each pair
+    (features[i], targets[i]) defines a data point in the scatterplot. The function
+    returns the estimated (smooth) values of targets.
+    The smoothing span is given by f. A larger value for f will result in a
+    smoother curve. The number of robustifying iterations is given by iter. The
+    function will run faster with a smaller number of iterations.
+
+    Parameters
+    ----------
+    features : ndarray
+        Data points.
+    targets : ndarray
+        Target values.
+    frac : float
+        The fraction of the data used when estimating each y-value. (default: ``2/3``).
+    iter : int
+        The number of robustifying iterations. (default: ``3``).
+
+    Returns
+    -------
+    ndarray
+        Estimated (smooth) values of targets.
+    """
+    n = len(features)
+    r = int(ceil(frac * n))
+    h = np.maximum(
+        np.array([np.sort(np.abs(features - features[i]))[r] for i in range(n)]), 1e-12
+    )
+    w = np.clip(
+        np.abs(np.nan_to_num((features[:, None] - features[None, :]) / h)), 0.0, 1.0
+    )
+    w = (1 - w**3) ** 3
+    yest = np.zeros(n)
+    delta = np.ones(n)
+    for _ in range(iter):
+        for i in range(n):
+            weights = delta * w[:, i]
+            b = np.array(
+                [np.sum(weights * targets), np.sum(weights * targets * features)]
+            )
+            A = np.array(
+                [
+                    [np.sum(weights), np.sum(weights * features)],
+                    [np.sum(weights * features), np.sum(weights * features * features)],
+                ]
+            )
+            beta = np.linalg.lstsq(A, b)[0]
+            yest[i] = beta[0] + beta[1] * features[i]
+
+        residuals = targets - yest
+        s = np.median(np.abs(residuals))
+        delta = np.clip(residuals / (6.0 * s), -1, 1)
+        delta = (1 - delta**2) ** 2
+
+    return yest

From f3d74b5fac496bda1269e10515f7577c08741562 Mon Sep 17 00:00:00 2001
From: Boris MUZELLEC <boris.muzellec@owkin.com>
Date: Wed, 14 Feb 2024 10:29:35 +0100
Subject: [PATCH 08/15] test: catch both warnings in test_zero_inflated

---
 tests/test_edge_cases.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/test_edge_cases.py b/tests/test_edge_cases.py
index 5a2b8a71..e0fa7a21 100644
--- a/tests/test_edge_cases.py
+++ b/tests/test_edge_cases.py
@@ -468,7 +468,9 @@ def test_zero_inflated():
     counts_df.iloc[idx, :] = 0
 
     dds = DeseqDataSet(counts=counts_df, metadata=metadata)
-    with pytest.warns(RuntimeWarning):
+    with pytest.warns(RuntimeWarning), pytest.warns(UserWarning):
+        # Will warn that each gene contains a zero
+        # and that the parametric curve is not a good fit
         dds.deseq2()
 
 

From e66321b01a37deacfd10b70155f31df700d9af35 Mon Sep 17 00:00:00 2001
From: Boris MUZELLEC <boris.muzellec@owkin.com>
Date: Wed, 14 Feb 2024 10:42:13 +0100
Subject: [PATCH 09/15] build: bump scipy version requirement to get access to
 false_discovery_control

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 69875617..de697701 100644
--- a/setup.py
+++ b/setup.py
@@ -31,7 +31,7 @@
         "numpy>=1.23.0",
         "pandas>=1.4.0",
         "scikit-learn>=1.1.0",
-        "scipy>=1.8.0",
+        "scipy>=1.11.0",
         "matplotlib>=3.6.2",  # not sure why sphinx_gallery does not work without it
     ],  # external packages as dependencies
     extras_require={

From 3306d1b09c7a89ea3728c8b7ad8a988fc5690bd6 Mon Sep 17 00:00:00 2001
From: Boris MUZELLEC <boris.muzellec@owkin.com>
Date: Wed, 14 Feb 2024 10:46:30 +0100
Subject: [PATCH 10/15] build: BREAKING CHANGE deprecate python 3.8

---
 .github/workflows/pr_validation.yml | 2 --
 setup.py                            | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/.github/workflows/pr_validation.yml b/.github/workflows/pr_validation.yml
index 4c14dfcb..4c55eebc 100644
--- a/.github/workflows/pr_validation.yml
+++ b/.github/workflows/pr_validation.yml
@@ -17,8 +17,6 @@ jobs:
     strategy:
       matrix:
         include:
-          - os: ubuntu-latest
-            python: "3.8"
           - os: ubuntu-latest
             python: "3.9"
           - os: ubuntu-latest
diff --git a/setup.py b/setup.py
index de697701..ee560ea2 100644
--- a/setup.py
+++ b/setup.py
@@ -18,7 +18,7 @@
 setup(
     name="pydeseq2",
     version=about["__version__"],
-    python_requires=">=3.8.0",
+    python_requires=">=3.9.0",
     license="MIT",
     description="A python implementation of DESeq2.",
     long_description=readme,

From fdddb4cdf2b0434659b304d13b13cb7959e2ea6c Mon Sep 17 00:00:00 2001
From: Boris MUZELLEC <boris.muzellec@owkin.com>
Date: Wed, 14 Feb 2024 10:51:46 +0100
Subject: [PATCH 11/15] refactor: switch from RuntimeWarning to UserWarning

---
 pydeseq2/dds.py          | 2 +-
 tests/test_edge_cases.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pydeseq2/dds.py b/pydeseq2/dds.py
index fb2b0e3b..1ed249db 100644
--- a/pydeseq2/dds.py
+++ b/pydeseq2/dds.py
@@ -477,7 +477,7 @@ def fit_size_factors(
             warnings.warn(
                 "Every gene contains at least one zero, "
                 "cannot compute log geometric means. Switching to iterative mode.",
-                RuntimeWarning,
+                UserWarning,
                 stacklevel=2,
             )
             self._fit_iterate_size_factors()
diff --git a/tests/test_edge_cases.py b/tests/test_edge_cases.py
index e0fa7a21..2746e7b3 100644
--- a/tests/test_edge_cases.py
+++ b/tests/test_edge_cases.py
@@ -468,7 +468,7 @@ def test_zero_inflated():
     counts_df.iloc[idx, :] = 0
 
     dds = DeseqDataSet(counts=counts_df, metadata=metadata)
-    with pytest.warns(RuntimeWarning), pytest.warns(UserWarning):
+    with pytest.warns(UserWarning):
         # Will warn that each gene contains a zero
         # and that the parametric curve is not a good fit
         dds.deseq2()

From c157066fa799afd1b9bb5c8b5826a35972dc81b9 Mon Sep 17 00:00:00 2001
From: Boris MUZELLEC <boris.muzellec@owkin.com>
Date: Wed, 14 Feb 2024 10:59:54 +0100
Subject: [PATCH 12/15] docs: remove statsmodels dependency

---
 docs/source/conf.py                | 5 -----
 docs/source/usage/requirements.rst | 1 -
 2 files changed, 6 deletions(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 282f7230..d74ce929 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -7,8 +7,6 @@
 
 # -- Path setup --------------------------------------------------------------
 
-import warnings
-
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
@@ -17,12 +15,9 @@
 from pathlib import Path
 
 import git
-from statsmodels.tools.sm_exceptions import DomainWarning
 
 import pydeseq2
 
-# Ignore DomainWarning raised by statsmodels when fitting a Gamma GLM with identity link.
-warnings.simplefilter("ignore", DomainWarning)
 
 # -- Project information -----------------------------------------------------
 
diff --git a/docs/source/usage/requirements.rst b/docs/source/usage/requirements.rst
index e24e72dd..e091b284 100644
--- a/docs/source/usage/requirements.rst
+++ b/docs/source/usage/requirements.rst
@@ -8,6 +8,5 @@ For reference, the code was tested with python 3.8 and the following package ver
     - pandas==1.4.3
     - scikit-learn==1.1.1
     - scipy==1.8.1
-    - statsmodels==0.13.2
 
 Please don't hesitate to open an issue in case you encounter any problems due to possible deprecations.
\ No newline at end of file

From f5a735cdbd5afa2f2b572c535244f772c7de9e01 Mon Sep 17 00:00:00 2001
From: Boris MUZELLEC <boris.muzellec@owkin.com>
Date: Wed, 14 Feb 2024 12:02:02 +0100
Subject: [PATCH 13/15] fix: stop size factor iterations if no eligible genes
 are left for the fit

---
 pydeseq2/dds.py               | 44 ++++++++++++++++++++++++-----------
 pydeseq2/default_inference.py |  8 +++----
 2 files changed, 35 insertions(+), 17 deletions(-)

diff --git a/pydeseq2/dds.py b/pydeseq2/dds.py
index 1ed249db..6ed1a290 100644
--- a/pydeseq2/dds.py
+++ b/pydeseq2/dds.py
@@ -625,12 +625,10 @@ def fit_dispersion_trend(self) -> None:
             self.uns["trend_coeffs"] = pd.Series(coeffs, index=["a0", "a1"])
 
             self.varm["fitted_dispersions"] = np.full(self.n_vars, np.NaN)
-            self.uns["disp_function"] = lambda x: dispersion_trend(
-                x, self.uns["trend_coeffs"]
+            self.uns["disp_function_type"] = "parametric"
+            self.varm["fitted_dispersions"][self.varm["non_zero"]] = self.disp_function(
+                self.varm["_normed_means"][self.varm["non_zero"]]
             )
-            self.varm["fitted_dispersions"][self.varm["non_zero"]] = self.uns[
-                "disp_function"
-            ](self.varm["_normed_means"][self.varm["non_zero"]])
 
         except RuntimeError:
             warnings.warn(
@@ -639,21 +637,28 @@ def fit_dispersion_trend(self) -> None:
                 UserWarning,
                 stacklevel=2,
             )
-            mean_disp = trim_mean(
+            self.uns["mean_disp"] = trim_mean(
                 self.varm["genewise_dispersions"][
                     self.varm["genewise_dispersions"] > 10 * self.min_disp
                 ],
                 proportiontocut=0.001,
             )
 
-            self.uns["disp_function"] = lambda x: mean_disp
-            self.varm["fitted_dispersions"] = np.full(self.n_vars, mean_disp)
+            self.uns["disp_function_type"] = "mean"
+            self.varm["fitted_dispersions"] = np.full(self.n_vars, self.uns["mean_disp"])
 
         end = time.time()
 
         if not self.quiet:
             print(f"... done in {end - start:.2f} seconds.\n", file=sys.stderr)
 
+    def disp_function(self, x):
+        """Return the dispersion trend function at x."""
+        if self.uns["disp_function_type"] == "parametric":
+            return dispersion_trend(x, self.uns["trend_coeffs"])
+        elif self.uns["disp_function_type"] == "mean":
+            return self.uns["mean_disp"]
+
     def fit_dispersion_prior(self) -> None:
         """Fit dispersion variance priors and standard deviation of log-residuals.
 
@@ -1021,10 +1026,14 @@ def _refit_without_outliers(
 
         # Compute trend dispersions.
         # Note: the trend curve is not refitted.
-        sub_dds.uns["disp_function"] = self.uns["disp_function"]
+        sub_dds.uns["disp_function_type"] = self.uns["disp_function_type"]
+        if sub_dds.uns["disp_function_type"] == "parametric":
+            sub_dds.uns["trend_coeffs"] = self.uns["trend_coeffs"]
+        elif sub_dds.uns["disp_function_type"] == "mean":
+            sub_dds.uns["mean_disp"] = self.uns["mean_disp"]
         sub_dds.varm["_normed_means"] = sub_dds.layers["normed_counts"].mean(0)
-        sub_dds.varm["fitted_dispersions"] = self.uns["disp_function"](
-            sub_dds.varm["_normed_means"],
+        sub_dds.varm["fitted_dispersions"] = sub_dds.disp_function(
+            sub_dds.varm["_normed_means"]
         )
 
         # Estimate MAP dispersions.
@@ -1108,9 +1117,18 @@ def objective(p):
                 & self.varm["non_zero"]
             ]
 
-            mean_disp = trimmed_mean(
-                self[:, use_for_mean_genes].varm["genewise_dispersions"], trim=0.001
+            if len(use_for_mean_genes) == 0:
+                print(
+                    "No genes have a dispersion above 10 * min_disp in "
+                    "_fit_iterate_size_factors."
+                )
+                break
+
+            mean_disp = trim_mean(
+                self[:, use_for_mean_genes].varm["genewise_dispersions"],
+                proportiontocut=0.001,
             )
+
             self.varm["fitted_dispersions"] = np.ones(self.n_vars) * mean_disp
             self.fit_dispersion_prior()
             self.fit_MAP_dispersions()
diff --git a/pydeseq2/default_inference.py b/pydeseq2/default_inference.py
index 975375cd..ed3c18da 100644
--- a/pydeseq2/default_inference.py
+++ b/pydeseq2/default_inference.py
@@ -209,13 +209,13 @@ def dispersion_trend_gamma_glm(  # noqa: D102
 
         def loss(coeffs):
             mu = covariates_fit @ coeffs
-            return (targets_fit / mu + np.log(mu)).mean()
+            return np.nanmean(targets_fit / mu + np.log(mu), axis=0)
 
         def grad(coeffs):
             mu = covariates_fit @ coeffs
-            return -(
-                ((targets_fit / mu - 1)[:, None] * covariates_fit) / mu[:, None]
-            ).mean(0)
+            return -np.nanmean(
+                ((targets_fit / mu - 1)[:, None] * covariates_fit) / mu[:, None], axis=0
+            )
 
         res = minimize(
             loss,

From cf2b47dd026a44d9b7b27ebf65f359993516ef34 Mon Sep 17 00:00:00 2001
From: Boris MUZELLEC <boris.muzellec@owkin.com>
Date: Thu, 15 Feb 2024 17:55:40 +0100
Subject: [PATCH 14/15] docs: improve lowess parameter description

---
 pydeseq2/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pydeseq2/utils.py b/pydeseq2/utils.py
index 648ed927..1e012b00 100644
--- a/pydeseq2/utils.py
+++ b/pydeseq2/utils.py
@@ -1547,9 +1547,9 @@ def lowess(
     Parameters
     ----------
     features : ndarray
-        Data points.
+        A 1D array of data points.
     targets : ndarray
-        Target values.
+        A 1D array of target values (with the same shape as features).
     frac : float
         The fraction of the data used when estimating each y-value. (default: ``2/3``).
     iter : int

From 21a5f8522d29f04d2893a83ffee6ad0dc54e404b Mon Sep 17 00:00:00 2001
From: Boris MUZELLEC <boris.muzellec@owkin.com>
Date: Mon, 19 Feb 2024 15:11:57 +0100
Subject: [PATCH 15/15] refactor: return convergence status from gamma glm fit
 instead of raising error

---
 pydeseq2/dds.py               | 74 ++++++++++++++++++-----------------
 pydeseq2/default_inference.py |  7 +---
 pydeseq2/inference.py         |  4 +-
 pydeseq2/utils.py             |  2 +-
 4 files changed, 45 insertions(+), 42 deletions(-)

diff --git a/pydeseq2/dds.py b/pydeseq2/dds.py
index 6ed1a290..dbe790a7 100644
--- a/pydeseq2/dds.py
+++ b/pydeseq2/dds.py
@@ -601,51 +601,55 @@ def fit_dispersion_trend(self) -> None:
         # Initialize coefficients
         old_coeffs = pd.Series([0.1, 0.1])
         coeffs = pd.Series([1.0, 1.0])
-        try:
-            while (coeffs > 0).all() and (
-                np.log(np.abs(coeffs / old_coeffs)) ** 2
-            ).sum() >= 1e-6:
-                old_coeffs = coeffs
-                coeffs, predictions = self.inference.dispersion_trend_gamma_glm(
-                    covariates, targets
+        while (coeffs > 0).all() and (
+            np.log(np.abs(coeffs / old_coeffs)) ** 2
+        ).sum() >= 1e-6:
+            old_coeffs = coeffs
+            coeffs, predictions, converged = self.inference.dispersion_trend_gamma_glm(
+                covariates, targets
+            )
+
+            if not converged:
+                warnings.warn(
+                    "The dispersion trend curve fitting did not converge. "
+                    "Switching to a mean-based dispersion trend.",
+                    UserWarning,
+                    stacklevel=2,
                 )
-                # Filter out genes that are too far away from the curve before refitting
-                pred_ratios = (
-                    self[:, covariates.index].varm["genewise_dispersions"] / predictions
+                self.uns["mean_disp"] = trim_mean(
+                    self.varm["genewise_dispersions"][
+                        self.varm["genewise_dispersions"] > 10 * self.min_disp
+                    ],
+                    proportiontocut=0.001,
                 )
 
-                targets.drop(
-                    targets[(pred_ratios < 1e-4) | (pred_ratios >= 15)].index,
-                    inplace=True,
-                )
-                covariates.drop(
-                    covariates[(pred_ratios < 1e-4) | (pred_ratios >= 15)].index,
-                    inplace=True,
+                self.uns["disp_function_type"] = "mean"
+                self.varm["fitted_dispersions"] = np.full(
+                    self.n_vars, self.uns["mean_disp"]
                 )
-            self.uns["trend_coeffs"] = pd.Series(coeffs, index=["a0", "a1"])
 
-            self.varm["fitted_dispersions"] = np.full(self.n_vars, np.NaN)
-            self.uns["disp_function_type"] = "parametric"
-            self.varm["fitted_dispersions"][self.varm["non_zero"]] = self.disp_function(
-                self.varm["_normed_means"][self.varm["non_zero"]]
+                break
+
+            # Filter out genes that are too far away from the curve before refitting
+            pred_ratios = (
+                self[:, covariates.index].varm["genewise_dispersions"] / predictions
             )
 
-        except RuntimeError:
-            warnings.warn(
-                "The dispersion trend curve fitting did not converge. "
-                "Switching to a mean-based dispersion trend.",
-                UserWarning,
-                stacklevel=2,
+            targets.drop(
+                targets[(pred_ratios < 1e-4) | (pred_ratios >= 15)].index,
+                inplace=True,
             )
-            self.uns["mean_disp"] = trim_mean(
-                self.varm["genewise_dispersions"][
-                    self.varm["genewise_dispersions"] > 10 * self.min_disp
-                ],
-                proportiontocut=0.001,
+            covariates.drop(
+                covariates[(pred_ratios < 1e-4) | (pred_ratios >= 15)].index,
+                inplace=True,
             )
+        self.uns["trend_coeffs"] = pd.Series(coeffs, index=["a0", "a1"])
 
-            self.uns["disp_function_type"] = "mean"
-            self.varm["fitted_dispersions"] = np.full(self.n_vars, self.uns["mean_disp"])
+        self.varm["fitted_dispersions"] = np.full(self.n_vars, np.NaN)
+        self.uns["disp_function_type"] = "parametric"
+        self.varm["fitted_dispersions"][self.varm["non_zero"]] = self.disp_function(
+            self.varm["_normed_means"][self.varm["non_zero"]]
+        )
 
         end = time.time()
 
diff --git a/pydeseq2/default_inference.py b/pydeseq2/default_inference.py
index ed3c18da..923b3b55 100644
--- a/pydeseq2/default_inference.py
+++ b/pydeseq2/default_inference.py
@@ -201,7 +201,7 @@ def wald_test(  # noqa: D102
 
     def dispersion_trend_gamma_glm(  # noqa: D102
         self, covariates: pd.Series, targets: pd.Series
-    ) -> Tuple[np.ndarray, np.ndarray]:
+    ) -> Tuple[np.ndarray, np.ndarray, bool]:
         covariates_w_intercept = covariates.to_frame()
         covariates_w_intercept.insert(0, "intercept", 1)
         covariates_fit = covariates_w_intercept.values
@@ -225,11 +225,8 @@ def grad(coeffs):
             bounds=[(0, np.inf)],
         )
 
-        if not res.success:
-            raise RuntimeError("Gamma GLM optimization failed.")
-
         coeffs = res.x
-        return coeffs, covariates_fit @ coeffs
+        return coeffs, covariates_fit @ coeffs, res.success
 
     def lfc_shrink_nbinom_glm(  # noqa: D102
         self,
diff --git a/pydeseq2/inference.py b/pydeseq2/inference.py
index e4e50a66..801a86c4 100644
--- a/pydeseq2/inference.py
+++ b/pydeseq2/inference.py
@@ -285,7 +285,7 @@ def fit_moments_dispersions(
     @abstractmethod
     def dispersion_trend_gamma_glm(
         self, covariates: pd.Series, targets: pd.Series
-    ) -> Tuple[np.ndarray, np.ndarray]:
+    ) -> Tuple[np.ndarray, np.ndarray, bool]:
         """Fit a gamma glm on gene dispersions.
 
         The intercept should be concatenated in this method
@@ -304,6 +304,8 @@ def dispersion_trend_gamma_glm(
             Coefficients of the regression.
         predictions : ndarray
             Predictions of the regression.
+        converged : bool
+            Whether the optimization converged.
         """
 
     @abstractmethod
diff --git a/pydeseq2/utils.py b/pydeseq2/utils.py
index 1e012b00..3e67ddae 100644
--- a/pydeseq2/utils.py
+++ b/pydeseq2/utils.py
@@ -1540,7 +1540,7 @@ def lowess(
     The arrays features and targets contain an equal number of elements; each pair
     (features[i], targets[i]) defines a data point in the scatterplot. The function
     returns the estimated (smooth) values of targets.
-    The smoothing span is given by f. A larger value for f will result in a
+    The smoothing span is given by frac. A larger value for frac will result in a
     smoother curve. The number of robustifying iterations is given by iter. The
     function will run faster with a smaller number of iterations.