From 522de043dad706037ec2480ff33001910ac66c6e Mon Sep 17 00:00:00 2001 From: Boris MUZELLEC Date: Tue, 13 Feb 2024 16:08:51 +0100 Subject: [PATCH 01/15] refactor: remove statsmodels with custom gamma GLM implementation for parametric trend curve in default inference model --- pydeseq2/default_inference.py | 42 ++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/pydeseq2/default_inference.py b/pydeseq2/default_inference.py index 29ebb65c..5fff7462 100644 --- a/pydeseq2/default_inference.py +++ b/pydeseq2/default_inference.py @@ -1,22 +1,17 @@ -import warnings from typing import Literal from typing import Optional from typing import Tuple import numpy as np import pandas as pd -import statsmodels.api as sm # type: ignore from joblib import Parallel # type: ignore from joblib import delayed from joblib import parallel_backend -from statsmodels.tools.sm_exceptions import DomainWarning # type: ignore +from scipy.optimize import minimize # type: ignore from pydeseq2 import inference from pydeseq2 import utils -# Ignore DomainWarning raised by statsmodels when fitting a Gamma GLM with identity link. -warnings.simplefilter("ignore", DomainWarning) - class DefaultInference(inference.Inference): """Default DESeq2-related inference methods, using scipy/sklearn/numpy. @@ -207,17 +202,34 @@ def wald_test( # noqa: D102 def dispersion_trend_gamma_glm( # noqa: D102 self, covariates: pd.Series, targets: pd.Series ) -> Tuple[np.ndarray, np.ndarray]: - covariates_w_intercept = sm.add_constant(covariates) - targets_fit = targets.values + covariates_w_intercept = covariates.to_frame() + covariates_w_intercept["intercept"] = 1 covariates_fit = covariates_w_intercept.values - glm_gamma = sm.GLM( - targets_fit, - covariates_fit, - family=sm.families.Gamma(link=sm.families.links.identity()), + targets_fit = targets.values + + def loss(coeffs): + mu = covariates_fit @ coeffs + return (targets_fit / mu + np.log(mu)).mean() + + def grad(coeffs): + mu = covariates_fit @ coeffs + return -( + ((targets_fit / mu - 1)[:, None] * covariates_fit) / mu[:, None] + ).mean(0) + + res = minimize( + loss, + x0=np.array([1.0, 1.0]), + jac=grad, + method="L-BFGS-B", + bounds=[(0, np.inf)], ) - res = glm_gamma.fit() - coeffs = res.params - return (coeffs, covariates_fit @ coeffs) + + if not res.success: + raise ValueError("Gamma GLM optimization failed.") + + coeffs = res.x + return coeffs, covariates_fit @ coeffs def lfc_shrink_nbinom_glm( # noqa: D102 self, From 51d422e30d24cc246763739f3b39daebca9c9378 Mon Sep 17 00:00:00 2001 From: Boris MUZELLEC Date: Tue, 13 Feb 2024 17:02:48 +0100 Subject: [PATCH 02/15] fix: use consistent ordering for trend curve coefficients --- pydeseq2/default_inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pydeseq2/default_inference.py b/pydeseq2/default_inference.py index 5fff7462..8a3985be 100644 --- a/pydeseq2/default_inference.py +++ b/pydeseq2/default_inference.py @@ -203,7 +203,7 @@ def dispersion_trend_gamma_glm( # noqa: D102 self, covariates: pd.Series, targets: pd.Series ) -> Tuple[np.ndarray, np.ndarray]: covariates_w_intercept = covariates.to_frame() - covariates_w_intercept["intercept"] = 1 + covariates_w_intercept.insert(0, "intercept", 1) covariates_fit = covariates_w_intercept.values targets_fit = targets.values From a72bec936f5422e4323fe5ed2c1c061e87fb4aa0 Mon Sep 17 00:00:00 2001 From: Boris MUZELLEC Date: Tue, 13 Feb 2024 17:10:15 +0100 Subject: [PATCH 03/15] feat: catch gamma GLM failure and switch to mean fit --- pydeseq2/dds.py | 69 ++++++++++++++++++++++------------- pydeseq2/default_inference.py | 2 +- 2 files changed, 44 insertions(+), 27 deletions(-) diff --git a/pydeseq2/dds.py b/pydeseq2/dds.py index 665ef5b9..fed7e145 100644 --- a/pydeseq2/dds.py +++ b/pydeseq2/dds.py @@ -343,7 +343,7 @@ def vst_fit( if use_design: # Check that the dispersion trend curve was fitted. If not, fit it. # This will call previous functions in a cascade. - if "trend_coeffs" not in self.uns: + if "disp_function" not in self.uns: self.fit_dispersion_trend() else: # Reduce the design matrix to an intercept and reconstruct at the end @@ -601,39 +601,57 @@ def fit_dispersion_trend(self) -> None: # Initialize coefficients old_coeffs = pd.Series([0.1, 0.1]) coeffs = pd.Series([1.0, 1.0]) + try: + while (np.log(np.abs(coeffs / old_coeffs)) ** 2).sum() >= 1e-6: + old_coeffs = coeffs + coeffs, predictions = self.inference.dispersion_trend_gamma_glm( + covariates, targets + ) + # Filter out genes that are too far away from the curve before refitting + pred_ratios = ( + self[:, covariates.index].varm["genewise_dispersions"] / predictions + ) - while (np.log(np.abs(coeffs / old_coeffs)) ** 2).sum() >= 1e-6: - old_coeffs = coeffs - coeffs, predictions = self.inference.dispersion_trend_gamma_glm( - covariates, targets - ) - # Filter out genes that are too far away from the curve before refitting - pred_ratios = ( - self[:, covariates.index].varm["genewise_dispersions"] / predictions + targets.drop( + targets[(pred_ratios < 1e-4) | (pred_ratios >= 15)].index, + inplace=True, + ) + covariates.drop( + covariates[(pred_ratios < 1e-4) | (pred_ratios >= 15)].index, + inplace=True, + ) + self.uns["trend_coeffs"] = pd.Series(coeffs, index=["a0", "a1"]) + + self.varm["fitted_dispersions"] = np.full(self.n_vars, np.NaN) + self.uns["disp_function"] = lambda x: dispersion_trend( + x, self.uns["trend_coeffs"] ) + self.varm["fitted_dispersions"][self.varm["non_zero"]] = self.uns[ + "disp_function" + ](self.varm["_normed_means"][self.varm["non_zero"]]) - targets.drop( - targets[(pred_ratios < 1e-4) | (pred_ratios >= 15)].index, - inplace=True, + except RuntimeError: + warnings.warn( + "The dispersion trend curve fitting did not converge. " + "Switching to a mean-based dispersion trend.", + UserWarning, + stacklevel=2, ) - covariates.drop( - covariates[(pred_ratios < 1e-4) | (pred_ratios >= 15)].index, - inplace=True, + mean_disp = trim_mean( + self.varm["genewise_dispersions"][ + self.varm["genewise_dispersions"] > 10 * self.min_disp + ], + proportiontocut=0.001, ) + self.uns["disp_function"] = lambda x: mean_disp + self.varm["fitted_dispersions"] = np.full(self.n_vars, mean_disp) + end = time.time() if not self.quiet: print(f"... done in {end - start:.2f} seconds.\n", file=sys.stderr) - self.uns["trend_coeffs"] = pd.Series(coeffs, index=["a0", "a1"]) - - self.varm["fitted_dispersions"] = np.full(self.n_vars, np.NaN) - self.varm["fitted_dispersions"][self.varm["non_zero"]] = dispersion_trend( - self.varm["_normed_means"][self.varm["non_zero"]], - self.uns["trend_coeffs"], - ) - def fit_dispersion_prior(self) -> None: """Fit dispersion variance priors and standard deviation of log-residuals. @@ -1001,11 +1019,10 @@ def _refit_without_outliers( # Compute trend dispersions. # Note: the trend curve is not refitted. - sub_dds.uns["trend_coeffs"] = self.uns["trend_coeffs"] + sub_dds.uns["disp_function"] = self.uns["disp_function"] sub_dds.varm["_normed_means"] = sub_dds.layers["normed_counts"].mean(0) - sub_dds.varm["fitted_dispersions"] = dispersion_trend( + sub_dds.varm["fitted_dispersions"] = self.uns["disp_function"]( sub_dds.varm["_normed_means"], - sub_dds.uns["trend_coeffs"], ) # Estimate MAP dispersions. diff --git a/pydeseq2/default_inference.py b/pydeseq2/default_inference.py index 8a3985be..975375cd 100644 --- a/pydeseq2/default_inference.py +++ b/pydeseq2/default_inference.py @@ -226,7 +226,7 @@ def grad(coeffs): ) if not res.success: - raise ValueError("Gamma GLM optimization failed.") + raise RuntimeError("Gamma GLM optimization failed.") coeffs = res.x return coeffs, covariates_fit @ coeffs From ff59e411a02c437ee052f0b929393965189d5d90 Mon Sep 17 00:00:00 2001 From: Boris MUZELLEC Date: Tue, 13 Feb 2024 17:12:20 +0100 Subject: [PATCH 04/15] build: remove statsmodels dependency --- README.md | 1 - pyproject.toml | 4 ---- setup.py | 1 - 3 files changed, 6 deletions(-) diff --git a/README.md b/README.md index 9eadcbdf..896f24b6 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,6 @@ For reference, the code is being tested in a github workflow (CI) with python - pandas 1.4.3 - scikit-learn 1.1.1 - scipy 1.8.1 -- statsmodels 0.13.2 ``` Please don't hesitate to open an issue in case you encounter any issue due to possible deprecations. diff --git a/pyproject.toml b/pyproject.toml index 267a8c30..79205547 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,9 +47,5 @@ convention = "numpy" [tool.pytest.ini_options] filterwarnings = [ "error", - "ignore::DeprecationWarning:patsy[.*]", # due to patsy being a dependency of statsmodels and - # calling pandas.core.dtypes.common.is_categorical_dtype. - "ignore::DeprecationWarning:statsmodels", # ignore Pyarrow deprecation warnings - "ignore::statsmodels.tools.sm_exceptions.DomainWarning", # due to fitting the dipersion curve with the identity link "ignore::FutureWarning", # Ignore AnnData FutureWarning about implicit conversion ] diff --git a/setup.py b/setup.py index 5bb3269c..69875617 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,6 @@ "pandas>=1.4.0", "scikit-learn>=1.1.0", "scipy>=1.8.0", - "statsmodels", "matplotlib>=3.6.2", # not sure why sphinx_gallery does not work without it ], # external packages as dependencies extras_require={ From 6edefb34df3030f996fc65f3bb0e44599fbc032b Mon Sep 17 00:00:00 2001 From: Boris MUZELLEC Date: Tue, 13 Feb 2024 17:18:54 +0100 Subject: [PATCH 05/15] CI: ignore deprecation warning raised by pandas regarding Pyarrow. --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 79205547..2606c790 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,5 +47,6 @@ convention = "numpy" [tool.pytest.ini_options] filterwarnings = [ "error", + "ignore::DeprecationWarning:pandas", # ignore Pyarrow deprecation warnings "ignore::FutureWarning", # Ignore AnnData FutureWarning about implicit conversion ] From baf3e457b6d095c413ab8f0c0bff51cd9e517799 Mon Sep 17 00:00:00 2001 From: Boris MUZELLEC Date: Tue, 13 Feb 2024 17:28:12 +0100 Subject: [PATCH 06/15] CI: ignore deprecation warning raised by pandas regarding Pyarrow. --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 2606c790..e30e39a8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,6 +47,7 @@ convention = "numpy" [tool.pytest.ini_options] filterwarnings = [ "error", - "ignore::DeprecationWarning:pandas", # ignore Pyarrow deprecation warnings + '''ignore:\s*Pyarrow will become a required dependency of pandas:DeprecationWarning''', + # ignore Pyarrow deprecation warnings "ignore::FutureWarning", # Ignore AnnData FutureWarning about implicit conversion ] From e1814332f561e33382da1587be2ab08b896e72fd Mon Sep 17 00:00:00 2001 From: Boris MUZELLEC Date: Wed, 14 Feb 2024 10:12:28 +0100 Subject: [PATCH 07/15] fix: switch to mean fit if all coeffs are 0 --- pydeseq2/dds.py | 4 ++- pydeseq2/ds.py | 21 +++++------- pydeseq2/utils.py | 86 +++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 91 insertions(+), 20 deletions(-) diff --git a/pydeseq2/dds.py b/pydeseq2/dds.py index fed7e145..fb2b0e3b 100644 --- a/pydeseq2/dds.py +++ b/pydeseq2/dds.py @@ -602,7 +602,9 @@ def fit_dispersion_trend(self) -> None: old_coeffs = pd.Series([0.1, 0.1]) coeffs = pd.Series([1.0, 1.0]) try: - while (np.log(np.abs(coeffs / old_coeffs)) ** 2).sum() >= 1e-6: + while (coeffs > 0).all() and ( + np.log(np.abs(coeffs / old_coeffs)) ** 2 + ).sum() >= 1e-6: old_coeffs = coeffs coeffs, predictions = self.inference.dispersion_trend_gamma_glm( covariates, targets diff --git a/pydeseq2/ds.py b/pydeseq2/ds.py index 9693c318..a86a9844 100644 --- a/pydeseq2/ds.py +++ b/pydeseq2/ds.py @@ -7,14 +7,14 @@ # import anndata as ad import numpy as np import pandas as pd -import statsmodels.api as sm # type: ignore from scipy.optimize import root_scalar # type: ignore from scipy.stats import f # type: ignore -from statsmodels.stats.multitest import multipletests # type: ignore +from scipy.stats import false_discovery_control # type: ignore from pydeseq2.dds import DeseqDataSet from pydeseq2.default_inference import DefaultInference from pydeseq2.inference import Inference +from pydeseq2.utils import lowess from pydeseq2.utils import make_MA_plot from pydeseq2.utils import n_or_more_replicates @@ -526,18 +526,15 @@ def _independent_filtering(self) -> None: use = (self.base_mean >= cutoff) & (~self.p_values.isna()) U2 = self.p_values[use] if not U2.empty: - result.loc[use, i] = multipletests( - U2, alpha=self.alpha, method="fdr_bh" - )[1] - - num_rej = (result < self.alpha).sum(0) - lowess = sm.nonparametric.lowess(num_rej, theta, frac=1 / 5) + result.loc[use, i] = false_discovery_control(U2, method="bh") + num_rej = (result < self.alpha).sum(0).values + lowess_res = lowess(num_rej, theta, frac=1 / 5) if num_rej.max() <= 10: j = 0 else: - residual = num_rej[num_rej > 0] - lowess[num_rej > 0, 1] - thresh = lowess[:, 1].max() - np.sqrt(np.mean(residual**2)) + residual = num_rej[num_rej > 0] - lowess_res[num_rej > 0, 1] + thresh = lowess_res[:, 1].max() - np.sqrt(np.mean(residual**2)) if np.any(num_rej > thresh): j = np.where(num_rej > thresh)[0][0] @@ -557,8 +554,8 @@ def _p_value_adjustment(self) -> None: self.run_wald_test() self.padj = pd.Series(np.nan, index=self.dds.var_names) - self.padj.loc[~self.p_values.isna()] = multipletests( - self.p_values.dropna(), alpha=self.alpha, method="fdr_bh" + self.padj.loc[~self.p_values.isna()] = false_discovery_control( + self.p_values.dropna(), method="bh" )[1] def _cooks_filtering(self) -> None: diff --git a/pydeseq2/utils.py b/pydeseq2/utils.py index 0f04e0f5..648ed927 100644 --- a/pydeseq2/utils.py +++ b/pydeseq2/utils.py @@ -1,5 +1,6 @@ import multiprocessing import warnings +from math import ceil from math import floor from pathlib import Path from typing import List @@ -49,7 +50,8 @@ def load_example_data( debug : bool If true, subsample 10 samples and 100 genes at random. - (Note that the "synthetic" dataset is already 10 x 100.) (default: ``False``). + (Note that the "synthetic" dataset is already 10 features 100.) + (default: ``False``). debug_seed : int Seed for the debug mode. (default: ``42``). @@ -731,7 +733,7 @@ def trimmed_mean(x, trim: float = 0.1, **kwargs) -> Union[float, np.ndarray]: Parameters ---------- - x : ndarray + features : ndarray Data whose mean to compute. trim : float @@ -815,7 +817,7 @@ def trimmed_variance( Parameters ---------- - x : ndarray + features : ndarray Data whose trimmed variance to compute. trim : float @@ -1368,7 +1370,7 @@ def mean_absolute_deviation(x: np.ndarray) -> float: Parameters ---------- - x : ndarray + features : ndarray 1D array whose MAD to compute. Returns @@ -1399,13 +1401,13 @@ def make_scatter( List of ndarrays to plot. legend_labels : list - List of strings that correspond to plotted y values for legend. + List of strings that correspond to plotted targets values for legend. x_val : ndarray 1D array to plot (example: ``dds.varm['_normed_means']``). log : bool - Whether or not to log scale x and y axes (``default=True``). + Whether or not to log scale features and targets axes (``default=True``). save_path : str or None The path where to save the plot. If left None, the plot won't be saved @@ -1475,7 +1477,7 @@ def make_MA_plot( P-value threshold to subset scatterplot colors on. log : bool - Whether or not to log scale x and y axes (``default=True``). + Whether or not to log scale features and targets axes (``default=True``). save_path : str or None The path where to save the plot. If left None, the plot won't be saved @@ -1520,3 +1522,73 @@ def make_MA_plot( if save_path is not None: plt.savefig(save_path, bbox_inches="tight") + + +# Authors: Alexandre Gramfort +# +# License: BSD (3-clause) + +# Adapted from https://gist.github.com/agramfort/850437 + + +def lowess( + features: np.ndarray, targets: np.ndarray, frac: float = 2.0 / 3.0, iter: int = 3 +): + """Run lowess smoothing: Robust locally weighted regression. + + The lowess function fits a nonparametric regression curve to a scatterplot. + The arrays features and targets contain an equal number of elements; each pair + (features[i], targets[i]) defines a data point in the scatterplot. The function + returns the estimated (smooth) values of targets. + The smoothing span is given by f. A larger value for f will result in a + smoother curve. The number of robustifying iterations is given by iter. The + function will run faster with a smaller number of iterations. + + Parameters + ---------- + features : ndarray + Data points. + targets : ndarray + Target values. + frac : float + The fraction of the data used when estimating each y-value. (default: ``2/3``). + iter : int + The number of robustifying iterations. (default: ``3``). + + Returns + ------- + ndarray + Estimated (smooth) values of targets. + """ + n = len(features) + r = int(ceil(frac * n)) + h = np.maximum( + np.array([np.sort(np.abs(features - features[i]))[r] for i in range(n)]), 1e-12 + ) + w = np.clip( + np.abs(np.nan_to_num((features[:, None] - features[None, :]) / h)), 0.0, 1.0 + ) + w = (1 - w**3) ** 3 + yest = np.zeros(n) + delta = np.ones(n) + for _ in range(iter): + for i in range(n): + weights = delta * w[:, i] + b = np.array( + [np.sum(weights * targets), np.sum(weights * targets * features)] + ) + A = np.array( + [ + [np.sum(weights), np.sum(weights * features)], + [np.sum(weights * features), np.sum(weights * features * features)], + ] + ) + beta = np.linalg.lstsq(A, b)[0] + yest[i] = beta[0] + beta[1] * features[i] + + residuals = targets - yest + s = np.median(np.abs(residuals)) + delta = np.clip(residuals / (6.0 * s), -1, 1) + delta = (1 - delta**2) ** 2 + + return yest From f3d74b5fac496bda1269e10515f7577c08741562 Mon Sep 17 00:00:00 2001 From: Boris MUZELLEC Date: Wed, 14 Feb 2024 10:29:35 +0100 Subject: [PATCH 08/15] test: catch both warnings in test_zero_inflated --- tests/test_edge_cases.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_edge_cases.py b/tests/test_edge_cases.py index 5a2b8a71..e0fa7a21 100644 --- a/tests/test_edge_cases.py +++ b/tests/test_edge_cases.py @@ -468,7 +468,9 @@ def test_zero_inflated(): counts_df.iloc[idx, :] = 0 dds = DeseqDataSet(counts=counts_df, metadata=metadata) - with pytest.warns(RuntimeWarning): + with pytest.warns(RuntimeWarning), pytest.warns(UserWarning): + # Will warn that each gene contains a zero + # and that the parametric curve is not a good fit dds.deseq2() From e66321b01a37deacfd10b70155f31df700d9af35 Mon Sep 17 00:00:00 2001 From: Boris MUZELLEC Date: Wed, 14 Feb 2024 10:42:13 +0100 Subject: [PATCH 09/15] build: bump scipy version requirement to get access to false_discovery_control --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 69875617..de697701 100644 --- a/setup.py +++ b/setup.py @@ -31,7 +31,7 @@ "numpy>=1.23.0", "pandas>=1.4.0", "scikit-learn>=1.1.0", - "scipy>=1.8.0", + "scipy>=1.11.0", "matplotlib>=3.6.2", # not sure why sphinx_gallery does not work without it ], # external packages as dependencies extras_require={ From 3306d1b09c7a89ea3728c8b7ad8a988fc5690bd6 Mon Sep 17 00:00:00 2001 From: Boris MUZELLEC Date: Wed, 14 Feb 2024 10:46:30 +0100 Subject: [PATCH 10/15] build: BREAKING CHANGE deprecate python 3.8 --- .github/workflows/pr_validation.yml | 2 -- setup.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/pr_validation.yml b/.github/workflows/pr_validation.yml index 4c14dfcb..4c55eebc 100644 --- a/.github/workflows/pr_validation.yml +++ b/.github/workflows/pr_validation.yml @@ -17,8 +17,6 @@ jobs: strategy: matrix: include: - - os: ubuntu-latest - python: "3.8" - os: ubuntu-latest python: "3.9" - os: ubuntu-latest diff --git a/setup.py b/setup.py index de697701..ee560ea2 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ setup( name="pydeseq2", version=about["__version__"], - python_requires=">=3.8.0", + python_requires=">=3.9.0", license="MIT", description="A python implementation of DESeq2.", long_description=readme, From fdddb4cdf2b0434659b304d13b13cb7959e2ea6c Mon Sep 17 00:00:00 2001 From: Boris MUZELLEC Date: Wed, 14 Feb 2024 10:51:46 +0100 Subject: [PATCH 11/15] refactor: switch from RuntimeWarning to UserWarning --- pydeseq2/dds.py | 2 +- tests/test_edge_cases.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pydeseq2/dds.py b/pydeseq2/dds.py index fb2b0e3b..1ed249db 100644 --- a/pydeseq2/dds.py +++ b/pydeseq2/dds.py @@ -477,7 +477,7 @@ def fit_size_factors( warnings.warn( "Every gene contains at least one zero, " "cannot compute log geometric means. Switching to iterative mode.", - RuntimeWarning, + UserWarning, stacklevel=2, ) self._fit_iterate_size_factors() diff --git a/tests/test_edge_cases.py b/tests/test_edge_cases.py index e0fa7a21..2746e7b3 100644 --- a/tests/test_edge_cases.py +++ b/tests/test_edge_cases.py @@ -468,7 +468,7 @@ def test_zero_inflated(): counts_df.iloc[idx, :] = 0 dds = DeseqDataSet(counts=counts_df, metadata=metadata) - with pytest.warns(RuntimeWarning), pytest.warns(UserWarning): + with pytest.warns(UserWarning): # Will warn that each gene contains a zero # and that the parametric curve is not a good fit dds.deseq2() From c157066fa799afd1b9bb5c8b5826a35972dc81b9 Mon Sep 17 00:00:00 2001 From: Boris MUZELLEC Date: Wed, 14 Feb 2024 10:59:54 +0100 Subject: [PATCH 12/15] docs: remove statsmodels dependency --- docs/source/conf.py | 5 ----- docs/source/usage/requirements.rst | 1 - 2 files changed, 6 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 282f7230..d74ce929 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -7,8 +7,6 @@ # -- Path setup -------------------------------------------------------------- -import warnings - # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. @@ -17,12 +15,9 @@ from pathlib import Path import git -from statsmodels.tools.sm_exceptions import DomainWarning import pydeseq2 -# Ignore DomainWarning raised by statsmodels when fitting a Gamma GLM with identity link. -warnings.simplefilter("ignore", DomainWarning) # -- Project information ----------------------------------------------------- diff --git a/docs/source/usage/requirements.rst b/docs/source/usage/requirements.rst index e24e72dd..e091b284 100644 --- a/docs/source/usage/requirements.rst +++ b/docs/source/usage/requirements.rst @@ -8,6 +8,5 @@ For reference, the code was tested with python 3.8 and the following package ver - pandas==1.4.3 - scikit-learn==1.1.1 - scipy==1.8.1 - - statsmodels==0.13.2 Please don't hesitate to open an issue in case you encounter any problems due to possible deprecations. \ No newline at end of file From f5a735cdbd5afa2f2b572c535244f772c7de9e01 Mon Sep 17 00:00:00 2001 From: Boris MUZELLEC Date: Wed, 14 Feb 2024 12:02:02 +0100 Subject: [PATCH 13/15] fix: stop size factor iterations if no eligible genes are left for the fit --- pydeseq2/dds.py | 44 ++++++++++++++++++++++++----------- pydeseq2/default_inference.py | 8 +++---- 2 files changed, 35 insertions(+), 17 deletions(-) diff --git a/pydeseq2/dds.py b/pydeseq2/dds.py index 1ed249db..6ed1a290 100644 --- a/pydeseq2/dds.py +++ b/pydeseq2/dds.py @@ -625,12 +625,10 @@ def fit_dispersion_trend(self) -> None: self.uns["trend_coeffs"] = pd.Series(coeffs, index=["a0", "a1"]) self.varm["fitted_dispersions"] = np.full(self.n_vars, np.NaN) - self.uns["disp_function"] = lambda x: dispersion_trend( - x, self.uns["trend_coeffs"] + self.uns["disp_function_type"] = "parametric" + self.varm["fitted_dispersions"][self.varm["non_zero"]] = self.disp_function( + self.varm["_normed_means"][self.varm["non_zero"]] ) - self.varm["fitted_dispersions"][self.varm["non_zero"]] = self.uns[ - "disp_function" - ](self.varm["_normed_means"][self.varm["non_zero"]]) except RuntimeError: warnings.warn( @@ -639,21 +637,28 @@ def fit_dispersion_trend(self) -> None: UserWarning, stacklevel=2, ) - mean_disp = trim_mean( + self.uns["mean_disp"] = trim_mean( self.varm["genewise_dispersions"][ self.varm["genewise_dispersions"] > 10 * self.min_disp ], proportiontocut=0.001, ) - self.uns["disp_function"] = lambda x: mean_disp - self.varm["fitted_dispersions"] = np.full(self.n_vars, mean_disp) + self.uns["disp_function_type"] = "mean" + self.varm["fitted_dispersions"] = np.full(self.n_vars, self.uns["mean_disp"]) end = time.time() if not self.quiet: print(f"... done in {end - start:.2f} seconds.\n", file=sys.stderr) + def disp_function(self, x): + """Return the dispersion trend function at x.""" + if self.uns["disp_function_type"] == "parametric": + return dispersion_trend(x, self.uns["trend_coeffs"]) + elif self.uns["disp_function_type"] == "mean": + return self.uns["mean_disp"] + def fit_dispersion_prior(self) -> None: """Fit dispersion variance priors and standard deviation of log-residuals. @@ -1021,10 +1026,14 @@ def _refit_without_outliers( # Compute trend dispersions. # Note: the trend curve is not refitted. - sub_dds.uns["disp_function"] = self.uns["disp_function"] + sub_dds.uns["disp_function_type"] = self.uns["disp_function_type"] + if sub_dds.uns["disp_function_type"] == "parametric": + sub_dds.uns["trend_coeffs"] = self.uns["trend_coeffs"] + elif sub_dds.uns["disp_function_type"] == "mean": + sub_dds.uns["mean_disp"] = self.uns["mean_disp"] sub_dds.varm["_normed_means"] = sub_dds.layers["normed_counts"].mean(0) - sub_dds.varm["fitted_dispersions"] = self.uns["disp_function"]( - sub_dds.varm["_normed_means"], + sub_dds.varm["fitted_dispersions"] = sub_dds.disp_function( + sub_dds.varm["_normed_means"] ) # Estimate MAP dispersions. @@ -1108,9 +1117,18 @@ def objective(p): & self.varm["non_zero"] ] - mean_disp = trimmed_mean( - self[:, use_for_mean_genes].varm["genewise_dispersions"], trim=0.001 + if len(use_for_mean_genes) == 0: + print( + "No genes have a dispersion above 10 * min_disp in " + "_fit_iterate_size_factors." + ) + break + + mean_disp = trim_mean( + self[:, use_for_mean_genes].varm["genewise_dispersions"], + proportiontocut=0.001, ) + self.varm["fitted_dispersions"] = np.ones(self.n_vars) * mean_disp self.fit_dispersion_prior() self.fit_MAP_dispersions() diff --git a/pydeseq2/default_inference.py b/pydeseq2/default_inference.py index 975375cd..ed3c18da 100644 --- a/pydeseq2/default_inference.py +++ b/pydeseq2/default_inference.py @@ -209,13 +209,13 @@ def dispersion_trend_gamma_glm( # noqa: D102 def loss(coeffs): mu = covariates_fit @ coeffs - return (targets_fit / mu + np.log(mu)).mean() + return np.nanmean(targets_fit / mu + np.log(mu), axis=0) def grad(coeffs): mu = covariates_fit @ coeffs - return -( - ((targets_fit / mu - 1)[:, None] * covariates_fit) / mu[:, None] - ).mean(0) + return -np.nanmean( + ((targets_fit / mu - 1)[:, None] * covariates_fit) / mu[:, None], axis=0 + ) res = minimize( loss, From cf2b47dd026a44d9b7b27ebf65f359993516ef34 Mon Sep 17 00:00:00 2001 From: Boris MUZELLEC Date: Thu, 15 Feb 2024 17:55:40 +0100 Subject: [PATCH 14/15] docs: improve lowess parameter description --- pydeseq2/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pydeseq2/utils.py b/pydeseq2/utils.py index 648ed927..1e012b00 100644 --- a/pydeseq2/utils.py +++ b/pydeseq2/utils.py @@ -1547,9 +1547,9 @@ def lowess( Parameters ---------- features : ndarray - Data points. + A 1D array of data points. targets : ndarray - Target values. + A 1D array of target values (with the same shape as features). frac : float The fraction of the data used when estimating each y-value. (default: ``2/3``). iter : int From 21a5f8522d29f04d2893a83ffee6ad0dc54e404b Mon Sep 17 00:00:00 2001 From: Boris MUZELLEC Date: Mon, 19 Feb 2024 15:11:57 +0100 Subject: [PATCH 15/15] refactor: return convergence status from gamma glm fit instead of raising error --- pydeseq2/dds.py | 74 ++++++++++++++++++----------------- pydeseq2/default_inference.py | 7 +--- pydeseq2/inference.py | 4 +- pydeseq2/utils.py | 2 +- 4 files changed, 45 insertions(+), 42 deletions(-) diff --git a/pydeseq2/dds.py b/pydeseq2/dds.py index 6ed1a290..dbe790a7 100644 --- a/pydeseq2/dds.py +++ b/pydeseq2/dds.py @@ -601,51 +601,55 @@ def fit_dispersion_trend(self) -> None: # Initialize coefficients old_coeffs = pd.Series([0.1, 0.1]) coeffs = pd.Series([1.0, 1.0]) - try: - while (coeffs > 0).all() and ( - np.log(np.abs(coeffs / old_coeffs)) ** 2 - ).sum() >= 1e-6: - old_coeffs = coeffs - coeffs, predictions = self.inference.dispersion_trend_gamma_glm( - covariates, targets + while (coeffs > 0).all() and ( + np.log(np.abs(coeffs / old_coeffs)) ** 2 + ).sum() >= 1e-6: + old_coeffs = coeffs + coeffs, predictions, converged = self.inference.dispersion_trend_gamma_glm( + covariates, targets + ) + + if not converged: + warnings.warn( + "The dispersion trend curve fitting did not converge. " + "Switching to a mean-based dispersion trend.", + UserWarning, + stacklevel=2, ) - # Filter out genes that are too far away from the curve before refitting - pred_ratios = ( - self[:, covariates.index].varm["genewise_dispersions"] / predictions + self.uns["mean_disp"] = trim_mean( + self.varm["genewise_dispersions"][ + self.varm["genewise_dispersions"] > 10 * self.min_disp + ], + proportiontocut=0.001, ) - targets.drop( - targets[(pred_ratios < 1e-4) | (pred_ratios >= 15)].index, - inplace=True, - ) - covariates.drop( - covariates[(pred_ratios < 1e-4) | (pred_ratios >= 15)].index, - inplace=True, + self.uns["disp_function_type"] = "mean" + self.varm["fitted_dispersions"] = np.full( + self.n_vars, self.uns["mean_disp"] ) - self.uns["trend_coeffs"] = pd.Series(coeffs, index=["a0", "a1"]) - self.varm["fitted_dispersions"] = np.full(self.n_vars, np.NaN) - self.uns["disp_function_type"] = "parametric" - self.varm["fitted_dispersions"][self.varm["non_zero"]] = self.disp_function( - self.varm["_normed_means"][self.varm["non_zero"]] + break + + # Filter out genes that are too far away from the curve before refitting + pred_ratios = ( + self[:, covariates.index].varm["genewise_dispersions"] / predictions ) - except RuntimeError: - warnings.warn( - "The dispersion trend curve fitting did not converge. " - "Switching to a mean-based dispersion trend.", - UserWarning, - stacklevel=2, + targets.drop( + targets[(pred_ratios < 1e-4) | (pred_ratios >= 15)].index, + inplace=True, ) - self.uns["mean_disp"] = trim_mean( - self.varm["genewise_dispersions"][ - self.varm["genewise_dispersions"] > 10 * self.min_disp - ], - proportiontocut=0.001, + covariates.drop( + covariates[(pred_ratios < 1e-4) | (pred_ratios >= 15)].index, + inplace=True, ) + self.uns["trend_coeffs"] = pd.Series(coeffs, index=["a0", "a1"]) - self.uns["disp_function_type"] = "mean" - self.varm["fitted_dispersions"] = np.full(self.n_vars, self.uns["mean_disp"]) + self.varm["fitted_dispersions"] = np.full(self.n_vars, np.NaN) + self.uns["disp_function_type"] = "parametric" + self.varm["fitted_dispersions"][self.varm["non_zero"]] = self.disp_function( + self.varm["_normed_means"][self.varm["non_zero"]] + ) end = time.time() diff --git a/pydeseq2/default_inference.py b/pydeseq2/default_inference.py index ed3c18da..923b3b55 100644 --- a/pydeseq2/default_inference.py +++ b/pydeseq2/default_inference.py @@ -201,7 +201,7 @@ def wald_test( # noqa: D102 def dispersion_trend_gamma_glm( # noqa: D102 self, covariates: pd.Series, targets: pd.Series - ) -> Tuple[np.ndarray, np.ndarray]: + ) -> Tuple[np.ndarray, np.ndarray, bool]: covariates_w_intercept = covariates.to_frame() covariates_w_intercept.insert(0, "intercept", 1) covariates_fit = covariates_w_intercept.values @@ -225,11 +225,8 @@ def grad(coeffs): bounds=[(0, np.inf)], ) - if not res.success: - raise RuntimeError("Gamma GLM optimization failed.") - coeffs = res.x - return coeffs, covariates_fit @ coeffs + return coeffs, covariates_fit @ coeffs, res.success def lfc_shrink_nbinom_glm( # noqa: D102 self, diff --git a/pydeseq2/inference.py b/pydeseq2/inference.py index e4e50a66..801a86c4 100644 --- a/pydeseq2/inference.py +++ b/pydeseq2/inference.py @@ -285,7 +285,7 @@ def fit_moments_dispersions( @abstractmethod def dispersion_trend_gamma_glm( self, covariates: pd.Series, targets: pd.Series - ) -> Tuple[np.ndarray, np.ndarray]: + ) -> Tuple[np.ndarray, np.ndarray, bool]: """Fit a gamma glm on gene dispersions. The intercept should be concatenated in this method @@ -304,6 +304,8 @@ def dispersion_trend_gamma_glm( Coefficients of the regression. predictions : ndarray Predictions of the regression. + converged : bool + Whether the optimization converged. """ @abstractmethod diff --git a/pydeseq2/utils.py b/pydeseq2/utils.py index 1e012b00..3e67ddae 100644 --- a/pydeseq2/utils.py +++ b/pydeseq2/utils.py @@ -1540,7 +1540,7 @@ def lowess( The arrays features and targets contain an equal number of elements; each pair (features[i], targets[i]) defines a data point in the scatterplot. The function returns the estimated (smooth) values of targets. - The smoothing span is given by f. A larger value for f will result in a + The smoothing span is given by frac. A larger value for frac will result in a smoother curve. The number of robustifying iterations is given by iter. The function will run faster with a smaller number of iterations.