diff --git a/.github/workflows/pr_validation.yml b/.github/workflows/pr_validation.yml index 4c14dfcb..4c55eebc 100644 --- a/.github/workflows/pr_validation.yml +++ b/.github/workflows/pr_validation.yml @@ -17,8 +17,6 @@ jobs: strategy: matrix: include: - - os: ubuntu-latest - python: "3.8" - os: ubuntu-latest python: "3.9" - os: ubuntu-latest diff --git a/README.md b/README.md index 9eadcbdf..896f24b6 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,6 @@ For reference, the code is being tested in a github workflow (CI) with python - pandas 1.4.3 - scikit-learn 1.1.1 - scipy 1.8.1 -- statsmodels 0.13.2 ``` Please don't hesitate to open an issue in case you encounter any issue due to possible deprecations. diff --git a/docs/source/conf.py b/docs/source/conf.py index 282f7230..d74ce929 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -7,8 +7,6 @@ # -- Path setup -------------------------------------------------------------- -import warnings - # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. @@ -17,12 +15,9 @@ from pathlib import Path import git -from statsmodels.tools.sm_exceptions import DomainWarning import pydeseq2 -# Ignore DomainWarning raised by statsmodels when fitting a Gamma GLM with identity link. -warnings.simplefilter("ignore", DomainWarning) # -- Project information ----------------------------------------------------- diff --git a/docs/source/usage/requirements.rst b/docs/source/usage/requirements.rst index e24e72dd..e091b284 100644 --- a/docs/source/usage/requirements.rst +++ b/docs/source/usage/requirements.rst @@ -8,6 +8,5 @@ For reference, the code was tested with python 3.8 and the following package ver - pandas==1.4.3 - scikit-learn==1.1.1 - scipy==1.8.1 - - statsmodels==0.13.2 Please don't hesitate to open an issue in case you encounter any problems due to possible deprecations. \ No newline at end of file diff --git a/pydeseq2/dds.py b/pydeseq2/dds.py index 665ef5b9..dbe790a7 100644 --- a/pydeseq2/dds.py +++ b/pydeseq2/dds.py @@ -343,7 +343,7 @@ def vst_fit( if use_design: # Check that the dispersion trend curve was fitted. If not, fit it. # This will call previous functions in a cascade. - if "trend_coeffs" not in self.uns: + if "disp_function" not in self.uns: self.fit_dispersion_trend() else: # Reduce the design matrix to an intercept and reconstruct at the end @@ -477,7 +477,7 @@ def fit_size_factors( warnings.warn( "Every gene contains at least one zero, " "cannot compute log geometric means. Switching to iterative mode.", - RuntimeWarning, + UserWarning, stacklevel=2, ) self._fit_iterate_size_factors() @@ -601,12 +601,35 @@ def fit_dispersion_trend(self) -> None: # Initialize coefficients old_coeffs = pd.Series([0.1, 0.1]) coeffs = pd.Series([1.0, 1.0]) - - while (np.log(np.abs(coeffs / old_coeffs)) ** 2).sum() >= 1e-6: + while (coeffs > 0).all() and ( + np.log(np.abs(coeffs / old_coeffs)) ** 2 + ).sum() >= 1e-6: old_coeffs = coeffs - coeffs, predictions = self.inference.dispersion_trend_gamma_glm( + coeffs, predictions, converged = self.inference.dispersion_trend_gamma_glm( covariates, targets ) + + if not converged: + warnings.warn( + "The dispersion trend curve fitting did not converge. " + "Switching to a mean-based dispersion trend.", + UserWarning, + stacklevel=2, + ) + self.uns["mean_disp"] = trim_mean( + self.varm["genewise_dispersions"][ + self.varm["genewise_dispersions"] > 10 * self.min_disp + ], + proportiontocut=0.001, + ) + + self.uns["disp_function_type"] = "mean" + self.varm["fitted_dispersions"] = np.full( + self.n_vars, self.uns["mean_disp"] + ) + + break + # Filter out genes that are too far away from the curve before refitting pred_ratios = ( self[:, covariates.index].varm["genewise_dispersions"] / predictions @@ -620,19 +643,25 @@ def fit_dispersion_trend(self) -> None: covariates[(pred_ratios < 1e-4) | (pred_ratios >= 15)].index, inplace=True, ) + self.uns["trend_coeffs"] = pd.Series(coeffs, index=["a0", "a1"]) + + self.varm["fitted_dispersions"] = np.full(self.n_vars, np.NaN) + self.uns["disp_function_type"] = "parametric" + self.varm["fitted_dispersions"][self.varm["non_zero"]] = self.disp_function( + self.varm["_normed_means"][self.varm["non_zero"]] + ) end = time.time() if not self.quiet: print(f"... done in {end - start:.2f} seconds.\n", file=sys.stderr) - self.uns["trend_coeffs"] = pd.Series(coeffs, index=["a0", "a1"]) - - self.varm["fitted_dispersions"] = np.full(self.n_vars, np.NaN) - self.varm["fitted_dispersions"][self.varm["non_zero"]] = dispersion_trend( - self.varm["_normed_means"][self.varm["non_zero"]], - self.uns["trend_coeffs"], - ) + def disp_function(self, x): + """Return the dispersion trend function at x.""" + if self.uns["disp_function_type"] == "parametric": + return dispersion_trend(x, self.uns["trend_coeffs"]) + elif self.uns["disp_function_type"] == "mean": + return self.uns["mean_disp"] def fit_dispersion_prior(self) -> None: """Fit dispersion variance priors and standard deviation of log-residuals. @@ -1001,11 +1030,14 @@ def _refit_without_outliers( # Compute trend dispersions. # Note: the trend curve is not refitted. - sub_dds.uns["trend_coeffs"] = self.uns["trend_coeffs"] + sub_dds.uns["disp_function_type"] = self.uns["disp_function_type"] + if sub_dds.uns["disp_function_type"] == "parametric": + sub_dds.uns["trend_coeffs"] = self.uns["trend_coeffs"] + elif sub_dds.uns["disp_function_type"] == "mean": + sub_dds.uns["mean_disp"] = self.uns["mean_disp"] sub_dds.varm["_normed_means"] = sub_dds.layers["normed_counts"].mean(0) - sub_dds.varm["fitted_dispersions"] = dispersion_trend( - sub_dds.varm["_normed_means"], - sub_dds.uns["trend_coeffs"], + sub_dds.varm["fitted_dispersions"] = sub_dds.disp_function( + sub_dds.varm["_normed_means"] ) # Estimate MAP dispersions. @@ -1089,9 +1121,18 @@ def objective(p): & self.varm["non_zero"] ] - mean_disp = trimmed_mean( - self[:, use_for_mean_genes].varm["genewise_dispersions"], trim=0.001 + if len(use_for_mean_genes) == 0: + print( + "No genes have a dispersion above 10 * min_disp in " + "_fit_iterate_size_factors." + ) + break + + mean_disp = trim_mean( + self[:, use_for_mean_genes].varm["genewise_dispersions"], + proportiontocut=0.001, ) + self.varm["fitted_dispersions"] = np.ones(self.n_vars) * mean_disp self.fit_dispersion_prior() self.fit_MAP_dispersions() diff --git a/pydeseq2/default_inference.py b/pydeseq2/default_inference.py index 29ebb65c..923b3b55 100644 --- a/pydeseq2/default_inference.py +++ b/pydeseq2/default_inference.py @@ -1,22 +1,17 @@ -import warnings from typing import Literal from typing import Optional from typing import Tuple import numpy as np import pandas as pd -import statsmodels.api as sm # type: ignore from joblib import Parallel # type: ignore from joblib import delayed from joblib import parallel_backend -from statsmodels.tools.sm_exceptions import DomainWarning # type: ignore +from scipy.optimize import minimize # type: ignore from pydeseq2 import inference from pydeseq2 import utils -# Ignore DomainWarning raised by statsmodels when fitting a Gamma GLM with identity link. -warnings.simplefilter("ignore", DomainWarning) - class DefaultInference(inference.Inference): """Default DESeq2-related inference methods, using scipy/sklearn/numpy. @@ -206,18 +201,32 @@ def wald_test( # noqa: D102 def dispersion_trend_gamma_glm( # noqa: D102 self, covariates: pd.Series, targets: pd.Series - ) -> Tuple[np.ndarray, np.ndarray]: - covariates_w_intercept = sm.add_constant(covariates) - targets_fit = targets.values + ) -> Tuple[np.ndarray, np.ndarray, bool]: + covariates_w_intercept = covariates.to_frame() + covariates_w_intercept.insert(0, "intercept", 1) covariates_fit = covariates_w_intercept.values - glm_gamma = sm.GLM( - targets_fit, - covariates_fit, - family=sm.families.Gamma(link=sm.families.links.identity()), + targets_fit = targets.values + + def loss(coeffs): + mu = covariates_fit @ coeffs + return np.nanmean(targets_fit / mu + np.log(mu), axis=0) + + def grad(coeffs): + mu = covariates_fit @ coeffs + return -np.nanmean( + ((targets_fit / mu - 1)[:, None] * covariates_fit) / mu[:, None], axis=0 + ) + + res = minimize( + loss, + x0=np.array([1.0, 1.0]), + jac=grad, + method="L-BFGS-B", + bounds=[(0, np.inf)], ) - res = glm_gamma.fit() - coeffs = res.params - return (coeffs, covariates_fit @ coeffs) + + coeffs = res.x + return coeffs, covariates_fit @ coeffs, res.success def lfc_shrink_nbinom_glm( # noqa: D102 self, diff --git a/pydeseq2/ds.py b/pydeseq2/ds.py index 9693c318..a86a9844 100644 --- a/pydeseq2/ds.py +++ b/pydeseq2/ds.py @@ -7,14 +7,14 @@ # import anndata as ad import numpy as np import pandas as pd -import statsmodels.api as sm # type: ignore from scipy.optimize import root_scalar # type: ignore from scipy.stats import f # type: ignore -from statsmodels.stats.multitest import multipletests # type: ignore +from scipy.stats import false_discovery_control # type: ignore from pydeseq2.dds import DeseqDataSet from pydeseq2.default_inference import DefaultInference from pydeseq2.inference import Inference +from pydeseq2.utils import lowess from pydeseq2.utils import make_MA_plot from pydeseq2.utils import n_or_more_replicates @@ -526,18 +526,15 @@ def _independent_filtering(self) -> None: use = (self.base_mean >= cutoff) & (~self.p_values.isna()) U2 = self.p_values[use] if not U2.empty: - result.loc[use, i] = multipletests( - U2, alpha=self.alpha, method="fdr_bh" - )[1] - - num_rej = (result < self.alpha).sum(0) - lowess = sm.nonparametric.lowess(num_rej, theta, frac=1 / 5) + result.loc[use, i] = false_discovery_control(U2, method="bh") + num_rej = (result < self.alpha).sum(0).values + lowess_res = lowess(num_rej, theta, frac=1 / 5) if num_rej.max() <= 10: j = 0 else: - residual = num_rej[num_rej > 0] - lowess[num_rej > 0, 1] - thresh = lowess[:, 1].max() - np.sqrt(np.mean(residual**2)) + residual = num_rej[num_rej > 0] - lowess_res[num_rej > 0, 1] + thresh = lowess_res[:, 1].max() - np.sqrt(np.mean(residual**2)) if np.any(num_rej > thresh): j = np.where(num_rej > thresh)[0][0] @@ -557,8 +554,8 @@ def _p_value_adjustment(self) -> None: self.run_wald_test() self.padj = pd.Series(np.nan, index=self.dds.var_names) - self.padj.loc[~self.p_values.isna()] = multipletests( - self.p_values.dropna(), alpha=self.alpha, method="fdr_bh" + self.padj.loc[~self.p_values.isna()] = false_discovery_control( + self.p_values.dropna(), method="bh" )[1] def _cooks_filtering(self) -> None: diff --git a/pydeseq2/inference.py b/pydeseq2/inference.py index e4e50a66..801a86c4 100644 --- a/pydeseq2/inference.py +++ b/pydeseq2/inference.py @@ -285,7 +285,7 @@ def fit_moments_dispersions( @abstractmethod def dispersion_trend_gamma_glm( self, covariates: pd.Series, targets: pd.Series - ) -> Tuple[np.ndarray, np.ndarray]: + ) -> Tuple[np.ndarray, np.ndarray, bool]: """Fit a gamma glm on gene dispersions. The intercept should be concatenated in this method @@ -304,6 +304,8 @@ def dispersion_trend_gamma_glm( Coefficients of the regression. predictions : ndarray Predictions of the regression. + converged : bool + Whether the optimization converged. """ @abstractmethod diff --git a/pydeseq2/utils.py b/pydeseq2/utils.py index 0f04e0f5..3e67ddae 100644 --- a/pydeseq2/utils.py +++ b/pydeseq2/utils.py @@ -1,5 +1,6 @@ import multiprocessing import warnings +from math import ceil from math import floor from pathlib import Path from typing import List @@ -49,7 +50,8 @@ def load_example_data( debug : bool If true, subsample 10 samples and 100 genes at random. - (Note that the "synthetic" dataset is already 10 x 100.) (default: ``False``). + (Note that the "synthetic" dataset is already 10 features 100.) + (default: ``False``). debug_seed : int Seed for the debug mode. (default: ``42``). @@ -731,7 +733,7 @@ def trimmed_mean(x, trim: float = 0.1, **kwargs) -> Union[float, np.ndarray]: Parameters ---------- - x : ndarray + features : ndarray Data whose mean to compute. trim : float @@ -815,7 +817,7 @@ def trimmed_variance( Parameters ---------- - x : ndarray + features : ndarray Data whose trimmed variance to compute. trim : float @@ -1368,7 +1370,7 @@ def mean_absolute_deviation(x: np.ndarray) -> float: Parameters ---------- - x : ndarray + features : ndarray 1D array whose MAD to compute. Returns @@ -1399,13 +1401,13 @@ def make_scatter( List of ndarrays to plot. legend_labels : list - List of strings that correspond to plotted y values for legend. + List of strings that correspond to plotted targets values for legend. x_val : ndarray 1D array to plot (example: ``dds.varm['_normed_means']``). log : bool - Whether or not to log scale x and y axes (``default=True``). + Whether or not to log scale features and targets axes (``default=True``). save_path : str or None The path where to save the plot. If left None, the plot won't be saved @@ -1475,7 +1477,7 @@ def make_MA_plot( P-value threshold to subset scatterplot colors on. log : bool - Whether or not to log scale x and y axes (``default=True``). + Whether or not to log scale features and targets axes (``default=True``). save_path : str or None The path where to save the plot. If left None, the plot won't be saved @@ -1520,3 +1522,73 @@ def make_MA_plot( if save_path is not None: plt.savefig(save_path, bbox_inches="tight") + + +# Authors: Alexandre Gramfort +# +# License: BSD (3-clause) + +# Adapted from https://gist.github.com/agramfort/850437 + + +def lowess( + features: np.ndarray, targets: np.ndarray, frac: float = 2.0 / 3.0, iter: int = 3 +): + """Run lowess smoothing: Robust locally weighted regression. + + The lowess function fits a nonparametric regression curve to a scatterplot. + The arrays features and targets contain an equal number of elements; each pair + (features[i], targets[i]) defines a data point in the scatterplot. The function + returns the estimated (smooth) values of targets. + The smoothing span is given by frac. A larger value for frac will result in a + smoother curve. The number of robustifying iterations is given by iter. The + function will run faster with a smaller number of iterations. + + Parameters + ---------- + features : ndarray + A 1D array of data points. + targets : ndarray + A 1D array of target values (with the same shape as features). + frac : float + The fraction of the data used when estimating each y-value. (default: ``2/3``). + iter : int + The number of robustifying iterations. (default: ``3``). + + Returns + ------- + ndarray + Estimated (smooth) values of targets. + """ + n = len(features) + r = int(ceil(frac * n)) + h = np.maximum( + np.array([np.sort(np.abs(features - features[i]))[r] for i in range(n)]), 1e-12 + ) + w = np.clip( + np.abs(np.nan_to_num((features[:, None] - features[None, :]) / h)), 0.0, 1.0 + ) + w = (1 - w**3) ** 3 + yest = np.zeros(n) + delta = np.ones(n) + for _ in range(iter): + for i in range(n): + weights = delta * w[:, i] + b = np.array( + [np.sum(weights * targets), np.sum(weights * targets * features)] + ) + A = np.array( + [ + [np.sum(weights), np.sum(weights * features)], + [np.sum(weights * features), np.sum(weights * features * features)], + ] + ) + beta = np.linalg.lstsq(A, b)[0] + yest[i] = beta[0] + beta[1] * features[i] + + residuals = targets - yest + s = np.median(np.abs(residuals)) + delta = np.clip(residuals / (6.0 * s), -1, 1) + delta = (1 - delta**2) ** 2 + + return yest diff --git a/pyproject.toml b/pyproject.toml index 267a8c30..e30e39a8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,9 +47,7 @@ convention = "numpy" [tool.pytest.ini_options] filterwarnings = [ "error", - "ignore::DeprecationWarning:patsy[.*]", # due to patsy being a dependency of statsmodels and - # calling pandas.core.dtypes.common.is_categorical_dtype. - "ignore::DeprecationWarning:statsmodels", # ignore Pyarrow deprecation warnings - "ignore::statsmodels.tools.sm_exceptions.DomainWarning", # due to fitting the dipersion curve with the identity link + '''ignore:\s*Pyarrow will become a required dependency of pandas:DeprecationWarning''', + # ignore Pyarrow deprecation warnings "ignore::FutureWarning", # Ignore AnnData FutureWarning about implicit conversion ] diff --git a/setup.py b/setup.py index 5bb3269c..ee560ea2 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ setup( name="pydeseq2", version=about["__version__"], - python_requires=">=3.8.0", + python_requires=">=3.9.0", license="MIT", description="A python implementation of DESeq2.", long_description=readme, @@ -31,8 +31,7 @@ "numpy>=1.23.0", "pandas>=1.4.0", "scikit-learn>=1.1.0", - "scipy>=1.8.0", - "statsmodels", + "scipy>=1.11.0", "matplotlib>=3.6.2", # not sure why sphinx_gallery does not work without it ], # external packages as dependencies extras_require={ diff --git a/tests/test_edge_cases.py b/tests/test_edge_cases.py index 5a2b8a71..2746e7b3 100644 --- a/tests/test_edge_cases.py +++ b/tests/test_edge_cases.py @@ -468,7 +468,9 @@ def test_zero_inflated(): counts_df.iloc[idx, :] = 0 dds = DeseqDataSet(counts=counts_df, metadata=metadata) - with pytest.warns(RuntimeWarning): + with pytest.warns(UserWarning): + # Will warn that each gene contains a zero + # and that the parametric curve is not a good fit dds.deseq2()