From c668ad405885451691f223f252116f0817f46221 Mon Sep 17 00:00:00 2001 From: Malte Benedikt Kuehl Date: Thu, 22 Jan 2026 12:37:02 +0100 Subject: [PATCH 1/2] Fix pandas 3 data type and copy/write bugs --- pydeseq2/dds.py | 16 +++++++------- pydeseq2/ds.py | 44 +++++++++++++++++++++------------------ pydeseq2/grid_search.py | 4 ++-- pydeseq2/preprocessing.py | 2 +- pydeseq2/utils.py | 10 ++++----- pyproject.toml | 2 +- 6 files changed, 42 insertions(+), 36 deletions(-) diff --git a/pydeseq2/dds.py b/pydeseq2/dds.py index 58d1f1c1..aee7ac44 100644 --- a/pydeseq2/dds.py +++ b/pydeseq2/dds.py @@ -657,7 +657,7 @@ def fit_size_factors( # Calculate logcounts for x > 0 and take the mean for each gene log_counts = np.zeros_like(self.X, dtype=float) np.log(self.X, out=log_counts, where=self.X != 0) - logmeans = log_counts.mean(0) + logmeans = log_counts.mean(axis=0) # Determine which genes are usable (finite logmeans) self.filtered_genes = (~np.isinf(logmeans)) & (logmeans > 0) @@ -705,7 +705,7 @@ def sizeFactor(x): raise ValueError("Counts matrix 'X' is None, cannot fit size factors.") end = time.time() - self.var["_normed_means"] = self.layers["normed_counts"].mean(0) + self.var["_normed_means"] = self.layers["normed_counts"].mean(axis=0) if not self.quiet: print(f"... done in {end - start:.2f} seconds.\n", file=sys.stderr) @@ -793,7 +793,9 @@ def fit_genewise_dispersions(self, vst=False) -> None: dispersions_, self.min_disp, self.max_disp ) - self.var["_genewise_converged"] = np.full(self.n_vars, np.nan) + self.var["_genewise_converged"] = pd.array( + [pd.NA] * self.n_vars, dtype="boolean" + ) self.var.loc[self.var["non_zero"], "_genewise_converged"] = l_bfgs_b_converged_ def fit_dispersion_trend(self, vst: bool = False) -> None: @@ -919,7 +921,7 @@ def fit_MAP_dispersions(self) -> None: dispersions_, self.min_disp, self.max_disp ) - self.var["_MAP_converged"] = np.full(self.n_vars, np.nan) + self.var["_MAP_converged"] = pd.array([pd.NA] * self.n_vars, dtype="boolean") self.var.loc[self.var["non_zero"], "_MAP_converged"] = l_bfgs_b_converged_ # Filter outlier genes for which we won't apply shrinkage @@ -980,7 +982,7 @@ def fit_LFC(self) -> None: self.obsm["_mu_LFC"] = mu_ self.obsm["_hat_diagonals"] = hat_diagonals_ - self.var["_LFC_converged"] = np.full(self.n_vars, np.nan) + self.var["_LFC_converged"] = pd.array([pd.NA] * self.n_vars, dtype="boolean") self.var.loc[self.var["non_zero"], "_LFC_converged"] = converged_ def calculate_cooks(self) -> None: @@ -1098,7 +1100,7 @@ def cooks_outlier(self): cooks_outlier[cooks_outlier] = ( self.X[:, cooks_outlier] > self.X[:, cooks_outlier][pos, np.arange(len(pos))] - ).sum(0) < 3 + ).sum(axis=0) < 3 if self.low_memory: del self.layers["cooks"] @@ -1423,7 +1425,7 @@ def _refit_without_outliers( sub_dds.uns["trend_coeffs"] = self.uns["trend_coeffs"] elif sub_dds.uns["disp_function_type"] == "mean": sub_dds.uns["mean_disp"] = self.uns["mean_disp"] - sub_dds.var["_normed_means"] = sub_dds.layers["normed_counts"].mean(0) + sub_dds.var["_normed_means"] = sub_dds.layers["normed_counts"].mean(axis=0) # Reshape in case there's a single gene to refit sub_dds.var["fitted_dispersions"] = sub_dds.disp_function( sub_dds.var["_normed_means"] diff --git a/pydeseq2/ds.py b/pydeseq2/ds.py index 14491567..6e22fde7 100644 --- a/pydeseq2/ds.py +++ b/pydeseq2/ds.py @@ -411,29 +411,33 @@ def lfc_shrink(self, coeff: str, adapt: bool = True) -> None: if not self.quiet: print(f"... done in {end - start:.2f} seconds.\n", file=sys.stderr) - self.LFC.iloc[:, coeff_idx].update( - pd.Series( - np.array(lfcs)[:, coeff_idx], - index=self.dds.non_zero_genes, - ) + new_lfc_values = np.array(lfcs)[:, coeff_idx] + new_se_values = np.array( + [ + np.sqrt(np.abs(inv_hess[coeff_idx, coeff_idx])) + for inv_hess in inv_hessians + ] ) - - self.SE.update( - pd.Series( - np.array( - [ - np.sqrt(np.abs(inv_hess[coeff_idx, coeff_idx])) - for inv_hess in inv_hessians - ] - ), - index=self.dds.non_zero_genes, + nan_mask = ~np.isfinite(new_lfc_values) | ~np.isfinite(new_se_values) + + if nan_mask.any(): + warnings.warn( + f"{nan_mask.sum()} gene(s) had NaN/infinite values during LFC shrinkage," + " their LFCs and SEs were not updated.", + UserWarning, + stacklevel=2, ) - ) - self._LFC_shrink_converged = pd.Series(np.nan, index=self.dds.var_names) - self._LFC_shrink_converged.update( - pd.Series(l_bfgs_b_converged_, index=self.dds.non_zero_genes) + # Only update genes with valid (non-NaN) shrinkage results + valid_genes = self.dds.non_zero_genes[~nan_mask] + self.LFC.loc[valid_genes, coeff] = new_lfc_values[~nan_mask] + self.SE.loc[valid_genes] = new_se_values[~nan_mask] + + self._LFC_shrink_converged = pd.Series( + pd.array([pd.NA] * len(self.dds.var_names), dtype="boolean"), + index=self.dds.var_names, ) + self._LFC_shrink_converged.loc[self.dds.non_zero_genes] = l_bfgs_b_converged_ # Set a flag to indicate that LFCs were shrunk self.shrunk_LFCs = True @@ -511,7 +515,7 @@ def _independent_filtering(self) -> None: U2 = self.p_values[use] if not U2.empty: result.loc[use, i] = false_discovery_control(U2, method="bh") - num_rej = (result < self.alpha).sum(0).to_numpy().astype(int) + num_rej = (result < self.alpha).sum(axis=0).to_numpy().astype(int) lowess_res = lowess(theta, num_rej, frac=1 / 5) if num_rej.max() <= 10: diff --git a/pydeseq2/grid_search.py b/pydeseq2/grid_search.py index a172ce96..e46290fd 100644 --- a/pydeseq2/grid_search.py +++ b/pydeseq2/grid_search.py @@ -42,13 +42,13 @@ def vec_nb_nll( -logbinom + (counts[:, None] + alpha_neg1) * np.log(mu[:, None] + alpha_neg1) - (counts * np.log(mu))[:, None] - ).sum(0) + ).sum(axis=0) else: return n * alpha_neg1 * np.log(alpha) + ( -logbinom + (counts[:, None] + alpha_neg1) * np.log(mu + alpha_neg1) - (counts[:, None] * np.log(mu)) - ).sum(0) + ).sum(axis=0) def grid_fit_alpha( diff --git a/pydeseq2/preprocessing.py b/pydeseq2/preprocessing.py index 0c0a5a34..c20a283f 100644 --- a/pydeseq2/preprocessing.py +++ b/pydeseq2/preprocessing.py @@ -49,7 +49,7 @@ def deseq2_norm_fit(counts: pd.DataFrame | np.ndarray) -> tuple[np.ndarray, np.n # Compute gene-wise mean log counts with np.errstate(divide="ignore"): # ignore division by zero warnings log_counts = np.log(counts) - logmeans = log_counts.mean(0) + logmeans = log_counts.mean(axis=0) # Filter out genes with -∞ log means filtered_genes = ~np.isinf(logmeans) diff --git a/pydeseq2/utils.py b/pydeseq2/utils.py index 1e3bb23f..5fcb50e6 100644 --- a/pydeseq2/utils.py +++ b/pydeseq2/utils.py @@ -222,7 +222,7 @@ def nb_nll( - logbinom + (counts + alpha_neg1) * np.log(mu + alpha_neg1) - (counts * np.log(mu)) - ).sum(0) + ).sum(axis=0) else: return ( n * alpha_neg1 * np.log(alpha) @@ -849,7 +849,7 @@ def fit_rough_dispersions( y_hat = np.maximum(y_hat, 1) alpha_rde = ( ((normed_counts - y_hat) ** 2 - y_hat) / ((num_samples - num_vars) * y_hat**2) - ).sum(0) + ).sum(axis=0) return np.maximum(alpha_rde, 0) @@ -877,7 +877,7 @@ def fit_moments_dispersions( # Exclude genes with all zeroes normed_counts = normed_counts[:, ~(normed_counts == 0).all(axis=0)] # mean inverse size factor - s_mean_inv = (1 / size_factors).mean() + s_mean_inv = (1 / size_factors).mean(axis=0) mu = normed_counts.mean(0) sigma = normed_counts.var(0, ddof=1) # ddof=1 is to use an unbiased estimator, as in R @@ -951,7 +951,7 @@ def robust_method_of_moments_disp( np.ndarray, trimmed_variance(normed_counts) ) # Since normed_counts is always 2D, trimmed_variance returns ndarray - m = normed_counts.mean(0) + m = normed_counts.mean(axis=0) alpha = (v - m) / m**2 # cannot use the typical min_disp = 1e-8 here or else all counts in the same # group as the outlier count will get an extreme Cook's distance @@ -1202,7 +1202,7 @@ def nbinomFn( nll = ( counts * xbeta - (counts + size) * np.logaddexp(xbeta + offset, np.log(size)) - ).sum(0) + ).sum(axis=0) return prior - nll diff --git a/pyproject.toml b/pyproject.toml index 3828a74c..2cfe8ac7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ requires = [ "hatchling" ] [project] name = "pydeseq2" -version = "0.5.3" +version = "0.5.4" description = "A python implementation of DESeq2." readme = "README.md" license = { file = "LICENSE" } From fe4bd0f5b7d09f6826edca50c6960f437cbbc2cd Mon Sep 17 00:00:00 2001 From: Malte Benedikt Kuehl Date: Thu, 22 Jan 2026 12:49:42 +0100 Subject: [PATCH 2/2] Fix old DataFrame intersphinx reference --- docs/source/conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/conf.py b/docs/source/conf.py index a9fa9905..12ce74ee 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -230,6 +230,7 @@ nitpick_ignore = [ ("py:class", "pd.Series"), ("py:class", "pd.DataFrame"), + ("py:class", "pandas.core.frame.DataFrame"), ("py:class", "ndarray"), ("py:class", "numpy._typing._generic_alias.ScalarType"), ("py:class", "pydantic.main.BaseModel"),