From 8052637fa745d9cb9f5618d7140a1f46b422b413 Mon Sep 17 00:00:00 2001 From: Marius Lange Date: Wed, 13 Aug 2025 08:37:06 +0200 Subject: [PATCH 1/5] Fix CellMapper linear NaNs --- src/methods/cellmapper_linear/config.vsh.yaml | 7 ++++--- src/methods/cellmapper_linear/script.py | 7 ++++++- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/methods/cellmapper_linear/config.vsh.yaml b/src/methods/cellmapper_linear/config.vsh.yaml index bb18e26..a0e0fad 100644 --- a/src/methods/cellmapper_linear/config.vsh.yaml +++ b/src/methods/cellmapper_linear/config.vsh.yaml @@ -42,19 +42,20 @@ arguments: - name: "--fallback_representation" type: "string" choices: ["joint_pca", "fast_cca"] - default: "fast_cca" + default: "fast_pca" description: Fallback representation to use for k-NN mapping (computed if use_rep is None). - name: "--mask_var" type: "string" + default: "hvg" description: Variable to mask for fallback representation. - name: "--kernel_method" type: "string" choices: ["hnoca", "gauss"] - default: "hnoca" + default: "gauss" description: Kernel function to compute k-NN edge weights. - name: "--n_neighbors" type: "integer" - default: 30 + default: 50 description: Number of neighbors to consider for k-NN graph construction. resources: - type: python_script diff --git a/src/methods/cellmapper_linear/script.py b/src/methods/cellmapper_linear/script.py index 8863a22..2e10f47 100644 --- a/src/methods/cellmapper_linear/script.py +++ b/src/methods/cellmapper_linear/script.py @@ -33,10 +33,15 @@ # copy the normalized layer to obsm for mod2 input_train_mod1.obsm["mod2"] = input_train_mod2.layers["normalized"] +# choose the kNN method based on total cell number +n_obs = input_test_mod1.n_obs + input_train_mod1.n_obs +knn_method = "sklearn" if n_obs < 60000 else "pynndescent" + print("Set up and prepare Cellmapper", flush=True) cmap = cm.CellMapper(query=input_test_mod1, reference=input_train_mod1) cmap.compute_neighbors( - use_rep=None, + use_rep=None, + knn_method=knn_method, fallback_representation=par['fallback_representation'], n_neighbors=par['n_neighbors'], fallback_kwargs={"mask_var": par['mask_var']}, From 651b2ac868b5070ce849e745daa9dc3f29893c8a Mon Sep 17 00:00:00 2001 From: Marius Lange Date: Wed, 13 Aug 2025 10:20:45 +0200 Subject: [PATCH 2/5] Update cellmaper_scvi --- src/methods/cellmapper_linear/script.py | 5 +- src/methods/cellmapper_scvi/config.vsh.yaml | 4 +- src/methods/cellmapper_scvi/script.py | 11 ++- src/methods/cellmapper_scvi/utils.py | 95 +++++++++++++++++++-- 4 files changed, 98 insertions(+), 17 deletions(-) diff --git a/src/methods/cellmapper_linear/script.py b/src/methods/cellmapper_linear/script.py index 2e10f47..df35423 100644 --- a/src/methods/cellmapper_linear/script.py +++ b/src/methods/cellmapper_linear/script.py @@ -35,13 +35,12 @@ # choose the kNN method based on total cell number n_obs = input_test_mod1.n_obs + input_train_mod1.n_obs -knn_method = "sklearn" if n_obs < 60000 else "pynndescent" print("Set up and prepare Cellmapper", flush=True) cmap = cm.CellMapper(query=input_test_mod1, reference=input_train_mod1) cmap.compute_neighbors( - use_rep=None, - knn_method=knn_method, + use_rep=None, + knn_method="sklearn" if n_obs < 60000 else "pynndescent", fallback_representation=par['fallback_representation'], n_neighbors=par['n_neighbors'], fallback_kwargs={"mask_var": par['mask_var']}, diff --git a/src/methods/cellmapper_scvi/config.vsh.yaml b/src/methods/cellmapper_scvi/config.vsh.yaml index 359579a..613c8e6 100644 --- a/src/methods/cellmapper_scvi/config.vsh.yaml +++ b/src/methods/cellmapper_scvi/config.vsh.yaml @@ -43,11 +43,11 @@ arguments: - name: "--kernel_method" type: "string" choices: ["hnoca", "gauss"] - default: "hnoca" + default: "gauss" description: Kernel function to compute k-NN edge weights (CellMapper parameter). - name: "--n_neighbors" type: "integer" - default: 30 + default: 50 description: Number of neighbors to consider for k-NN graph construction (CellMapper parameter). - name: "--use_hvg" type: boolean diff --git a/src/methods/cellmapper_scvi/script.py b/src/methods/cellmapper_scvi/script.py index 1f9e74b..702b97b 100644 --- a/src/methods/cellmapper_scvi/script.py +++ b/src/methods/cellmapper_scvi/script.py @@ -11,9 +11,9 @@ 'input_train_mod2': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_multiome/swap/train_mod2.h5ad', 'input_test_mod1': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_multiome/swap/test_mod1.h5ad', 'output': 'output.h5ad', - 'n_neighbors': 30, - 'kernel_method': 'hnoca', - 'use_hvg': False, + 'n_neighbors': 50, + 'kernel_method': 'gauss', + 'use_hvg': True, 'adt_normalization': 'clr', # Normalization method for ADT data 'plot_umap': True, @@ -41,6 +41,7 @@ adata = ad.concat( [input_train_mod1, input_test_mod1], merge = "same", label="split", keys=["train", "test"] ) +adata.X = adata.layers["counts"] # Compute a latent representation using an appropriate model based on the modality print("Get latent representation", flush=True) @@ -55,10 +56,14 @@ # copy the normalized layer to obsm for mod2 input_train_mod1.obsm["mod2"] = input_train_mod2.layers["normalized"] +# choose the kNN method based on total cell number +n_obs = input_test_mod1.n_obs + input_train_mod1.n_obs + print('Setup and prepare Cellmapper', flush=True) cmap = cm.CellMapper(query=input_test_mod1, reference=input_train_mod1) cmap.compute_neighbors( use_rep="X_scvi", + knn_method="sklearn" if n_obs < 60000 else "pynndescent", n_neighbors=par['n_neighbors'], ) cmap.compute_mapping_matrix(kernel_method=par['kernel_method']) diff --git a/src/methods/cellmapper_scvi/utils.py b/src/methods/cellmapper_scvi/utils.py index f059877..c2a428d 100644 --- a/src/methods/cellmapper_scvi/utils.py +++ b/src/methods/cellmapper_scvi/utils.py @@ -4,6 +4,75 @@ from scipy.sparse import issparse, csr_matrix, csc_matrix import muon import scanpy as sc +import numpy as np + + + +def preprocess_features( + adata: ad.AnnData, + modality: Literal["GEX", "ADT", "ATAC"], + use_hvg: bool = True, + min_cells_fraction: float = 0.05, + feature_filter_threshold: int = 20000, +) -> ad.AnnData: + """ + Preprocess features with optional filtering and HVG selection. + + Parameters + ---------- + adata + AnnData object to preprocess + modality + The modality type, affects filtering strategy + use_hvg + Whether to apply HVG selection (applies to all modalities if requested) + min_cells_fraction + Minimum fraction of cells a feature must be detected in to be kept + feature_filter_threshold + Only apply feature filtering if n_vars > this threshold + + Returns + ------- + Preprocessed AnnData object + """ + print(f"Starting preprocessing for {modality} with {adata.n_obs} cells and {adata.n_vars} features") + + # Feature filtering: only apply if we have many features (mainly for ATAC) + if adata.n_vars > feature_filter_threshold: + min_cells = int(adata.n_obs * min_cells_fraction) + print(f"Applying feature filtering: removing features detected in <{min_cells} cells ({min_cells_fraction:.1%} threshold)") + + n_features_before = adata.n_vars + sc.pp.filter_genes(adata, min_cells=min_cells) + n_features_after = adata.n_vars + + print(f"Features after filtering: {n_features_after} (removed {n_features_before - n_features_after})") + else: + print(f"Skipping feature filtering (only {adata.n_vars} features, threshold is {feature_filter_threshold})") + + # HVG selection: apply if requested + if use_hvg: + n_hvg = adata.var["hvg"].sum() + print(f"Applying HVG selection: subsetting to {n_hvg} highly variable features ({n_hvg/adata.n_vars:.2%})") + adata = adata[:, adata.var["hvg"]].copy() + else: + print("HVG selection disabled, using all remaining features") + + # Critical check: ensure no cells have zero counts after filtering + cell_counts = np.array(adata.layers['counts'].sum(axis=1)).flatten() + zero_count_cells = (cell_counts == 0).sum() + + if zero_count_cells > 0: + raise ValueError( + f"After preprocessing, {zero_count_cells} cells have zero counts! " + "This would prevent generating latent representations for these cells. " + "Consider relaxing filtering parameters or checking data quality." + ) + + print(f"Final dataset: {adata.n_obs} cells × {adata.n_vars} features") + print(f"Mean counts per cell: {cell_counts.mean():.1f}") + + return adata def get_representation( @@ -12,6 +81,8 @@ def get_representation( use_hvg: bool = True, adt_normalization: Literal["clr", "log_cp10k"] = "clr", plot_umap: bool = False, + min_cells_fraction: float = 0.05, + feature_filter_threshold: int = 20000, ) -> ad.AnnData: """ Get a joint latent space representation of the data based on the modality. @@ -25,7 +96,7 @@ def get_representation( - "GEX": scVI model for gene expression data with ZINB likelihood on raw counts. - "ADT": scVI model for ADT data (surface proteins) with Gaussian likelihood on normalized data. - - "ATAC": PeakVI model for ATAC data with Bernoulli likelihood on binarized count data. + - "ATAC": PeakVI model for ATAC data with Bernoulli likelihood on count data. We assume that regardless of the modality, the raw data will be stored in the `counts` layer (e.g. UMI counts for GEX and peak counts for ATAC), and the normalized data in the `normalized` layer. @@ -37,20 +108,26 @@ def get_representation( - "log_cp10k" (normalization to 10k counts per cell and logarithm transformation) plot_umap Purely for diagnostic purposes, to see whether the data integration looks ok, this optionally computes - a UMAP in shared latent space and stores a plot. + a UMAP in shared latent space and stores a plot. + min_cells_fraction + Minimum fraction of cells a feature must be detected in to be kept during filtering. + feature_filter_threshold + Only apply feature filtering if n_vars > this threshold. This automatically separates + ATAC data (many peaks) from other modalities (fewer features). Returns ------- ad.AnnData AnnData object with the latent representation in `obsm["X_scvi"]`, regardless of the modality. """ - # Subset to highly variable features - if "hvg" in adata.var.columns and use_hvg: - n_hvg = adata.var["hvg"].sum() - print(f"Subsetting to {n_hvg} highly variable features ({n_hvg/adata.n_vars:.2%})", flush=True) - adata = adata[:, adata.var["hvg"]].copy() - else: - print("Training on all available features", flush=True) + # Preprocess features (filtering and HVG selection) + adata = preprocess_features( + adata, + modality=modality, + use_hvg=use_hvg, + min_cells_fraction=min_cells_fraction, + feature_filter_threshold=feature_filter_threshold + ) # Setup the AnnData object for scVI if modality == "GEX": From d63f5cf6451b17f6ca3786adae3146b8b3d670f7 Mon Sep 17 00:00:00 2001 From: Marius Lange Date: Wed, 13 Aug 2025 10:31:13 +0200 Subject: [PATCH 3/5] Disable hvg for scvi-based methods --- src/methods/cellmapper_scvi/config.vsh.yaml | 2 +- src/methods/cellmapper_scvi/script.py | 2 +- src/methods/cellmapper_scvi/utils.py | 13 +++++++++---- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/methods/cellmapper_scvi/config.vsh.yaml b/src/methods/cellmapper_scvi/config.vsh.yaml index 613c8e6..a86f650 100644 --- a/src/methods/cellmapper_scvi/config.vsh.yaml +++ b/src/methods/cellmapper_scvi/config.vsh.yaml @@ -51,7 +51,7 @@ arguments: description: Number of neighbors to consider for k-NN graph construction (CellMapper parameter). - name: "--use_hvg" type: boolean - default: true + default: false description: Whether to use highly variable genes (HVG) for the mapping (Generic analysis parameter). - name: "--adt_normalization" type: "string" diff --git a/src/methods/cellmapper_scvi/script.py b/src/methods/cellmapper_scvi/script.py index 702b97b..b0d2aa7 100644 --- a/src/methods/cellmapper_scvi/script.py +++ b/src/methods/cellmapper_scvi/script.py @@ -13,7 +13,7 @@ 'output': 'output.h5ad', 'n_neighbors': 50, 'kernel_method': 'gauss', - 'use_hvg': True, + 'use_hvg': False, 'adt_normalization': 'clr', # Normalization method for ADT data 'plot_umap': True, diff --git a/src/methods/cellmapper_scvi/utils.py b/src/methods/cellmapper_scvi/utils.py index c2a428d..e5fbdc9 100644 --- a/src/methods/cellmapper_scvi/utils.py +++ b/src/methods/cellmapper_scvi/utils.py @@ -12,7 +12,7 @@ def preprocess_features( adata: ad.AnnData, modality: Literal["GEX", "ADT", "ATAC"], use_hvg: bool = True, - min_cells_fraction: float = 0.05, + min_cells_fraction: float = 0.01, feature_filter_threshold: int = 20000, ) -> ad.AnnData: """ @@ -150,9 +150,14 @@ def get_representation( scvi.model.SCVI.setup_anndata(adata, layer=layer, categorical_covariate_keys=["split", "batch"]) model = scvi.model.SCVI(adata, gene_likelihood="normal", n_layers=1, n_latent=10) elif modality == "ATAC": - layer = "counts" - scvi.model.PEAKVI.setup_anndata(adata, layer=layer, categorical_covariate_keys=["split", "batch"]) - model = scvi.model.PEAKVI(adata) + + print("Converting read counts to fragment counts") + scvi.data.reads_to_fragments(adata, read_layer="counts") + print(f"One counts: {(adata.layers['fragments'] == 1).sum()}, Two counts: {(adata.layers['fragments'] == 2).sum()}") + layer = "fragments" + + scvi.external.POISSONVI.setup_anndata(adata, layer=layer, categorical_covariate_keys=["split", "batch"]) + model = scvi.external.POISSONVI(adata) else: raise ValueError(f"Unknown modality: {modality}") From f8229810358c50487b477ed9d7eb98e6a1b27825 Mon Sep 17 00:00:00 2001 From: Marius Lange Date: Wed, 13 Aug 2025 10:37:32 +0200 Subject: [PATCH 4/5] Fix pca key --- src/methods/cellmapper_linear/config.vsh.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/methods/cellmapper_linear/config.vsh.yaml b/src/methods/cellmapper_linear/config.vsh.yaml index a0e0fad..2697e71 100644 --- a/src/methods/cellmapper_linear/config.vsh.yaml +++ b/src/methods/cellmapper_linear/config.vsh.yaml @@ -42,7 +42,7 @@ arguments: - name: "--fallback_representation" type: "string" choices: ["joint_pca", "fast_cca"] - default: "fast_pca" + default: "joint_pca" description: Fallback representation to use for k-NN mapping (computed if use_rep is None). - name: "--mask_var" type: "string" From adcfa6da522bba42043ac27c51a12e79ab162270 Mon Sep 17 00:00:00 2001 From: Marius Lange Date: Wed, 13 Aug 2025 10:45:19 +0200 Subject: [PATCH 5/5] Fix defaults --- src/methods/cellmapper_scvi/utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/methods/cellmapper_scvi/utils.py b/src/methods/cellmapper_scvi/utils.py index e5fbdc9..4bff783 100644 --- a/src/methods/cellmapper_scvi/utils.py +++ b/src/methods/cellmapper_scvi/utils.py @@ -11,9 +11,9 @@ def preprocess_features( adata: ad.AnnData, modality: Literal["GEX", "ADT", "ATAC"], - use_hvg: bool = True, - min_cells_fraction: float = 0.01, - feature_filter_threshold: int = 20000, + use_hvg: bool, + min_cells_fraction: float, + feature_filter_threshold: int, ) -> ad.AnnData: """ Preprocess features with optional filtering and HVG selection. @@ -81,7 +81,7 @@ def get_representation( use_hvg: bool = True, adt_normalization: Literal["clr", "log_cp10k"] = "clr", plot_umap: bool = False, - min_cells_fraction: float = 0.05, + min_cells_fraction: float = 0.01, feature_filter_threshold: int = 20000, ) -> ad.AnnData: """