diff --git a/src/methods/cellmapper_linear/config.vsh.yaml b/src/methods/cellmapper_linear/config.vsh.yaml index bb18e26..2697e71 100644 --- a/src/methods/cellmapper_linear/config.vsh.yaml +++ b/src/methods/cellmapper_linear/config.vsh.yaml @@ -42,19 +42,20 @@ arguments: - name: "--fallback_representation" type: "string" choices: ["joint_pca", "fast_cca"] - default: "fast_cca" + default: "joint_pca" description: Fallback representation to use for k-NN mapping (computed if use_rep is None). - name: "--mask_var" type: "string" + default: "hvg" description: Variable to mask for fallback representation. - name: "--kernel_method" type: "string" choices: ["hnoca", "gauss"] - default: "hnoca" + default: "gauss" description: Kernel function to compute k-NN edge weights. - name: "--n_neighbors" type: "integer" - default: 30 + default: 50 description: Number of neighbors to consider for k-NN graph construction. resources: - type: python_script diff --git a/src/methods/cellmapper_linear/script.py b/src/methods/cellmapper_linear/script.py index 8863a22..df35423 100644 --- a/src/methods/cellmapper_linear/script.py +++ b/src/methods/cellmapper_linear/script.py @@ -33,10 +33,14 @@ # copy the normalized layer to obsm for mod2 input_train_mod1.obsm["mod2"] = input_train_mod2.layers["normalized"] +# choose the kNN method based on total cell number +n_obs = input_test_mod1.n_obs + input_train_mod1.n_obs + print("Set up and prepare Cellmapper", flush=True) cmap = cm.CellMapper(query=input_test_mod1, reference=input_train_mod1) cmap.compute_neighbors( use_rep=None, + knn_method="sklearn" if n_obs < 60000 else "pynndescent", fallback_representation=par['fallback_representation'], n_neighbors=par['n_neighbors'], fallback_kwargs={"mask_var": par['mask_var']}, diff --git a/src/methods/cellmapper_scvi/config.vsh.yaml b/src/methods/cellmapper_scvi/config.vsh.yaml index 359579a..a86f650 100644 --- a/src/methods/cellmapper_scvi/config.vsh.yaml +++ b/src/methods/cellmapper_scvi/config.vsh.yaml @@ -43,15 +43,15 @@ arguments: - name: "--kernel_method" type: "string" choices: ["hnoca", "gauss"] - default: "hnoca" + default: "gauss" description: Kernel function to compute k-NN edge weights (CellMapper parameter). - name: "--n_neighbors" type: "integer" - default: 30 + default: 50 description: Number of neighbors to consider for k-NN graph construction (CellMapper parameter). - name: "--use_hvg" type: boolean - default: true + default: false description: Whether to use highly variable genes (HVG) for the mapping (Generic analysis parameter). - name: "--adt_normalization" type: "string" diff --git a/src/methods/cellmapper_scvi/script.py b/src/methods/cellmapper_scvi/script.py index 1f9e74b..b0d2aa7 100644 --- a/src/methods/cellmapper_scvi/script.py +++ b/src/methods/cellmapper_scvi/script.py @@ -11,8 +11,8 @@ 'input_train_mod2': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_multiome/swap/train_mod2.h5ad', 'input_test_mod1': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_multiome/swap/test_mod1.h5ad', 'output': 'output.h5ad', - 'n_neighbors': 30, - 'kernel_method': 'hnoca', + 'n_neighbors': 50, + 'kernel_method': 'gauss', 'use_hvg': False, 'adt_normalization': 'clr', # Normalization method for ADT data 'plot_umap': True, @@ -41,6 +41,7 @@ adata = ad.concat( [input_train_mod1, input_test_mod1], merge = "same", label="split", keys=["train", "test"] ) +adata.X = adata.layers["counts"] # Compute a latent representation using an appropriate model based on the modality print("Get latent representation", flush=True) @@ -55,10 +56,14 @@ # copy the normalized layer to obsm for mod2 input_train_mod1.obsm["mod2"] = input_train_mod2.layers["normalized"] +# choose the kNN method based on total cell number +n_obs = input_test_mod1.n_obs + input_train_mod1.n_obs + print('Setup and prepare Cellmapper', flush=True) cmap = cm.CellMapper(query=input_test_mod1, reference=input_train_mod1) cmap.compute_neighbors( use_rep="X_scvi", + knn_method="sklearn" if n_obs < 60000 else "pynndescent", n_neighbors=par['n_neighbors'], ) cmap.compute_mapping_matrix(kernel_method=par['kernel_method']) diff --git a/src/methods/cellmapper_scvi/utils.py b/src/methods/cellmapper_scvi/utils.py index f059877..4bff783 100644 --- a/src/methods/cellmapper_scvi/utils.py +++ b/src/methods/cellmapper_scvi/utils.py @@ -4,6 +4,75 @@ from scipy.sparse import issparse, csr_matrix, csc_matrix import muon import scanpy as sc +import numpy as np + + + +def preprocess_features( + adata: ad.AnnData, + modality: Literal["GEX", "ADT", "ATAC"], + use_hvg: bool, + min_cells_fraction: float, + feature_filter_threshold: int, +) -> ad.AnnData: + """ + Preprocess features with optional filtering and HVG selection. + + Parameters + ---------- + adata + AnnData object to preprocess + modality + The modality type, affects filtering strategy + use_hvg + Whether to apply HVG selection (applies to all modalities if requested) + min_cells_fraction + Minimum fraction of cells a feature must be detected in to be kept + feature_filter_threshold + Only apply feature filtering if n_vars > this threshold + + Returns + ------- + Preprocessed AnnData object + """ + print(f"Starting preprocessing for {modality} with {adata.n_obs} cells and {adata.n_vars} features") + + # Feature filtering: only apply if we have many features (mainly for ATAC) + if adata.n_vars > feature_filter_threshold: + min_cells = int(adata.n_obs * min_cells_fraction) + print(f"Applying feature filtering: removing features detected in <{min_cells} cells ({min_cells_fraction:.1%} threshold)") + + n_features_before = adata.n_vars + sc.pp.filter_genes(adata, min_cells=min_cells) + n_features_after = adata.n_vars + + print(f"Features after filtering: {n_features_after} (removed {n_features_before - n_features_after})") + else: + print(f"Skipping feature filtering (only {adata.n_vars} features, threshold is {feature_filter_threshold})") + + # HVG selection: apply if requested + if use_hvg: + n_hvg = adata.var["hvg"].sum() + print(f"Applying HVG selection: subsetting to {n_hvg} highly variable features ({n_hvg/adata.n_vars:.2%})") + adata = adata[:, adata.var["hvg"]].copy() + else: + print("HVG selection disabled, using all remaining features") + + # Critical check: ensure no cells have zero counts after filtering + cell_counts = np.array(adata.layers['counts'].sum(axis=1)).flatten() + zero_count_cells = (cell_counts == 0).sum() + + if zero_count_cells > 0: + raise ValueError( + f"After preprocessing, {zero_count_cells} cells have zero counts! " + "This would prevent generating latent representations for these cells. " + "Consider relaxing filtering parameters or checking data quality." + ) + + print(f"Final dataset: {adata.n_obs} cells × {adata.n_vars} features") + print(f"Mean counts per cell: {cell_counts.mean():.1f}") + + return adata def get_representation( @@ -12,6 +81,8 @@ def get_representation( use_hvg: bool = True, adt_normalization: Literal["clr", "log_cp10k"] = "clr", plot_umap: bool = False, + min_cells_fraction: float = 0.01, + feature_filter_threshold: int = 20000, ) -> ad.AnnData: """ Get a joint latent space representation of the data based on the modality. @@ -25,7 +96,7 @@ def get_representation( - "GEX": scVI model for gene expression data with ZINB likelihood on raw counts. - "ADT": scVI model for ADT data (surface proteins) with Gaussian likelihood on normalized data. - - "ATAC": PeakVI model for ATAC data with Bernoulli likelihood on binarized count data. + - "ATAC": PeakVI model for ATAC data with Bernoulli likelihood on count data. We assume that regardless of the modality, the raw data will be stored in the `counts` layer (e.g. UMI counts for GEX and peak counts for ATAC), and the normalized data in the `normalized` layer. @@ -37,20 +108,26 @@ def get_representation( - "log_cp10k" (normalization to 10k counts per cell and logarithm transformation) plot_umap Purely for diagnostic purposes, to see whether the data integration looks ok, this optionally computes - a UMAP in shared latent space and stores a plot. + a UMAP in shared latent space and stores a plot. + min_cells_fraction + Minimum fraction of cells a feature must be detected in to be kept during filtering. + feature_filter_threshold + Only apply feature filtering if n_vars > this threshold. This automatically separates + ATAC data (many peaks) from other modalities (fewer features). Returns ------- ad.AnnData AnnData object with the latent representation in `obsm["X_scvi"]`, regardless of the modality. """ - # Subset to highly variable features - if "hvg" in adata.var.columns and use_hvg: - n_hvg = adata.var["hvg"].sum() - print(f"Subsetting to {n_hvg} highly variable features ({n_hvg/adata.n_vars:.2%})", flush=True) - adata = adata[:, adata.var["hvg"]].copy() - else: - print("Training on all available features", flush=True) + # Preprocess features (filtering and HVG selection) + adata = preprocess_features( + adata, + modality=modality, + use_hvg=use_hvg, + min_cells_fraction=min_cells_fraction, + feature_filter_threshold=feature_filter_threshold + ) # Setup the AnnData object for scVI if modality == "GEX": @@ -73,9 +150,14 @@ def get_representation( scvi.model.SCVI.setup_anndata(adata, layer=layer, categorical_covariate_keys=["split", "batch"]) model = scvi.model.SCVI(adata, gene_likelihood="normal", n_layers=1, n_latent=10) elif modality == "ATAC": - layer = "counts" - scvi.model.PEAKVI.setup_anndata(adata, layer=layer, categorical_covariate_keys=["split", "batch"]) - model = scvi.model.PEAKVI(adata) + + print("Converting read counts to fragment counts") + scvi.data.reads_to_fragments(adata, read_layer="counts") + print(f"One counts: {(adata.layers['fragments'] == 1).sum()}, Two counts: {(adata.layers['fragments'] == 2).sum()}") + layer = "fragments" + + scvi.external.POISSONVI.setup_anndata(adata, layer=layer, categorical_covariate_keys=["split", "batch"]) + model = scvi.external.POISSONVI(adata) else: raise ValueError(f"Unknown modality: {modality}")