From 8052637fa745d9cb9f5618d7140a1f46b422b413 Mon Sep 17 00:00:00 2001
From: Marius Lange <marius.lange@bsse.ethz.ch>
Date: Wed, 13 Aug 2025 08:37:06 +0200
Subject: [PATCH 1/5] Fix CellMapper linear NaNs

---
 src/methods/cellmapper_linear/config.vsh.yaml | 7 ++++---
 src/methods/cellmapper_linear/script.py       | 7 ++++++-
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/methods/cellmapper_linear/config.vsh.yaml b/src/methods/cellmapper_linear/config.vsh.yaml
index bb18e26..a0e0fad 100644
--- a/src/methods/cellmapper_linear/config.vsh.yaml
+++ b/src/methods/cellmapper_linear/config.vsh.yaml
@@ -42,19 +42,20 @@ arguments:
   - name: "--fallback_representation"
     type: "string"
     choices: ["joint_pca", "fast_cca"]
-    default: "fast_cca"
+    default: "fast_pca"
     description: Fallback representation to use for k-NN mapping (computed if use_rep is None).
   - name: "--mask_var"
     type: "string"
+    default: "hvg"
     description: Variable to mask for fallback representation.
   - name: "--kernel_method"
     type: "string"
     choices: ["hnoca", "gauss"]
-    default: "hnoca"
+    default: "gauss"
     description: Kernel function to compute k-NN edge weights. 
   - name: "--n_neighbors"
     type: "integer"
-    default: 30
+    default: 50
     description: Number of neighbors to consider for k-NN graph construction.
 resources:
   - type: python_script
diff --git a/src/methods/cellmapper_linear/script.py b/src/methods/cellmapper_linear/script.py
index 8863a22..2e10f47 100644
--- a/src/methods/cellmapper_linear/script.py
+++ b/src/methods/cellmapper_linear/script.py
@@ -33,10 +33,15 @@
 # copy the normalized layer to obsm for mod2
 input_train_mod1.obsm["mod2"] = input_train_mod2.layers["normalized"] 
 
+# choose the kNN method based on total cell number
+n_obs = input_test_mod1.n_obs + input_train_mod1.n_obs
+knn_method = "sklearn" if n_obs < 60000 else "pynndescent"
+
 print("Set up and prepare Cellmapper", flush=True)
 cmap = cm.CellMapper(query=input_test_mod1, reference=input_train_mod1)
 cmap.compute_neighbors(
-    use_rep=None,
+    use_rep=None, 
+    knn_method=knn_method,
     fallback_representation=par['fallback_representation'],
     n_neighbors=par['n_neighbors'],
     fallback_kwargs={"mask_var": par['mask_var']},

From 651b2ac868b5070ce849e745daa9dc3f29893c8a Mon Sep 17 00:00:00 2001
From: Marius Lange <marius.lange@bsse.ethz.ch>
Date: Wed, 13 Aug 2025 10:20:45 +0200
Subject: [PATCH 2/5] Update cellmaper_scvi

---
 src/methods/cellmapper_linear/script.py     |  5 +-
 src/methods/cellmapper_scvi/config.vsh.yaml |  4 +-
 src/methods/cellmapper_scvi/script.py       | 11 ++-
 src/methods/cellmapper_scvi/utils.py        | 95 +++++++++++++++++++--
 4 files changed, 98 insertions(+), 17 deletions(-)

diff --git a/src/methods/cellmapper_linear/script.py b/src/methods/cellmapper_linear/script.py
index 2e10f47..df35423 100644
--- a/src/methods/cellmapper_linear/script.py
+++ b/src/methods/cellmapper_linear/script.py
@@ -35,13 +35,12 @@
 
 # choose the kNN method based on total cell number
 n_obs = input_test_mod1.n_obs + input_train_mod1.n_obs
-knn_method = "sklearn" if n_obs < 60000 else "pynndescent"
 
 print("Set up and prepare Cellmapper", flush=True)
 cmap = cm.CellMapper(query=input_test_mod1, reference=input_train_mod1)
 cmap.compute_neighbors(
-    use_rep=None, 
-    knn_method=knn_method,
+    use_rep=None,
+    knn_method="sklearn" if n_obs < 60000 else "pynndescent",
     fallback_representation=par['fallback_representation'],
     n_neighbors=par['n_neighbors'],
     fallback_kwargs={"mask_var": par['mask_var']},
diff --git a/src/methods/cellmapper_scvi/config.vsh.yaml b/src/methods/cellmapper_scvi/config.vsh.yaml
index 359579a..613c8e6 100644
--- a/src/methods/cellmapper_scvi/config.vsh.yaml
+++ b/src/methods/cellmapper_scvi/config.vsh.yaml
@@ -43,11 +43,11 @@ arguments:
   - name: "--kernel_method"
     type: "string"
     choices: ["hnoca", "gauss"]
-    default: "hnoca"
+    default: "gauss"
     description: Kernel function to compute k-NN edge weights (CellMapper parameter). 
   - name: "--n_neighbors"
     type: "integer"
-    default: 30
+    default: 50
     description: Number of neighbors to consider for k-NN graph construction (CellMapper parameter).
   - name: "--use_hvg"
     type: boolean
diff --git a/src/methods/cellmapper_scvi/script.py b/src/methods/cellmapper_scvi/script.py
index 1f9e74b..702b97b 100644
--- a/src/methods/cellmapper_scvi/script.py
+++ b/src/methods/cellmapper_scvi/script.py
@@ -11,9 +11,9 @@
     'input_train_mod2': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_multiome/swap/train_mod2.h5ad',
     'input_test_mod1': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_multiome/swap/test_mod1.h5ad',
     'output': 'output.h5ad',
-    'n_neighbors': 30, 
-    'kernel_method': 'hnoca',
-    'use_hvg': False,
+    'n_neighbors': 50,
+    'kernel_method': 'gauss',
+    'use_hvg': True,
     'adt_normalization': 'clr',  # Normalization method for ADT data
     'plot_umap': True,
 
@@ -41,6 +41,7 @@
 adata = ad.concat(
     [input_train_mod1, input_test_mod1], merge = "same", label="split", keys=["train", "test"]
     )
+adata.X = adata.layers["counts"]
 
 # Compute a latent representation using an appropriate model based on the modality
 print("Get latent representation", flush=True)
@@ -55,10 +56,14 @@
 # copy the normalized layer to obsm for mod2
 input_train_mod1.obsm["mod2"] = input_train_mod2.layers["normalized"] 
 
+# choose the kNN method based on total cell number
+n_obs = input_test_mod1.n_obs + input_train_mod1.n_obs
+
 print('Setup and prepare Cellmapper', flush=True)
 cmap = cm.CellMapper(query=input_test_mod1, reference=input_train_mod1)
 cmap.compute_neighbors(
     use_rep="X_scvi",
+    knn_method="sklearn" if n_obs < 60000 else "pynndescent",
     n_neighbors=par['n_neighbors'], 
     )
 cmap.compute_mapping_matrix(kernel_method=par['kernel_method'])
diff --git a/src/methods/cellmapper_scvi/utils.py b/src/methods/cellmapper_scvi/utils.py
index f059877..c2a428d 100644
--- a/src/methods/cellmapper_scvi/utils.py
+++ b/src/methods/cellmapper_scvi/utils.py
@@ -4,6 +4,75 @@
 from scipy.sparse import issparse, csr_matrix, csc_matrix
 import muon
 import scanpy as sc
+import numpy as np
+
+
+
+def preprocess_features(
+    adata: ad.AnnData,
+    modality: Literal["GEX", "ADT", "ATAC"],
+    use_hvg: bool = True,
+    min_cells_fraction: float = 0.05,
+    feature_filter_threshold: int = 20000,
+) -> ad.AnnData:
+    """
+    Preprocess features with optional filtering and HVG selection.
+    
+    Parameters
+    ----------
+    adata
+        AnnData object to preprocess
+    modality
+        The modality type, affects filtering strategy
+    use_hvg
+        Whether to apply HVG selection (applies to all modalities if requested)
+    min_cells_fraction
+        Minimum fraction of cells a feature must be detected in to be kept
+    feature_filter_threshold
+        Only apply feature filtering if n_vars > this threshold
+        
+    Returns
+    -------
+    Preprocessed AnnData object
+    """
+    print(f"Starting preprocessing for {modality} with {adata.n_obs} cells and {adata.n_vars} features")
+    
+    # Feature filtering: only apply if we have many features (mainly for ATAC)
+    if adata.n_vars > feature_filter_threshold:
+        min_cells = int(adata.n_obs * min_cells_fraction)
+        print(f"Applying feature filtering: removing features detected in <{min_cells} cells ({min_cells_fraction:.1%} threshold)")
+        
+        n_features_before = adata.n_vars
+        sc.pp.filter_genes(adata, min_cells=min_cells)
+        n_features_after = adata.n_vars
+        
+        print(f"Features after filtering: {n_features_after} (removed {n_features_before - n_features_after})")
+    else:
+        print(f"Skipping feature filtering (only {adata.n_vars} features, threshold is {feature_filter_threshold})")
+    
+    # HVG selection: apply if requested
+    if use_hvg:
+        n_hvg = adata.var["hvg"].sum()
+        print(f"Applying HVG selection: subsetting to {n_hvg} highly variable features ({n_hvg/adata.n_vars:.2%})")
+        adata = adata[:, adata.var["hvg"]].copy()
+    else:
+        print("HVG selection disabled, using all remaining features")
+    
+    # Critical check: ensure no cells have zero counts after filtering
+    cell_counts = np.array(adata.layers['counts'].sum(axis=1)).flatten()
+    zero_count_cells = (cell_counts == 0).sum()
+    
+    if zero_count_cells > 0:
+        raise ValueError(
+            f"After preprocessing, {zero_count_cells} cells have zero counts! "
+            "This would prevent generating latent representations for these cells. "
+            "Consider relaxing filtering parameters or checking data quality."
+        )
+    
+    print(f"Final dataset: {adata.n_obs} cells × {adata.n_vars} features")
+    print(f"Mean counts per cell: {cell_counts.mean():.1f}")
+    
+    return adata
 
 
 def get_representation(
@@ -12,6 +81,8 @@ def get_representation(
         use_hvg: bool = True, 
         adt_normalization: Literal["clr", "log_cp10k"] = "clr",
         plot_umap: bool = False,
+        min_cells_fraction: float = 0.05,
+        feature_filter_threshold: int = 20000,
     ) -> ad.AnnData:
     """
     Get a joint latent space representation of the data based on the modality.
@@ -25,7 +96,7 @@ def get_representation(
 
         - "GEX": scVI model for gene expression data with ZINB likelihood on raw counts. 
         - "ADT": scVI model for ADT data (surface proteins) with Gaussian likelihood on normalized data.
-        - "ATAC": PeakVI model for ATAC data with Bernoulli likelihood on binarized count data. 
+        - "ATAC": PeakVI model for ATAC data with Bernoulli likelihood on count data. 
 
         We assume that regardless of the modality, the raw data will be stored in the `counts` layer 
         (e.g. UMI counts for GEX and peak counts for ATAC), and the normalized data in the `normalized` layer.
@@ -37,20 +108,26 @@ def get_representation(
          - "log_cp10k" (normalization to 10k counts per cell and logarithm transformation)
     plot_umap
         Purely for diagnostic purposes, to see whether the data integration looks ok, this optionally computes 
-        a UMAP in shared latent space and stores a plot. 
+        a UMAP in shared latent space and stores a plot.
+    min_cells_fraction
+        Minimum fraction of cells a feature must be detected in to be kept during filtering.
+    feature_filter_threshold
+        Only apply feature filtering if n_vars > this threshold. This automatically separates
+        ATAC data (many peaks) from other modalities (fewer features).
 
     Returns
     -------
     ad.AnnData
         AnnData object with the latent representation in `obsm["X_scvi"]`, regardless of the modality.
     """
-    # Subset to highly variable features
-    if "hvg" in adata.var.columns and use_hvg:
-        n_hvg = adata.var["hvg"].sum()
-        print(f"Subsetting to {n_hvg} highly variable features ({n_hvg/adata.n_vars:.2%})", flush=True)
-        adata = adata[:, adata.var["hvg"]].copy()
-    else:
-        print("Training on all available features", flush=True)
+    # Preprocess features (filtering and HVG selection)
+    adata = preprocess_features(
+        adata, 
+        modality=modality, 
+        use_hvg=use_hvg,
+        min_cells_fraction=min_cells_fraction,
+        feature_filter_threshold=feature_filter_threshold
+    )
 
     # Setup the AnnData object for scVI
     if modality == "GEX":

From d63f5cf6451b17f6ca3786adae3146b8b3d670f7 Mon Sep 17 00:00:00 2001
From: Marius Lange <marius.lange@bsse.ethz.ch>
Date: Wed, 13 Aug 2025 10:31:13 +0200
Subject: [PATCH 3/5] Disable hvg for scvi-based methods

---
 src/methods/cellmapper_scvi/config.vsh.yaml |  2 +-
 src/methods/cellmapper_scvi/script.py       |  2 +-
 src/methods/cellmapper_scvi/utils.py        | 13 +++++++++----
 3 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/src/methods/cellmapper_scvi/config.vsh.yaml b/src/methods/cellmapper_scvi/config.vsh.yaml
index 613c8e6..a86f650 100644
--- a/src/methods/cellmapper_scvi/config.vsh.yaml
+++ b/src/methods/cellmapper_scvi/config.vsh.yaml
@@ -51,7 +51,7 @@ arguments:
     description: Number of neighbors to consider for k-NN graph construction (CellMapper parameter).
   - name: "--use_hvg"
     type: boolean
-    default: true
+    default: false
     description: Whether to use highly variable genes (HVG) for the mapping (Generic analysis parameter).
   - name: "--adt_normalization"
     type: "string"
diff --git a/src/methods/cellmapper_scvi/script.py b/src/methods/cellmapper_scvi/script.py
index 702b97b..b0d2aa7 100644
--- a/src/methods/cellmapper_scvi/script.py
+++ b/src/methods/cellmapper_scvi/script.py
@@ -13,7 +13,7 @@
     'output': 'output.h5ad',
     'n_neighbors': 50,
     'kernel_method': 'gauss',
-    'use_hvg': True,
+    'use_hvg': False,
     'adt_normalization': 'clr',  # Normalization method for ADT data
     'plot_umap': True,
 
diff --git a/src/methods/cellmapper_scvi/utils.py b/src/methods/cellmapper_scvi/utils.py
index c2a428d..e5fbdc9 100644
--- a/src/methods/cellmapper_scvi/utils.py
+++ b/src/methods/cellmapper_scvi/utils.py
@@ -12,7 +12,7 @@ def preprocess_features(
     adata: ad.AnnData,
     modality: Literal["GEX", "ADT", "ATAC"],
     use_hvg: bool = True,
-    min_cells_fraction: float = 0.05,
+    min_cells_fraction: float = 0.01,
     feature_filter_threshold: int = 20000,
 ) -> ad.AnnData:
     """
@@ -150,9 +150,14 @@ def get_representation(
         scvi.model.SCVI.setup_anndata(adata, layer=layer, categorical_covariate_keys=["split", "batch"])
         model = scvi.model.SCVI(adata, gene_likelihood="normal", n_layers=1, n_latent=10)
     elif modality == "ATAC":
-        layer = "counts"
-        scvi.model.PEAKVI.setup_anndata(adata, layer=layer, categorical_covariate_keys=["split", "batch"])
-        model = scvi.model.PEAKVI(adata)
+
+        print("Converting read counts to fragment counts")
+        scvi.data.reads_to_fragments(adata, read_layer="counts")
+        print(f"One counts: {(adata.layers['fragments'] == 1).sum()}, Two counts: {(adata.layers['fragments'] == 2).sum()}")
+        layer = "fragments"
+
+        scvi.external.POISSONVI.setup_anndata(adata, layer=layer, categorical_covariate_keys=["split", "batch"])
+        model = scvi.external.POISSONVI(adata)
     else:
         raise ValueError(f"Unknown modality: {modality}")
     

From f8229810358c50487b477ed9d7eb98e6a1b27825 Mon Sep 17 00:00:00 2001
From: Marius Lange <marius.lange@bsse.ethz.ch>
Date: Wed, 13 Aug 2025 10:37:32 +0200
Subject: [PATCH 4/5] Fix pca key

---
 src/methods/cellmapper_linear/config.vsh.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/methods/cellmapper_linear/config.vsh.yaml b/src/methods/cellmapper_linear/config.vsh.yaml
index a0e0fad..2697e71 100644
--- a/src/methods/cellmapper_linear/config.vsh.yaml
+++ b/src/methods/cellmapper_linear/config.vsh.yaml
@@ -42,7 +42,7 @@ arguments:
   - name: "--fallback_representation"
     type: "string"
     choices: ["joint_pca", "fast_cca"]
-    default: "fast_pca"
+    default: "joint_pca"
     description: Fallback representation to use for k-NN mapping (computed if use_rep is None).
   - name: "--mask_var"
     type: "string"

From adcfa6da522bba42043ac27c51a12e79ab162270 Mon Sep 17 00:00:00 2001
From: Marius Lange <marius.lange@bsse.ethz.ch>
Date: Wed, 13 Aug 2025 10:45:19 +0200
Subject: [PATCH 5/5] Fix defaults

---
 src/methods/cellmapper_scvi/utils.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/methods/cellmapper_scvi/utils.py b/src/methods/cellmapper_scvi/utils.py
index e5fbdc9..4bff783 100644
--- a/src/methods/cellmapper_scvi/utils.py
+++ b/src/methods/cellmapper_scvi/utils.py
@@ -11,9 +11,9 @@
 def preprocess_features(
     adata: ad.AnnData,
     modality: Literal["GEX", "ADT", "ATAC"],
-    use_hvg: bool = True,
-    min_cells_fraction: float = 0.01,
-    feature_filter_threshold: int = 20000,
+    use_hvg: bool,
+    min_cells_fraction: float,
+    feature_filter_threshold: int,
 ) -> ad.AnnData:
     """
     Preprocess features with optional filtering and HVG selection.
@@ -81,7 +81,7 @@ def get_representation(
         use_hvg: bool = True, 
         adt_normalization: Literal["clr", "log_cp10k"] = "clr",
         plot_umap: bool = False,
-        min_cells_fraction: float = 0.05,
+        min_cells_fraction: float = 0.01,
         feature_filter_threshold: int = 20000,
     ) -> ad.AnnData:
     """