openproblems-bio · mumichae · Aug 5, 2025 · Aug 5, 2025 · Aug 12, 2025 · Aug 28, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,8 @@
 
 * Added `metrics/kbet_pg` and `metrics/kbet_pg_label` components (PR #52).
 
+* Added `metircs/ksim` component (PR #75).
-* Added `metircs/ksim` component (PR #75).
+* Added `metrics/ksim` component (PR #75).
-* Added `metircs/ksim` component (PR #75).
+* Added `metrics/ksim` component (PR #75).
+
 ## Minor changes
 
 * Un-pin the scPRINT version and update parameters (PR #51)

diff --git a/src/metrics/ksim/config.vsh.yaml b/src/metrics/ksim/config.vsh.yaml
@@ -0,0 +1,61 @@
+__merge__: ../../api/comp_metric.yaml
+name: ksim
+info:
+  metrics:
+
+    - name: ksim
+      label: kSIM
+      summary: "The kSIM acceptance rate measures whether cells of the same pre-annotated cell type are still close to each other in the local neighborhoods after batch correction."
+      description: |
+        The kSIM acceptance rate requires ground truth cell type information and measures whether the neighbors of a cell have the same cell type as it does. If a method overcorrects the batch effects, it will have a low kSIM acceptance rate. We use the HNSW algorithm to find k-NNs (including the cell itself) for each cell i and denote the number of neighbors that have the same cell type as i as n_i. In addition, we require at least β fraction of neighbors of cell i to have the same cell type as i (that is, n_i / k ≥ β) in order to say cell i has a consistent neighborhood. The kSIM acceptance rate is the fraction of cells that have a consistent neighborhood.
+      references:
+        doi: 
+          - 10.1038/s41592-020-0905-x
+      links:
+        documentation: https://pegasus.readthedocs.io/en/stable/api/pegasus.calc_kSIM.html#pegasus.calc_kSIM
+        repository: https://github.com/lilab-bcb/pegasus
+      min: 0
+      max: 1
+      maximize: true
+
+arguments:
+  - name: "--K"
+    type: "integer"
+    default: 24
+    description: The number of nearest neighbors to be considered.
+  - name: "--min_rate"
+    type: "double"
+    default: 0.9
+    description: Acceptance rate threshold. A cell is accepted if its kSIM rate is larger than or equal to min_rate.
+  - name: "--n_jobs"
+    type: "integer"
+    default: -1
+    description: Number of threads used. If -1, use all physical CPU cores.
+  - name: "--random_state"
+    type: "integer"
+    default: 0
+    description: Random seed set for reproducing results.
+  - name: "--use_cache"
+    type: "boolean"
+    default: True
+    description: Whether to use cached kNN results.
+
+resources:
+  - type: python_script
+    path: script.py
+  - path: /src/utils/read_anndata_partial.py
+
+engines:
+  - type: docker
+    image: openproblems/base_python:1.0.0
+    setup:
+        - type: python
+          pypi:
+          - pegasuspy==1.10.2
+
+
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [midtime,midmem,midcpu]
diff --git a/src/metrics/ksim/script.py b/src/metrics/ksim/script.py
@@ -0,0 +1,46 @@
+import anndata as ad
+import sys
+import pegasus as pg
+import pegasusio
+from scipy.sparse import csr_matrix
+
+
+sys.path.append(meta["resources_dir"])
+from read_anndata_partial import read_anndata
+
+print('Reading input files', flush=True)
+adata = read_anndata(par['input_integrated'], obs='obs', obsm='obsm', uns='uns')
+adata.obs = read_anndata(par['input_solution'], obs='obs').obs
+adata.uns |= read_anndata(par['input_solution'], uns='uns').uns
+print(adata)
+
+print('Convert to pegasusio.MultimodalData...', flush=True)
+adata.X = csr_matrix(adata.shape)
+mmdata = pegasusio.MultimodalData(adata)
+
+print('Compute metrics', flush=True)
+score = pg.calc_kSIM(
+    mmdata,
+    attr='cell_type',
+    rep='emb',
+    K=par["K"],
+    min_rate=par["min_rate"],
+    n_jobs=par["n_jobs"],
+    random_state=par["random_state"],
+    use_cache=par["use_cache"]
+)
+print("score:", score)
+
+print('Create output AnnData object', flush=True)
+output = ad.AnnData(
+    uns={
+        'dataset_id': adata.uns['dataset_id'],
+        'normalization_id': adata.uns['normalization_id'],
+        'method_id': adata.uns['method_id'],
+        'metric_ids': [ meta['name'] ],
+        'metric_values': [ score ]
+    }
+)
+
+print("Write output AnnData to file", flush=True)
+output.write_h5ad(par['output'], compression='gzip')