From 88588c17ecf722256dedbf19c9025f3650c056c8 Mon Sep 17 00:00:00 2001 From: seohyonkim Date: Tue, 5 Aug 2025 17:34:11 +0200 Subject: [PATCH 1/6] fill out config --- scripts/create_component/create_python_metric.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/create_component/create_python_metric.sh b/scripts/create_component/create_python_metric.sh index d36bc7a9..1da0b0be 100755 --- a/scripts/create_component/create_python_metric.sh +++ b/scripts/create_component/create_python_metric.sh @@ -3,6 +3,6 @@ set -e common/scripts/create_component \ - --name my_python_metric \ + --name ksim \ --language python \ --type metric From 4f08836d1d32ba0699e390e3cb6847fd1888dc7a Mon Sep 17 00:00:00 2001 From: seohyonkim Date: Tue, 5 Aug 2025 18:06:34 +0200 Subject: [PATCH 2/6] wip --- src/metrics/ksim/config.vsh.yaml | 65 ++++++++++++++++++++++++++++++++ src/metrics/ksim/script.py | 58 ++++++++++++++++++++++++++++ 2 files changed, 123 insertions(+) create mode 100644 src/metrics/ksim/config.vsh.yaml create mode 100644 src/metrics/ksim/script.py diff --git a/src/metrics/ksim/config.vsh.yaml b/src/metrics/ksim/config.vsh.yaml new file mode 100644 index 00000000..f85b3cc6 --- /dev/null +++ b/src/metrics/ksim/config.vsh.yaml @@ -0,0 +1,65 @@ +__merge__: ../../api/comp_metric.yaml +name: ksim +info: + metrics: + + - name: ksim + label: kSIM + summary: "The kSIM acceptance rate measures whether cells of the same pre-annotated cell type are still close to each other in the local neighborhoods after batch correction." + description: | + The kSIM acceptance rate requires ground truth cell type information and measures whether the neighbors of a cell have the same cell type as it does. If a method overcorrects the batch effects, it will have a low kSIM acceptance rate. We use the HNSW algorithm to find k-NNs (including the cell itself) for each cell i and denote the number of neighbors that have the same cell type as i as . 
In addition, we require at least β fraction of neighbors of cell i to have the same cell type as i in order to say cell i has a consistent neighborhood. + references: + doi: + - 10.1038/s41592-020-0905-x + links: + documentation: https://pegasus.readthedocs.io/en/stable/api/pegasus.calc_kSIM.html#pegasus.calc_kSIM + repository: https://github.com/lilab-bcb/pegasus + min: 0 + max: 1 + maximize: true + +arguments: + - name: "--rep" + type: "string" + default: "pca" + description: The embedding representation to consider. + - name: "--K" + type: "integer" + default: 24 + description: The number of nearest neighbors to be considered. + - name: "--min_rate" + type: "double" + default: 0.9 + description: Acceptance rate threshold. A cell is accepted if its kSIM rate is larger than or equal to min_rate. + - name: "--n_jobs" + type: "integer" + default: -1 + description: Number of threads used. If -1, use all physical CPU cores. + - name: "--random_state" + type: "integer" + default: 0 + description: Random seed set for reproducing results. + - name: "--use_cache" + type: "boolean" + default: True + description: If use cache results for kNN. + +resources: + - type: python_script + path: script.py + - path: /src/utils/read_anndata_partial.py + +engines: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + pypi: + - pegasuspy==1.10.2 + + +runners: + - type: executable + - type: nextflow + directives: + label: [midtime,midmem,midcpu] diff --git a/src/metrics/ksim/script.py b/src/metrics/ksim/script.py new file mode 100644 index 00000000..57d37758 --- /dev/null +++ b/src/metrics/ksim/script.py @@ -0,0 +1,58 @@ +import anndata as ad +import sys +import pegasus as pg + +## VIASH START +# Note: this section is auto-generated by viash at runtime. To edit it, make changes +# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. 
+par = { + 'input_integrated': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad', + 'input_solution': 'resources_test/.../solution.h5ad', + 'output': 'output.h5ad', + "rep": "pca", + "K": 24, + "min_rate": 0.9, + "n_jobs": -1, + "random_state": 0, + "use_cache": True +} +meta = { + 'name': 'ksim' +} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + +print('Reading input files', flush=True) +adata = read_anndata(par['input_integrated'], obs='obs', obsm='obsm', uns='uns') +adata.obs = read_anndata(par['input_solution'], obs='obs').obs +adata.uns |= read_anndata(par['input_solution'], uns='uns').uns + +print('Compute metrics', flush=True) +score = pg.calc_kSIM( + adata, + attr='cell_type', + rep=par["rep"], + K=par["K"], + min_rate=par["min_rate"], + n_jobs=par["n_jobs"], + random_state=par["random_state"], + use_cache=par["use_cache"] +)[1] + +# TODO RETURNS A TOUPLE OF TWO THINGS: kSIM_mean (float) – Mean kSIM rate over all the cells., kSIM_accept_rate (float) – kSIM Acceptance rate of the sample + +print('Create output AnnData object', flush=True) +output = ad.AnnData( + uns={ + 'dataset_id': adata.uns['dataset_id'], + 'normalization_id': adata.uns['normalization_id'], + 'method_id': adata.uns['method_id'], + 'metric_ids': [ meta['name'] ], + 'metric_values': [ score ] + } +) + +print("Write output AnnData to file", flush=True) +output.write_h5ad(par['output'], compression='gzip') From 47882e3177cd239d27a1aae3d023a1a7551caa23 Mon Sep 17 00:00:00 2001 From: seohyonkim Date: Tue, 12 Aug 2025 13:32:15 +0200 Subject: [PATCH 3/6] working --- src/metrics/ksim/config.vsh.yaml | 4 ---- src/metrics/ksim/script.py | 34 +++++++++++--------------------- 2 files changed, 11 insertions(+), 27 deletions(-) diff --git a/src/metrics/ksim/config.vsh.yaml b/src/metrics/ksim/config.vsh.yaml index f85b3cc6..4c7a9b1b 100644 --- a/src/metrics/ksim/config.vsh.yaml +++ b/src/metrics/ksim/config.vsh.yaml 
@@ -19,10 +19,6 @@ info: maximize: true arguments: - - name: "--rep" - type: "string" - default: "pca" - description: The embedding representation to consider. - name: "--K" type: "integer" default: 24 diff --git a/src/metrics/ksim/script.py b/src/metrics/ksim/script.py index 57d37758..fa8a7fe7 100644 --- a/src/metrics/ksim/script.py +++ b/src/metrics/ksim/script.py @@ -1,25 +1,9 @@ import anndata as ad import sys import pegasus as pg +import pegasusio +from scipy.sparse import csr_matrix -## VIASH START -# Note: this section is auto-generated by viash at runtime. To edit it, make changes -# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. -par = { - 'input_integrated': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad', - 'input_solution': 'resources_test/.../solution.h5ad', - 'output': 'output.h5ad', - "rep": "pca", - "K": 24, - "min_rate": 0.9, - "n_jobs": -1, - "random_state": 0, - "use_cache": True -} -meta = { - 'name': 'ksim' -} -## VIASH END sys.path.append(meta["resources_dir"]) from read_anndata_partial import read_anndata @@ -28,20 +12,24 @@ adata = read_anndata(par['input_integrated'], obs='obs', obsm='obsm', uns='uns') adata.obs = read_anndata(par['input_solution'], obs='obs').obs adata.uns |= read_anndata(par['input_solution'], uns='uns').uns +print(adata) + +print('Convert to pegasusio.MultimodalData...', flush=True) +adata.X = csr_matrix(adata.shape) +mmdata = pegasusio.MultimodalData(adata) print('Compute metrics', flush=True) score = pg.calc_kSIM( - adata, + mmdata, attr='cell_type', - rep=par["rep"], + rep='emb', K=par["K"], min_rate=par["min_rate"], n_jobs=par["n_jobs"], random_state=par["random_state"], use_cache=par["use_cache"] -)[1] - -# TODO RETURNS A TOUPLE OF TWO THINGS: kSIM_mean (float) – Mean kSIM rate over all the cells., kSIM_accept_rate (float) – kSIM Acceptance rate of the sample +) +print("score:", score) print('Create output AnnData object', flush=True) output = ad.AnnData( 
From 74bd5c54b757612255213cd1fb19cb8015d9ca89 Mon Sep 17 00:00:00 2001 From: seo <159482645+seohyonkim@users.noreply.github.com> Date: Thu, 28 Aug 2025 14:59:18 +0200 Subject: [PATCH 4/6] Update scripts/create_component/create_python_metric.sh Co-authored-by: Robrecht Cannoodt --- scripts/create_component/create_python_metric.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/create_component/create_python_metric.sh b/scripts/create_component/create_python_metric.sh index 1da0b0be..d36bc7a9 100755 --- a/scripts/create_component/create_python_metric.sh +++ b/scripts/create_component/create_python_metric.sh @@ -3,6 +3,6 @@ set -e common/scripts/create_component \ - --name ksim \ + --name my_python_metric \ --language python \ --type metric From a8814e29e020c3b8998d133035af6b5b84a23891 Mon Sep 17 00:00:00 2001 From: seohyonkim Date: Tue, 9 Sep 2025 18:00:14 +0200 Subject: [PATCH 5/6] add to changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 09d672d0..9f9e1031 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,8 @@ * Added `metrics/kbet_pg` and `metrics/kbet_pg_label` components (PR #52). +* Added `metrics/ksim` component (PR #75). + ## Minor changes * Un-pin the scPRINT version and update parameters (PR #51) From 86581248c00f87b0abfb95dcbbdce1111db00846 Mon Sep 17 00:00:00 2001 From: seohyonkim Date: Thu, 25 Sep 2025 03:20:56 +0200 Subject: [PATCH 6/6] rephrase metric infos --- src/metrics/ksim/config.vsh.yaml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/metrics/ksim/config.vsh.yaml b/src/metrics/ksim/config.vsh.yaml index 4c7a9b1b..b9ea44a4 100644 --- a/src/metrics/ksim/config.vsh.yaml +++ b/src/metrics/ksim/config.vsh.yaml @@ -5,9 +5,11 @@ info: - name: ksim label: kSIM - summary: "The kSIM acceptance rate measures whether cells of the same pre-annotated cell type are still close to each other in the local neighborhoods after batch correction." 
+ summary: "The kSIM acceptance rate evaluates whether cells of the same known cell type remain grouped together in their neighborhoods after batch correction." description: | - The kSIM acceptance rate requires ground truth cell type information and measures whether the neighbors of a cell have the same cell type as it does. If a method overcorrects the batch effects, it will have a low kSIM acceptance rate. We use the HNSW algorithm to find k-NNs (including the cell itself) for each cell i and denote the number of neighbors that have the same cell type as i as . In addition, we require at least β fraction of neighbors of cell i to have the same cell type as i in order to say cell i has a consistent neighborhood. + The kSIM acceptance rate uses prior knowledge of cell type labels to assess local neighborhood consistency. For each cell, we look at its nearest neighbors (including itself) and check how many share the same cell type. + A cell is considered to have a consistent neighborhood if at least a fraction `min_rate` (default 0.9) of its neighbors share its cell type. The acceptance rate is the overall fraction of such cells in the dataset. + A high kSIM value means that cells of the same type remain locally clustered after correction, while a low value suggests that the correction has disrupted true biological structure—for example, by overcorrecting batch effects. references: doi: - 10.1038/s41592-020-0905-x