From 827cef0b9ce4299817c4736f47072080c811de3b Mon Sep 17 00:00:00 2001 From: JGarnica22 Date: Wed, 16 Apr 2025 12:21:02 +0200 Subject: [PATCH 01/10] add method semi-supervised STACAS (ssSTACAS) --- src/methods/ss_stacas/config.vsh.yaml | 81 +++++++++++++++++++++++++++ src/methods/ss_stacas/script.R | 56 ++++++++++++++++++ 2 files changed, 137 insertions(+) create mode 100644 src/methods/ss_stacas/config.vsh.yaml create mode 100644 src/methods/ss_stacas/script.R diff --git a/src/methods/ss_stacas/config.vsh.yaml b/src/methods/ss_stacas/config.vsh.yaml new file mode 100644 index 00000000..8823bdf2 --- /dev/null +++ b/src/methods/ss_stacas/config.vsh.yaml @@ -0,0 +1,81 @@ +# The API specifies which type of component this is. +# It contains specifications for: +# - The input/output files +# - Common parameters +# - A unit test +__merge__: ../../api/comp_method.yaml + +# A unique identifier for your component (required). +# Can contain only lowercase letters or underscores. +name: ss_stacas +# A relatively short label, used when rendering visualisations (required) +label: ssSTACAS +# A one sentence summary of how this method works (required). Used when +# rendering summary tables. +summary: Accurate semi-supervised integration of single-cell transcriptomics data +# A multi-line description of how this component works (required). Used +# when rendering reference documentation. +description: | + STACAS is a method for scRNA-seq integration, + especially suited to accurately integrate datasets with large cell type imbalance + (e.g. in terms of proportions of distinct cell populations). + Prior cell type knowledge, given as cell type labels, can be provided to the algorithm to perform + semi-supervised integration, leading to increased preservation of biological variability + in the resulting integrated space. + STACAS is robust to incomplete cell type labels and can be applied to large-scale integration tasks. 
+references: + doi: 10.1038/s41467-024-45240-z + # Andreatta M, Hérault L, Gueguen P, Gfeller D, Berenstein AJ, Carmona SJ. + # Semi-supervised integration of single-cell transcriptomics data. + # Nature Communications*. 2024;15(1):1-13. doi:10.1038/s41467-024-45240-z +links: + # URL to the documentation for this method (required). + documentation: https://carmonalab.github.io/STACAS.demo/STACAS.demo.html + # URL to the code repository for this method (required). + repository: https://github.com/carmonalab/STACAS +# Metadata for your component +info: + # Which normalisation method this component prefers to use (required). + preferred_normalization: log_cp10k + +# Component-specific parameters (optional) +# arguments: +# - name: "--n_neighbors" +# type: "integer" +# default: 5 +# description: Number of neighbors to use. + +# Resources required to run the component +resources: + # The script of your component (required) + - type: r_script + path: script.R + # Additional resources your script needs (optional) + # - type: file + # path: weights.pt + +engines: + # Specifications for the Docker image for this component. + - type: docker + image: openproblems/base_r:1.0.0 + # Add custom dependencies here (optional). For more information, see + # https://viash.io/reference/config/engines/docker/#setup . + setup: + - type: r + #github: https://github.com/carmonalab/STACAS.git@2.2.0 + cran: + - Seurat + - SeuratObject + - R.utils + bioc: + - BiocNeighbors + - BiocParallel + script: remotes::install_github("carmonalab/STACAS@2.2.0", dependencies = FALSE) + +runners: + # This platform allows running the component natively + - type: executable + # Allows turning the component into a Nextflow module / pipeline. 
+ - type: nextflow + directives: + label: [midtime,midmem,midcpu] diff --git a/src/methods/ss_stacas/script.R b/src/methods/ss_stacas/script.R new file mode 100644 index 00000000..397c27be --- /dev/null +++ b/src/methods/ss_stacas/script.R @@ -0,0 +1,56 @@ +requireNamespace("anndata", quietly = TRUE) +suppressPackageStartupMessages({ + library(STACAS) + library(Matrix) + library(SeuratObject) + library(Seurat) +}) + +## VIASH START +par <- list( + input = "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad", + output = "output.h5ad" +) +meta <- list( + name = "ss_stacas" +) +## VIASH END + +cat("Reading input file\n") +adata <- anndata::read_h5ad(par[["input"]]) + +cat("Create Seurat object\n") +# Transpose because Seurat expects genes in rows, cells in columns +counts_r <- Matrix::t(adata$layers[["counts"]]) +normalized_r <- Matrix::t(adata$layers[["normalized"]]) +# Convert to a regular sparse matrix first and then to dgCMatrix +counts_c <- as(as(counts_r, "CsparseMatrix"), "dgCMatrix") +normalized_c <- as(as(normalized_r, "CsparseMatrix"), "dgCMatrix") + +# Create Seurat object with raw counts, these are needed to compute Variable Genes +seurat_obj <- Seurat::CreateSeuratObject(counts = counts_c, + meta.data = adata$obs) +# Manually assign pre-normalized values to the "data" slot +seurat_obj@assays$RNA$data <- normalized_c + +cat("Run STACAS\n") +object_integrated <- seurat_obj |> + Seurat::SplitObject(split.by = "batch") |> + STACAS::Run.STACAS(cell.labels = "cell_type") + +cat("Store outputs\n") +output <- anndata::AnnData( + uns = list( + dataset_id = adata$uns[["dataset_id"]], + normalization_id = adata$uns[["normalization_id"]], + method_id = meta$name + ), + obs = adata$obs, + var = adata$var, + obsm = list( + X_emb = object_integrated@reductions$pca@cell.embeddings + ) +) + +cat("Write output AnnData to file\n") +output$write_h5ad(par[["output"]], compression = "gzip") From cfa41e5dd99f9be4c5499ae84b4a4c4a9ca59b4e Mon Sep 17 00:00:00 
2001 From: JGarnica22 Date: Wed, 16 Apr 2025 12:54:56 +0200 Subject: [PATCH 02/10] update changelog --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 09d672d0..69682b64 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # task_batch_integration devel +## New functionality +* Add `methods/ss_stacas` new method. +Add semi-supervised version of STACAS tool for integration of single-cell transcriptomics data. +This functionality leverages partial or imperfect knowledge of cell identity to improve integration quality by preserving biological variation while correcting for batch effects. + ## New functionality * Added `metrics/kbet_pg` and `metrics/kbet_pg_label` components (PR #52). From dee1e1cf6ffcf210fe717ce11fc596bacdcca7bf Mon Sep 17 00:00:00 2001 From: Josep Garnica <61703467+JGarnica22@users.noreply.github.com> Date: Wed, 17 Sep 2025 16:17:12 +0200 Subject: [PATCH 03/10] Update: base_r container MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Michaela Müller <51025211+mumichae@users.noreply.github.com> --- src/methods/ss_stacas/config.vsh.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/methods/ss_stacas/config.vsh.yaml b/src/methods/ss_stacas/config.vsh.yaml index 8823bdf2..1b84842e 100644 --- a/src/methods/ss_stacas/config.vsh.yaml +++ b/src/methods/ss_stacas/config.vsh.yaml @@ -57,7 +57,7 @@ resources: engines: # Specifications for the Docker image for this component. - type: docker - image: openproblems/base_r:1.0.0 + image: openproblems/base_r:1 # Add custom dependencies here (optional). For more information, see # https://viash.io/reference/config/engines/docker/#setup . 
setup: From aaacd549fcbb2574f354b567b1168b44fcc4ef7f Mon Sep 17 00:00:00 2001 From: Josep Garnica <61703467+JGarnica22@users.noreply.github.com> Date: Thu, 18 Sep 2025 10:21:29 +0200 Subject: [PATCH 04/10] add: method_types configuration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Michaela Müller <51025211+mumichae@users.noreply.github.com> --- src/methods/ss_stacas/config.vsh.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/src/methods/ss_stacas/config.vsh.yaml b/src/methods/ss_stacas/config.vsh.yaml index 1b84842e..493ad50e 100644 --- a/src/methods/ss_stacas/config.vsh.yaml +++ b/src/methods/ss_stacas/config.vsh.yaml @@ -37,6 +37,7 @@ links: info: # Which normalisation method this component prefers to use (required). preferred_normalization: log_cp10k + method_types: [embedding] # Component-specific parameters (optional) # arguments: From 1291bf8ff7e522fd0e1d9237af868620cf95542b Mon Sep 17 00:00:00 2001 From: JGarnica22 Date: Thu, 18 Sep 2025 10:26:31 +0200 Subject: [PATCH 05/10] fix: move ssSTACAS below kBET --- CHANGELOG.md | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 69682b64..1b89404f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,13 +1,10 @@ # task_batch_integration devel -## New functionality -* Add `methods/ss_stacas` new method. -Add semi-supervised version of STACAS tool for integration of single-cell transcriptomics data. -This functionality leverages partial or imperfect knowledge of cell identity to improve integration quality by preserving biological variation while correcting for batch effects. - ## New functionality * Added `metrics/kbet_pg` and `metrics/kbet_pg_label` components (PR #52). +* Add `methods/ss_stacas` new method. + - Add semi-supervised version of STACAS tool for integration of single-cell transcriptomics data. 
This functionality leverages partial or imperfect knowledge of cell identity to improve integration quality by preserving biological variation while correcting for batch effects. ## Minor changes From 4a40ddcfd7665440b6d43f1d9b14ea5bde23f7d6 Mon Sep 17 00:00:00 2001 From: JGarnica22 Date: Thu, 18 Sep 2025 10:48:43 +0200 Subject: [PATCH 06/10] add: PR # --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1b89404f..c4caf002 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ ## New functionality * Added `metrics/kbet_pg` and `metrics/kbet_pg_label` components (PR #52). -* Add `methods/ss_stacas` new method. +* Add `methods/ss_stacas` new method (PR #59). - Add semi-supervised version of STACAS tool for integration of single-cell transcriptomics data. This functionality leverages partial or imperfect knowledge of cell identity to improve integration quality by preserving biological variation while correcting for batch effects. ## Minor changes From 340de8f978db31bcfd9004ab56827d4efb7b56c2 Mon Sep 17 00:00:00 2001 From: JGarnica22 Date: Thu, 18 Sep 2025 10:54:14 +0200 Subject: [PATCH 07/10] fix: remove boilerplate comments --- src/methods/ss_stacas/config.vsh.yaml | 37 --------------------------- 1 file changed, 37 deletions(-) diff --git a/src/methods/ss_stacas/config.vsh.yaml b/src/methods/ss_stacas/config.vsh.yaml index 493ad50e..0b4afe11 100644 --- a/src/methods/ss_stacas/config.vsh.yaml +++ b/src/methods/ss_stacas/config.vsh.yaml @@ -1,20 +1,7 @@ -# The API specifies which type of component this is. -# It contains specifications for: -# - The input/output files -# - Common parameters -# - A unit test __merge__: ../../api/comp_method.yaml - -# A unique identifier for your component (required). -# Can contain only lowercase letters or underscores. 
name: ss_stacas -# A relatively short label, used when rendering visualisations (required) label: ssSTACAS -# A one sentence summary of how this method works (required). Used when -# rendering summary tables. summary: Accurate semi-supervised integration of single-cell transcriptomics data -# A multi-line description of how this component works (required). Used -# when rendering reference documentation. description: | STACAS is a method for scRNA-seq integration, especially suited to accurately integrate datasets with large cell type imbalance @@ -29,38 +16,17 @@ references: # Semi-supervised integration of single-cell transcriptomics data. # Nature Communications*. 2024;15(1):1-13. doi:10.1038/s41467-024-45240-z links: - # URL to the documentation for this method (required). documentation: https://carmonalab.github.io/STACAS.demo/STACAS.demo.html - # URL to the code repository for this method (required). repository: https://github.com/carmonalab/STACAS -# Metadata for your component info: - # Which normalisation method this component prefers to use (required). preferred_normalization: log_cp10k method_types: [embedding] - -# Component-specific parameters (optional) -# arguments: -# - name: "--n_neighbors" -# type: "integer" -# default: 5 -# description: Number of neighbors to use. - -# Resources required to run the component resources: - # The script of your component (required) - type: r_script path: script.R - # Additional resources your script needs (optional) - # - type: file - # path: weights.pt - engines: - # Specifications for the Docker image for this component. - type: docker image: openproblems/base_r:1 - # Add custom dependencies here (optional). For more information, see - # https://viash.io/reference/config/engines/docker/#setup . 
setup: - type: r #github: https://github.com/carmonalab/STACAS.git@2.2.0 @@ -72,11 +38,8 @@ engines: - BiocNeighbors - BiocParallel script: remotes::install_github("carmonalab/STACAS@2.2.0", dependencies = FALSE) - runners: - # This platform allows running the component natively - type: executable - # Allows turning the component into a Nextflow module / pipeline. - type: nextflow directives: label: [midtime,midmem,midcpu] From 93d5477e3ef5253e16819158f084df2ee6c8c528 Mon Sep 17 00:00:00 2001 From: JGarnica22 Date: Thu, 18 Sep 2025 10:54:53 +0200 Subject: [PATCH 08/10] update: latest STACAS version --- src/methods/ss_stacas/config.vsh.yaml | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/src/methods/ss_stacas/config.vsh.yaml b/src/methods/ss_stacas/config.vsh.yaml index 0b4afe11..1f2acf57 100644 --- a/src/methods/ss_stacas/config.vsh.yaml +++ b/src/methods/ss_stacas/config.vsh.yaml @@ -29,15 +29,7 @@ engines: image: openproblems/base_r:1 setup: - type: r - #github: https://github.com/carmonalab/STACAS.git@2.2.0 - cran: - - Seurat - - SeuratObject - - R.utils - bioc: - - BiocNeighbors - - BiocParallel - script: remotes::install_github("carmonalab/STACAS@2.2.0", dependencies = FALSE) + github: carmonalab/STACAS@2.3.0 runners: - type: executable - type: nextflow From 7b1357c5a6816c43b99a1eca2c03920fff53990b Mon Sep 17 00:00:00 2001 From: JGarnica22 Date: Thu, 2 Oct 2025 16:05:19 +0200 Subject: [PATCH 09/10] fix: do not load unneeded counts, and specify anchor.features --- src/methods/ss_stacas/script.R | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/methods/ss_stacas/script.R b/src/methods/ss_stacas/script.R index 397c27be..ea8f64f1 100644 --- a/src/methods/ss_stacas/script.R +++ b/src/methods/ss_stacas/script.R @@ -20,23 +20,29 @@ cat("Reading input file\n") adata <- anndata::read_h5ad(par[["input"]]) cat("Create Seurat object\n") +# Only loading normalized values, as raw counts are not needed 
+ # Transpose because Seurat expects genes in rows, cells in columns -counts_r <- Matrix::t(adata$layers[["counts"]]) -normalized_r <- Matrix::t(adata$layers[["normalized"]]) +normalized <- Matrix::t(adata$layers[["normalized"]]) # Convert to a regular sparse matrix first and then to dgCMatrix -counts_c <- as(as(counts_r, "CsparseMatrix"), "dgCMatrix") -normalized_c <- as(as(normalized_r, "CsparseMatrix"), "dgCMatrix") +normalized <- as(as(normalized, "CsparseMatrix"), "dgCMatrix") -# Create Seurat object with raw counts, these are needed to compute Variable Genes -seurat_obj <- Seurat::CreateSeuratObject(counts = counts_c, +# Create Seurat object +seurat_obj <- Seurat::CreateSeuratObject(counts = normalized, meta.data = adata$obs) # Manually assign pre-normalized values to the "data" slot -seurat_obj@assays$RNA$data <- normalized_c +seurat_obj@assays$RNA$data <- normalized +seurat_obj@assays$RNA$counts <- NULL # remove counts + + +# Obtain anchor features from the preprocessing pipeline +anchor.features <- head(adata$var[order(adata$var$hvg_score, decreasing = T), "feature_id"], 2000) cat("Run STACAS\n") object_integrated <- seurat_obj |> Seurat::SplitObject(split.by = "batch") |> - STACAS::Run.STACAS(cell.labels = "cell_type") + STACAS::Run.STACAS(cell.labels = "cell_type", + anchor.features = anchor.features) cat("Store outputs\n") output <- anndata::AnnData( From 5e31e19079620cd364591dd24661fb9299f09a71 Mon Sep 17 00:00:00 2001 From: JGarnica22 Date: Fri, 3 Oct 2025 09:53:30 +0200 Subject: [PATCH 10/10] resolve conflict --- CHANGELOG.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f0923298..373f2eab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,16 +3,12 @@ ## New functionality * Added `metrics/kbet_pg` and `metrics/kbet_pg_label` components (PR #52). -<<<<<<< HEAD * Add `methods/ss_stacas` new method (PR #59). - Add semi-supervised version of STACAS tool for integration of single-cell transcriptomics data. 
This functionality leverages partial or imperfect knowledge of cell identity to improve integration quality by preserving biological variation while correcting for batch effects. -======= * Added `methods/stacas` new method (PR #58). - Add non-supervised version of STACAS tool for integration of single-cell transcriptomics data. This functionality enables correction of batch effects while preserving biological variability without requiring prior cell type annotations. ->>>>>>> upstream/main * Added `method/drvi` component (PR #61). * Added `ARI_batch` and `NMI_batch` to `metrics/clustering_overlap` (PR #68). - * Added `metrics/cilisi` new metric component (PR #57). - ciLISI measures batch mixing in a cell type-aware manner by computing iLISI within each cell type and normalizing the scores between 0 and 1. Unlike iLISI, ciLISI preserves sensitivity to biological variance and avoids favoring