openproblems-bio · JGarnica22 · Apr 16, 2025 · Apr 16, 2025 · Aug 8, 2025 · Sep 17, 2025
diff --git a/.gitignore b/.gitignore
@@ -8,4 +8,7 @@
 /output
 trace-*
 .ipynb_checkpoints
-__pycache__
+__pycache__
+.Rproj.user
+.Rhistory
+*.Rproj
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -5,6 +5,14 @@
     "common/schemas/task_config.yaml": "_viash.yaml",
     "common/schemas/task_method.yaml": "**/methods/**/config.vsh.yaml",
     "common/schemas/task_control_method.yaml": "**/control_methods/**/config.vsh.yaml",
-    "common/schemas/task_metric.yaml": "**/metrics/**/config.vsh.yaml"
+    "common/schemas/task_metric.yaml": "**/metrics/**/config.vsh.yaml",
+    "https://raw.githubusercontent.com/viash-io/viash-schemas/refs/heads/main/json_schemas/unknown/config.schema.json": [
+      "*.vsh.yaml",
+      "*.vsh.yml"
+    ],
+    "https://raw.githubusercontent.com/viash-io/viash-schemas/refs/heads/main/json_schemas/unknown/package.schema.json": [
+      "_viash.yaml",
+      "_viash.yml"
+    ]
   }
 }
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,11 +3,12 @@
 ## New functionality
 
 * Added `metrics/kbet_pg` and `metrics/kbet_pg_label` components (PR #52).
+* Added `methods/ss_stacas` new method (PR #59).
+    - Add semi-supervised version of STACAS tool for integration of single-cell transcriptomics data. This functionality leverages partial or imperfect knowledge of cell identity to improve integration quality by preserving biological variation while correcting for batch effects.
 * Added `methods/stacas` new method (PR #58).
     - Add non-supervised version of STACAS tool for integration of single-cell transcriptomics data. This functionality enables correction of batch effects while preserving biological variability without requiring prior cell type annotations.
 * Added `method/drvi` component (PR #61).
 * Added `ARI_batch` and `NMI_batch` to `metrics/clustering_overlap` (PR #68).
-
 * Added `metrics/cilisi` new metric component (PR #57).
     - ciLISI measures batch mixing in a cell type-aware manner by computing iLISI within each cell type and normalizing
         the scores between 0 and 1. Unlike iLISI, ciLISI preserves sensitivity to biological variance and avoids favoring

diff --git a/scripts/create_component/create_r_method.sh b/scripts/create_component/create_r_method.sh
@@ -3,6 +3,6 @@
 set -e
 
 common/scripts/create_component \
-  --name my_r_method \
+  --name ss_stacas \
   --language r \
   --type method
diff --git a/src/methods/ss_stacas/config.vsh.yaml b/src/methods/ss_stacas/config.vsh.yaml
@@ -0,0 +1,37 @@
+__merge__: ../../api/comp_method.yaml
+name: ss_stacas
+label: ssSTACAS
+summary: Accurate semi-supervised integration of single-cell transcriptomics data
+description: |
+  STACAS is a method for scRNA-seq integration,
+  especially suited to accurately integrate datasets with large cell type imbalance
+  (e.g. in terms of proportions of distinct cell populations).
+  Prior cell type knowledge, given as cell type labels, can be provided to the algorithm to perform
+  semi-supervised integration, leading to increased preservation of biological variability
+  in the resulting integrated space.
+  STACAS is robust to incomplete cell type labels and can be applied to large-scale integration tasks.
+references:
+  doi: 10.1038/s41467-024-45240-z
+  # Andreatta M, Hérault L, Gueguen P, Gfeller D, Berenstein AJ, Carmona SJ.
+  # Semi-supervised integration of single-cell transcriptomics data.
+  # Nature Communications. 2024;15(1):1-13. doi:10.1038/s41467-024-45240-z
+links:
+  documentation: https://carmonalab.github.io/STACAS.demo/STACAS.demo.html
+  repository: https://github.com/carmonalab/STACAS
+info:
+  preferred_normalization: log_cp10k
+  method_types: [embedding]
+resources:
+  - type: r_script
+    path: script.R
+engines:
+  - type: docker
+    image: openproblems/base_r:1
+    setup:
+      - type: r
+        github: carmonalab/STACAS@2.3.0
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [midtime,midmem,midcpu]
diff --git a/src/methods/ss_stacas/script.R b/src/methods/ss_stacas/script.R
@@ -0,0 +1,70 @@
+# ssSTACAS component: semi-supervised STACAS integration of an AnnData dataset.
+# Reads par$input, integrates the batches using the "cell_type" labels and
+# writes the resulting PCA embedding to par$output as obsm[["X_emb"]].
+if (!requireNamespace("anndata", quietly = TRUE)) {
+  stop("Package 'anndata' is required but not installed.", call. = FALSE)
+}
+suppressPackageStartupMessages({
+  library(STACAS)
+  library(Matrix)
+  library(SeuratObject)
+  library(Seurat)
+})
+
+## VIASH START
+par <- list(
+  input = "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad",
+  output = "output.h5ad"
+)
+meta <- list(
+  name = "ss_stacas"
+)
+## VIASH END
+
+cat("Reading input file\n")
+adata <- anndata::read_h5ad(par[["input"]])
+
+cat("Create Seurat object\n")
+# Only the normalized layer is loaded; raw counts are not needed.
+# Transposed because Seurat expects genes in rows and cells in columns.
+normalized <- Matrix::t(adata$layers[["normalized"]])
+normalized <- as(as(normalized, "CsparseMatrix"), "dgCMatrix")
+
+seurat_obj <- Seurat::CreateSeuratObject(
+  counts = normalized,
+  meta.data = adata$obs
+)
+# Store the pre-normalized values in the "data" layer and drop "counts".
+seurat_obj@assays$RNA$data <- normalized
+seurat_obj@assays$RNA$counts <- NULL
+
+# Anchor features: the 2000 genes with the highest hvg_score in adata$var.
+hvg_order <- order(adata$var$hvg_score, decreasing = TRUE)
+anchor_features <- head(adata$var[hvg_order, "feature_id"], 2000)
+
+cat("Run STACAS\n")
+object_integrated <- seurat_obj |>
+  Seurat::SplitObject(split.by = "batch") |>
+  STACAS::Run.STACAS(
+    cell.labels = "cell_type",
+    anchor.features = anchor_features
+  )
+
+cat("Store outputs\n")
+# Splitting by batch and re-merging can reorder cells, so index the embedding
+# by the original cell names to keep its rows aligned with adata$obs.
+embedding <- Seurat::Embeddings(object_integrated, reduction = "pca")
+stopifnot(setequal(rownames(embedding), colnames(seurat_obj)))
+output <- anndata::AnnData(
+  uns = list(
+    dataset_id = adata$uns[["dataset_id"]],
+    normalization_id = adata$uns[["normalization_id"]],
+    method_id = meta$name
+  ),
+  obs = adata$obs,
+  var = adata$var,
+  obsm = list(X_emb = embedding[colnames(seurat_obj), , drop = FALSE])
+)
+
+cat("Write output AnnData to file\n")
+output$write_h5ad(par[["output"]], compression = "gzip")