diff --git a/.gitignore b/.gitignore index bbe13c47..19c2076f 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,7 @@ /output trace-* .ipynb_checkpoints -__pycache__ \ No newline at end of file +__pycache__ +.Rproj.user +.Rhistory +*.Rproj \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json index 75292106..d26c0ce3 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -5,6 +5,14 @@ "common/schemas/task_config.yaml": "_viash.yaml", "common/schemas/task_method.yaml": "**/methods/**/config.vsh.yaml", "common/schemas/task_control_method.yaml": "**/control_methods/**/config.vsh.yaml", - "common/schemas/task_metric.yaml": "**/metrics/**/config.vsh.yaml" + "common/schemas/task_metric.yaml": "**/metrics/**/config.vsh.yaml", + "https://raw.githubusercontent.com/viash-io/viash-schemas/refs/heads/main/json_schemas/unknown/config.schema.json": [ + "*.vsh.yaml", + "*.vsh.yml" + ], + "https://raw.githubusercontent.com/viash-io/viash-schemas/refs/heads/main/json_schemas/unknown/package.schema.json": [ + "_viash.yaml", + "_viash.yml" + ] } } diff --git a/CHANGELOG.md b/CHANGELOG.md index 890c4eb7..373f2eab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,11 +3,12 @@ ## New functionality * Added `metrics/kbet_pg` and `metrics/kbet_pg_label` components (PR #52). +* Added `methods/ss_stacas` new method (PR #59). + - Add semi-supervised version of STACAS tool for integration of single-cell transcriptomics data. This functionality leverages partial or imperfect knowledge of cell identity to improve integration quality by preserving biological variation while correcting for batch effects. * Added `methods/stacas` new method (PR #58). - Add non-supervised version of STACAS tool for integration of single-cell transcriptomics data. This functionality enables correction of batch effects while preserving biological variability without requiring prior cell type annotations. * Added `method/drvi` component (PR #61). 
* Added `ARI_batch` and `NMI_batch` to `metrics/clustering_overlap` (PR #68). - * Added `metrics/cilisi` new metric component (PR #57). - ciLISI measures batch mixing in a cell type-aware manner by computing iLISI within each cell type and normalizing the scores between 0 and 1. Unlike iLISI, ciLISI preserves sensitivity to biological variance and avoids favoring diff --git a/scripts/create_component/create_r_method.sh b/scripts/create_component/create_r_method.sh index 0ab03945..da61e9b7 100755 --- a/scripts/create_component/create_r_method.sh +++ b/scripts/create_component/create_r_method.sh @@ -3,6 +3,6 @@ set -e common/scripts/create_component \ - --name my_r_method \ + --name ss_stacas \ --language r \ --type method diff --git a/src/methods/ss_stacas/config.vsh.yaml b/src/methods/ss_stacas/config.vsh.yaml new file mode 100644 index 00000000..1f2acf57 --- /dev/null +++ b/src/methods/ss_stacas/config.vsh.yaml @@ -0,0 +1,37 @@ +__merge__: ../../api/comp_method.yaml +name: ss_stacas +label: ssSTACAS +summary: Accurate semi-supervised integration of single-cell transcriptomics data +description: | + STACAS is a method for scRNA-seq integration, + especially suited to accurately integrate datasets with large cell type imbalance + (e.g. in terms of proportions of distinct cell populations). + Prior cell type knowledge, given as cell type labels, can be provided to the algorithm to perform + semi-supervised integration, leading to increased preservation of biological variability + in the resulting integrated space. + STACAS is robust to incomplete cell type labels and can be applied to large-scale integration tasks. +references: + doi: 10.1038/s41467-024-45240-z + # Andreatta M, Hérault L, Gueguen P, Gfeller D, Berenstein AJ, Carmona SJ. + # Semi-supervised integration of single-cell transcriptomics data. + # Nature Communications. 2024;15(1):1-13. 
doi:10.1038/s41467-024-45240-z
+links:
+  documentation: https://carmonalab.github.io/STACAS.demo/STACAS.demo.html
+  repository: https://github.com/carmonalab/STACAS
+info:
+  preferred_normalization: log_cp10k
+  method_types: [embedding]
+resources:
+  - type: r_script
+    path: script.R
+engines:
+  - type: docker
+    image: openproblems/base_r:1
+    setup:
+      - type: r
+        github: carmonalab/STACAS@2.3.0
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [midtime,midmem,midcpu]
diff --git a/src/methods/ss_stacas/script.R b/src/methods/ss_stacas/script.R
new file mode 100644
index 00000000..ea8f64f1
--- /dev/null
+++ b/src/methods/ss_stacas/script.R
@@ -0,0 +1,92 @@
+# Semi-supervised STACAS integration: read a normalized AnnData file, integrate
+# its batches using the cell-type labels, and write the integrated PCA
+# embedding to obsm[["X_emb"]] of the output AnnData file.
+
+# anndata is only called through anndata::, so it is not attached; stop early
+# with a clear message when it is not installed.
+if (!requireNamespace("anndata", quietly = TRUE)) {
+  stop("Package 'anndata' is required but is not installed.", call. = FALSE)
+}
+suppressPackageStartupMessages({
+  library(STACAS)
+  library(Matrix)
+  library(SeuratObject)
+  library(Seurat)
+})
+
+## VIASH START
+par <- list(
+  input = "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad",
+  output = "output.h5ad"
+)
+meta <- list(
+  name = "ss_stacas"
+)
+## VIASH END
+
+# Number of top-ranked features (by hvg_score) used as integration anchors.
+n_anchor_features <- 2000L
+
+cat("Reading input file\n")
+adata <- anndata::read_h5ad(par[["input"]])
+
+cat("Create Seurat object\n")
+# Only loading normalized values, as raw counts are not needed.
+# Transpose because Seurat expects genes in rows, cells in columns.
+normalized <- Matrix::t(adata$layers[["normalized"]])
+# Convert to a regular sparse matrix first and then to dgCMatrix.
+normalized <- as(as(normalized, "CsparseMatrix"), "dgCMatrix")
+
+# The normalized matrix is passed as counts to build the object, then assigned
+# to the "data" layer, after which the counts layer is removed.
+seurat_obj <- Seurat::CreateSeuratObject(
+  counts = normalized,
+  meta.data = adata$obs
+)
+seurat_obj@assays$RNA$data <- normalized
+seurat_obj@assays$RNA$counts <- NULL
+
+# Obtain anchor features from the preprocessing pipeline: the feature_id
+# values of the rows with the highest hvg_score.
+hvg_order <- order(adata$var$hvg_score, decreasing = TRUE)
+anchor_features <- head(adata$var[hvg_order, "feature_id"], n_anchor_features)
+
+cat("Run STACAS\n")
+object_integrated <- seurat_obj |>
+  Seurat::SplitObject(split.by = "batch") |>
+  STACAS::Run.STACAS(
+    cell.labels = "cell_type",
+    anchor.features = anchor_features
+  )
+
+# The object was split per batch and recombined by the integration, so its
+# cell order may differ from adata$obs. Re-align the embedding rows by cell
+# name instead of relying on row position.
+embedding <- object_integrated@reductions$pca@cell.embeddings
+cell_ids <- rownames(adata$obs)
+missing_cells <- setdiff(cell_ids, rownames(embedding))
+if (length(missing_cells) > 0) {
+  stop(
+    length(missing_cells),
+    " input cell(s) are missing from the integrated embedding",
+    call. = FALSE
+  )
+}
+embedding <- embedding[cell_ids, , drop = FALSE]
+
+cat("Store outputs\n")
+output <- anndata::AnnData(
+  uns = list(
+    dataset_id = adata$uns[["dataset_id"]],
+    normalization_id = adata$uns[["normalization_id"]],
+    method_id = meta$name
+  ),
+  obs = adata$obs,
+  var = adata$var,
+  obsm = list(
+    X_emb = embedding
+  )
+)
+
+cat("Write output AnnData to file\n")
+output$write_h5ad(par[["output"]], compression = "gzip")