diff --git a/.gitignore b/.gitignore index c63a6b7..4478bda 100644 --- a/.gitignore +++ b/.gitignore @@ -52,3 +52,7 @@ rsconnect/ *.o *.so .DS_Store + +# pkgdown site (deployed to gh-pages) +docs/* +!docs/.nojekyll diff --git a/DESCRIPTION b/DESCRIPTION index bbab603..1701095 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,28 +1,24 @@ Package: splikit -Title: A toolkit for analysing RNA splicing in scRNA-seq data -Version: 1.0.6 -Authors@R: +Title: A Toolkit for Analysing RNA Splicing in scRNA-Seq Data +Version: 2.0.0 +Authors@R: person("Arsham", "Mikaeili Namini", , "arsham.mikaeilinamini@mail.mcgill.ca", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-9453-6951")) -Description: - Splikit /ˈsplaɪ.kɪt/ is a toolkit designed for the analysis of high-dimensional single-cell - splicing data. It provides a framework to extract and work with ratio-based data structures - derived from single-cell RNA sequencing experiments. The package avoids the need for bulky - S4 objects by offering direct and efficient manipulation of matrices. Core functionalities - are implemented in C++ via Rcpp to ensure high performance and scalability on large datasets. -URL: https://arshammik.github.io/splikit/, https://github.com/Arshammik/splikit -BugReports: https://github.com/Arshammik/splikit/issues +Description: A toolkit designed for the analysis of high-dimensional single-cell + splicing data. Provides a framework to extract and work with ratio-based + data structures derived from single-cell RNA sequencing experiments. + Offers both a modern R6 object-oriented interface and direct matrix + manipulation functions. Core functionalities are implemented in C++ via + 'Rcpp' to ensure high performance and scalability on large datasets. +URL: https://csglab.github.io/splikit/, https://github.com/csglab/splikit +BugReports: https://github.com/csglab/splikit/issues License: MIT + file LICENSE Encoding: UTF-8 Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.2 -Imports: Matrix, data.table, methods, stats, Rcpp, RcppArmadillo +RoxygenNote: 7.3.3 +Imports: Matrix, data.table, methods, stats, Rcpp, R6 LinkingTo: Rcpp, RcppArmadillo -Suggests: testthat (>= 3.0.0) +Suggests: testthat (>= 3.0.0), knitr, rmarkdown Config/testthat/edition: 3 -NeedsCompilation: yes -Packaged: 2025-05-09 22:41:00 UTC; arsham79 -Author: Arsham Mikaeili Namini [aut, cre] - () -Maintainer: Arsham Mikaeili Namini -Depends: R (>= 3.5.0) +VignetteBuilder: knitr +Depends: R (>= 4.1.0) diff --git a/NAMESPACE b/NAMESPACE index 0ae24c3..6d0586c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,6 @@ # Generated by roxygen2: do not edit by hand +export(SplikitObject) export(find_variable_events) export(find_variable_genes) export(get_pseudo_correlation) @@ -13,6 +14,7 @@ export(make_junction_ab) export(make_m1) export(make_m2) export(make_velo_count) +export(splikit) import(Matrix) import(Rcpp) import(data.table) @@ -21,8 +23,11 @@ importClassesFrom(Matrix,dgTMatrix) importClassesFrom(Matrix,dsCMatrix) importClassesFrom(Matrix,dsTMatrix) importFrom(Matrix,Matrix) +importFrom(Matrix,nnzero) importFrom(Matrix,readMM) +importFrom(Matrix,rowSums) importFrom(Matrix,sparseMatrix) +importFrom(R6,R6Class) importFrom(Rcpp,evalCpp) importFrom(data.table,":=") importFrom(data.table,.GRP) @@ -34,4 +39,7 @@ importFrom(data.table,fread) importFrom(data.table,is.data.table) importFrom(data.table,setDT) importFrom(data.table,setnames) +importFrom(methods,as) +importFrom(stats,na.omit) +importFrom(stats,setNames) useDynLib(splikit, .registration = TRUE) diff --git a/R/RcppExports.R b/R/RcppExports.R index 909b942..10e3dd8 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -37,6 +37,40 @@ standardizeSparse_variance_vst <- function(matSEXP, display_progress = FALSE) { .Call(`_splikit_standardizeSparse_variance_vst`, matSEXP, display_progress) } +#' Memory-efficient M2 computation building CSC format directly +#' +#' This version minimizes memory usage by: +#' - Building CSC format directly (no triplet intermediate) +#' - Using O(n_groups) workspace per column instead of dense group_sums matrix +#' - Two-pass algorithm: count then fill +#' +#' Memory usage: O(nnz_output) + O(n_groups) workspace +#' vs previous: O(n_groups * n_cells) + O(6 * nnz_output) +#' +#' @param M1 Sparse matrix (dgCMatrix) of inclusion counts (events x cells) +#' @param group_ids Integer vector of group IDs for each event +#' @param n_threads Number of threads for OpenMP (default 1) +#' +#' @return Sparse matrix M2 with same dimensions as M1 +#' +#' @keywords internal +make_m2_cpp_parallel <- function(M1, group_ids, n_threads = 1L) { + .Call(`_splikit_make_m2_cpp_parallel`, M1, group_ids, n_threads) +} + +#' Legacy function - redirects to memory-efficient version +#' +#' @param M1 Sparse matrix (dgCMatrix) of inclusion counts (events x cells) +#' @param group_ids Integer vector of group IDs for each event +#' @param n_threads Number of threads for OpenMP (default 1) +#' +#' @return Sparse matrix M2 with same dimensions as M1 +#' +#' @keywords internal +make_m2_cpp <- function(M1, group_ids, n_threads = 1L) { + .Call(`_splikit_make_m2_cpp`, M1, group_ids, n_threads) +} + rowVariance_cpp <- function(mat) { .Call(`_splikit_rowVariance_cpp`, mat) } diff --git a/R/SplikitObject.R b/R/SplikitObject.R new file mode 100644 index 0000000..28a1f53 --- /dev/null +++ b/R/SplikitObject.R @@ -0,0 +1,546 @@ +#' @title SplikitObject +#' @description +#' R6 class for splicing analysis in single-cell RNA-seq data. +#' Provides a modern object-oriented interface to splikit functionality +#' while maintaining backward compatibility with existing functions. +#' +#' @details +#' The SplikitObject encapsulates the core data structures for splicing analysis: +#' \itemize{ +#' \item \code{m1}: Inclusion matrix (sparse dgCMatrix) +#' \item \code{m2}: Exclusion matrix (sparse dgCMatrix) +#' \item \code{eventData}: Event metadata (data.table) +#' \item \code{geneExpression}: Optional gene expression matrix +#' } +#' +#' @examples +#' \dontrun{ +#' # Create from junction abundance data +#' junction_ab <- load_toy_SJ_object() +#' obj <- SplikitObject$new(junction_ab = junction_ab) +#' +#' # Compute M2 and find variable events +#' obj$makeM2() +#' hve <- obj$findVariableEvents(min_row_sum = 50) +#' +#' # Or create from existing matrices +#' obj <- SplikitObject$new(m1 = my_m1, m2 = my_m2, eventData = my_eventdata) +#' +#' # Chain operations +#' results <- obj$makeM2()$findVariableEvents() +#' } +#' +#' @importFrom R6 R6Class +#' @importFrom Matrix nnzero rowSums +#' @importFrom data.table is.data.table data.table copy +#' @export +SplikitObject <- R6::R6Class("SplikitObject", + + public = list( + + #' @field m1 Inclusion matrix (dgCMatrix). Rows are events, columns are cells. + m1 = NULL, + + #' @field m2 Exclusion matrix (dgCMatrix). Same dimensions as m1. + m2 = NULL, + + #' @field eventData Event metadata (data.table). One row per event. + eventData = NULL, + + #' @field geneExpression Optional gene expression matrix (dgCMatrix). + geneExpression = NULL, + + #' @field metadata List containing summary statistics and analysis results. + metadata = NULL, + + #' @description + #' Create a new SplikitObject. + #' + #' @param junction_ab A junction abundance object from \code{make_junction_ab()}. + #' If provided, m1 and eventData are computed automatically. + #' @param m1 An existing inclusion matrix (dgCMatrix). + #' @param m2 An existing exclusion matrix (dgCMatrix). + #' @param eventData A data.table with event metadata. + #' @param min_counts Minimum count threshold for filtering events (default: 1). + #' @param verbose Print progress messages (default: FALSE). + #' + #' @return A new SplikitObject instance. + initialize = function(junction_ab = NULL, + m1 = NULL, m2 = NULL, eventData = NULL, + min_counts = 1, verbose = FALSE) { + + if (!is.null(junction_ab)) { + # Build from raw junction abundance data + if (verbose) cat("Building SplikitObject from junction abundance data...\n") + + result <- make_m1( + junction_ab_object = junction_ab, + min_counts = min_counts, + verbose = verbose + ) + + self$m1 <- result$m1_inclusion_matrix + self$eventData <- result$event_data + self$metadata <- list( + creation_method = "junction_ab", + summary = result$summary + ) + + if (verbose) cat("SplikitObject created with", nrow(self$m1), "events and", + ncol(self$m1), "cells.\n") + + } else if (!is.null(m1)) { + # Build from existing matrices + private$validateInputs(m1, m2, eventData) + + self$m1 <- private$ensureSparse(m1) + if (!is.null(m2)) { + self$m2 <- private$ensureSparse(m2) + } + self$eventData <- eventData + self$metadata <- list(creation_method = "matrices") + + } else { + private$error("Must provide either 'junction_ab' or 'm1' matrix") + } + + invisible(self) + }, + + #' @description + #' Compute the M2 exclusion matrix from M1 and eventData. + #' + #' @param batch_size Number of groups per batch for memory management (default: 5000). + #' @param memory_threshold Maximum rows before switching to batched processing. + #' @param force_fast Force fast processing regardless of size (default: FALSE). + #' @param multi_thread Use parallel processing for batched operations (default: FALSE). + #' @param n_threads Number of threads for C++ OpenMP parallelization (default: 1). + #' @param use_cpp Use fast C++ implementation (default: TRUE). + #' @param verbose Print progress messages (default: FALSE). + #' + #' @return Self (invisibly), for method chaining. + makeM2 = function(batch_size = 5000, memory_threshold = 2e9, + force_fast = FALSE, multi_thread = FALSE, + n_threads = 1, use_cpp = TRUE, verbose = FALSE) { + + if (is.null(self$m1)) { + private$error("M1 matrix not initialized") + } + if (is.null(self$eventData)) { + private$error("eventData not initialized. Cannot compute M2 without event metadata.") + } + + self$m2 <- make_m2( + m1_inclusion_matrix = self$m1, + eventdata = self$eventData, + batch_size = batch_size, + memory_threshold = memory_threshold, + force_fast = force_fast, + multi_thread = multi_thread, + n_threads = n_threads, + use_cpp = use_cpp, + verbose = verbose + ) + + invisible(self) + }, + + #' @description + #' Find highly variable splicing events. + #' + #' @param min_row_sum Minimum row sum threshold for filtering (default: 50). + #' @param n_threads Number of threads for parallel computation (default: 1). + #' @param verbose Print progress messages (default: FALSE). + #' + #' @return A data.table with event names and sum_deviance scores. + findVariableEvents = function(min_row_sum = 50, n_threads = 1, verbose = FALSE) { + + private$ensureM2Computed() + + # Pre-compute row sums efficiently + m1_sums <- Matrix::rowSums(self$m1) + m2_sums <- Matrix::rowSums(self$m2) + + # Check for empty results + valid_count <- sum(m1_sums > min_row_sum & m2_sums > min_row_sum) + if (valid_count == 0) { + private$error( + "No events pass min_row_sum threshold of ", min_row_sum, + ". Consider lowering threshold. ", + "Current range: m1 [", min(m1_sums), "-", max(m1_sums), "], ", + "m2 [", min(m2_sums), "-", max(m2_sums), "]" + ) + } + + result <- find_variable_events( + m1_matrix = self$m1, + m2_matrix = self$m2, + min_row_sum = min_row_sum, + n_threads = n_threads, + verbose = verbose + ) + + # Store in metadata + self$metadata$variableEvents <- result + self$metadata$variableEvents_params <- list( + min_row_sum = min_row_sum, + n_events_tested = valid_count + ) + + return(result) + }, + + #' @description + #' Find highly variable genes from gene expression data. + #' + #' @param method Method for variable gene selection: "vst" or "sum_deviance" (default: "vst"). + #' @param n_threads Number of threads for parallel computation (default: 1). + #' @param verbose Print progress messages (default: FALSE). + #' + #' @return A data.table with gene names and variability scores. + findVariableGenes = function(method = "vst", n_threads = 1, verbose = FALSE) { + + if (is.null(self$geneExpression)) { + private$error("Gene expression matrix not set. Use $setGeneExpression() first.") + } + + result <- find_variable_genes( + gene_expression_matrix = self$geneExpression, + method = method, + n_threads = n_threads, + verbose = verbose + ) + + self$metadata$variableGenes <- result + self$metadata$variableGenes_params <- list(method = method) + + return(result) + }, + + #' @description + #' Compute pseudo-correlation between splicing and external data. + #' + #' @param ZDB_matrix Dense matrix of external data (e.g., gene expression PCs). + #' Must have same dimensions as m1. + #' @param metric R-squared metric: "CoxSnell" or "Nagelkerke" (default: "CoxSnell"). + #' @param suppress_warnings Suppress computation warnings (default: TRUE). + #' + #' @return A data.table with event names, pseudo_correlation, and null_distribution. + getPseudoCorrelation = function(ZDB_matrix, metric = "CoxSnell", + suppress_warnings = TRUE) { + + private$ensureM2Computed() + + # Dimension validation + if (!is.matrix(ZDB_matrix)) { + private$error("ZDB_matrix must be a dense matrix") + } + if (nrow(ZDB_matrix) != nrow(self$m1)) { + private$error( + "Row mismatch: m1 has ", nrow(self$m1), + " rows but ZDB_matrix has ", nrow(ZDB_matrix), " rows" + ) + } + if (ncol(ZDB_matrix) != ncol(self$m1)) { + private$error( + "Column mismatch: m1 has ", ncol(self$m1), + " columns but ZDB_matrix has ", ncol(ZDB_matrix), " columns" + ) + } + + result <- get_pseudo_correlation( + ZDB_matrix = ZDB_matrix, + m1_inclusion = self$m1, + m2_exclusion = self$m2, + metric = metric, + suppress_warnings = suppress_warnings + ) + + # Warn about NA removal + n_before <- nrow(self$m1) + n_after <- nrow(result) + if (n_before > n_after) { + n_removed <- n_before - n_after + message("Removed ", n_removed, " event(s) with NA values (", + round(100 * n_removed / n_before, 1), "% of total).") + message("Reasons: insufficient data (n<2), no variation, or convergence failure.") + } + + return(result) + }, + + #' @description + #' Subset the object by events and/or cells. + #' + #' @param events Event indices or names to keep. + #' @param cells Cell indices or names to keep. + #' + #' @return Self (invisibly), for method chaining. + subset = function(events = NULL, cells = NULL) { + + if (!is.null(events)) { + # Convert names to indices if needed + if (is.character(events)) { + events <- match(events, rownames(self$m1)) + if (any(is.na(events))) { + n_missing <- sum(is.na(events)) + warning(n_missing, " event name(s) not found and will be ignored") + events <- events[!is.na(events)] + } + } + + if (length(events) == 0) { + private$error("Subsetting would remove all events") + } + + self$m1 <- self$m1[events, , drop = FALSE] + if (!is.null(self$m2)) { + self$m2 <- self$m2[events, , drop = FALSE] + } + if (!is.null(self$eventData)) { + self$eventData <- self$eventData[events, ] + } + } + + if (!is.null(cells)) { + # Convert names to indices if needed + if (is.character(cells)) { + cells <- match(cells, colnames(self$m1)) + if (any(is.na(cells))) { + n_missing <- sum(is.na(cells)) + warning(n_missing, " cell name(s) not found and will be ignored") + cells <- cells[!is.na(cells)] + } + } + + if (length(cells) == 0) { + private$error("Subsetting would remove all cells") + } + + self$m1 <- self$m1[, cells, drop = FALSE] + if (!is.null(self$m2)) { + self$m2 <- self$m2[, cells, drop = FALSE] + } + if (!is.null(self$geneExpression)) { + self$geneExpression <- self$geneExpression[, cells, drop = FALSE] + } + } + + invisible(self) + }, + + #' @description + #' Set the gene expression matrix. + #' + #' @param gene_matrix A gene expression matrix (will be converted to dgCMatrix). + #' + #' @return Self (invisibly), for method chaining. + setGeneExpression = function(gene_matrix) { + + # Validate dimensions + if (ncol(gene_matrix) != ncol(self$m1)) { + private$error( + "Gene expression must have same number of cells as m1 (", + ncol(self$m1), "), got ", ncol(gene_matrix) + ) + } + + # Ensure sparse + self$geneExpression <- private$ensureSparse(gene_matrix) + + invisible(self) + }, + + #' @description + #' Annotate events with gene information from a GTF file. + #' + #' @param GTF_file Path to a GTF annotation file. + #' + #' @return Self (invisibly), for method chaining. + annotateEvents = function(GTF_file) { + + # File validation + if (!file.exists(GTF_file)) { + private$error("GTF file not found: ", GTF_file) + } + + if (is.null(self$eventData)) { + private$error("eventData not initialized. Cannot annotate events.") + } + + self$eventData <- make_eventdata_plus( + eventdata = data.table::copy(self$eventData), + GTF_file_direction = GTF_file + ) + + invisible(self) + }, + + #' @description + #' Get a summary of the object. + #' + #' @return A list with object statistics. + summary = function() { + m1_nnz <- Matrix::nnzero(self$m1) + m1_total <- prod(dim(self$m1)) + + list( + events = nrow(self$m1), + cells = ncol(self$m1), + has_m2 = !is.null(self$m2), + has_gene_expression = !is.null(self$geneExpression), + n_genes = if (!is.null(self$geneExpression)) nrow(self$geneExpression) else 0, + sparsity_m1 = round(1 - (m1_nnz / m1_total), 4), + nnz_m1 = m1_nnz, + eventData_columns = if (!is.null(self$eventData)) names(self$eventData) else NULL, + metadata_keys = names(self$metadata) + ) + }, + + #' @description + #' Print a human-readable summary of the object. + print = function() { + cat("SplikitObject\n") + cat(paste(rep("-", 40), collapse = ""), "\n") + cat("Events: ", format(nrow(self$m1), big.mark = ","), "\n") + cat("Cells: ", format(ncol(self$m1), big.mark = ","), "\n") + cat("M2 computed: ", !is.null(self$m2), "\n") + + if (!is.null(self$geneExpression)) { + cat("Gene expression: ", format(nrow(self$geneExpression), big.mark = ","), " genes\n") + } else { + cat("Gene expression: not set\n") + } + + m1_nnz <- Matrix::nnzero(self$m1) + m1_total <- prod(dim(self$m1)) + cat("Sparsity (M1): ", round((1 - m1_nnz / m1_total) * 100, 1), "%\n") + cat("Memory (M1): ", format(object.size(self$m1), units = "auto"), "\n") + + if (length(self$metadata) > 0) { + cat("Metadata: ", paste(names(self$metadata), collapse = ", "), "\n") + } + + invisible(self) + }, + + #' @description + #' Create a deep copy of the object. + #' + #' @param deep If TRUE, creates a deep copy of all data. + #' + #' @return A new SplikitObject with copied data. + deepCopy = function() { + new_obj <- SplikitObject$new( + m1 = self$m1, + m2 = self$m2, + eventData = if (!is.null(self$eventData)) data.table::copy(self$eventData) else NULL + ) + if (!is.null(self$geneExpression)) { + new_obj$geneExpression <- self$geneExpression + } + new_obj$metadata <- self$metadata + return(new_obj) + } + ), + + private = list( + + #' Validate input matrices and eventData + validateInputs = function(m1, m2, eventData) { + + # Check m1 + if (!inherits(m1, "Matrix") && !is.matrix(m1)) { + private$error("m1 must be a matrix or sparse Matrix object") + } + + # Check m2 if provided + if (!is.null(m2)) { + if (!inherits(m2, "Matrix") && !is.matrix(m2)) { + private$error("m2 must be a matrix or sparse Matrix object") + } + if (!identical(dim(m1), dim(m2))) { + private$error( + "m1 and m2 must have identical dimensions. ", + "m1: ", nrow(m1), "x", ncol(m1), ", ", + "m2: ", nrow(m2), "x", ncol(m2) + ) + } + if (!is.null(rownames(m1)) && !is.null(rownames(m2))) { + if (!identical(rownames(m1), rownames(m2))) { + private$error("m1 and m2 must have identical row names") + } + } + if (!is.null(colnames(m1)) && !is.null(colnames(m2))) { + if (!identical(colnames(m1), colnames(m2))) { + private$error("m1 and m2 must have identical column names") + } + } + } + + # Check eventData if provided + if (!is.null(eventData)) { + if (!data.table::is.data.table(eventData) && !is.data.frame(eventData)) { + private$error("eventData must be a data.table or data.frame") + } + if (nrow(eventData) != nrow(m1)) { + private$error( + "eventData must have same number of rows as m1. ", + "eventData: ", nrow(eventData), " rows, ", + "m1: ", nrow(m1), " rows" + ) + } + } + }, + + #' Ensure M2 is computed + ensureM2Computed = function() { + if (is.null(self$m2)) { + private$error("M2 not computed. Call $makeM2() first.") + } + }, + + #' Ensure matrix is sparse dgCMatrix + ensureSparse = function(mat) { + if (inherits(mat, "dgCMatrix")) { + return(mat) + } else if (inherits(mat, "Matrix")) { + return(methods::as(mat, "dgCMatrix")) + } else if (is.matrix(mat)) { + return(methods::as(mat, "dgCMatrix")) + } else { + private$error("Cannot convert to sparse matrix: unknown type") + } + }, + + #' Standardized error reporting + error = function(...) { + msg <- paste0(...) + stop(msg, call. = FALSE) + } + ) +) + + +#' Create a SplikitObject +#' +#' Convenience function to create a SplikitObject. +#' +#' @param ... Arguments passed to \code{SplikitObject$new()}. +#' +#' @return A new SplikitObject instance. +#' +#' @examples +#' \dontrun{ +#' # From junction abundance +#' junction_ab <- load_toy_SJ_object() +#' obj <- splikit(junction_ab = junction_ab) +#' +#' # From existing matrices +#' obj <- splikit(m1 = my_m1, m2 = my_m2, eventData = my_eventdata) +#' } +#' +#' @export +splikit <- function(...) { + SplikitObject$new(...) +} diff --git a/R/feature_selection.R b/R/feature_selection.R index a78f9e2..a831e8f 100644 --- a/R/feature_selection.R +++ b/R/feature_selection.R @@ -19,7 +19,23 @@ #' # printing the results #' print(HVE[order(-sum_deviance)]) #' @export -find_variable_events <- function(m1_matrix, m2_matrix, min_row_sum = 50, n_threads=1, verbose=TRUE, ...) { +find_variable_events <- function(m1_matrix, m2_matrix = NULL, min_row_sum = 50, n_threads=1, verbose=TRUE, ...) { + + + # Handle SplikitObject input + if (inherits(m1_matrix, "SplikitObject")) { + obj <- m1_matrix + if (is.null(obj$m2)) { + stop("SplikitObject has no M2 matrix. Call obj$makeM2() first.", call. = FALSE) + } + m1_matrix <- obj$m1 + m2_matrix <- obj$m2 + } + + # Check if m2_matrix is provided + if (is.null(m2_matrix)) { + stop("m2_matrix is required. Provide either both matrices or a SplikitObject.", call. = FALSE) + } # Check if matrices are sparse if (!(inherits(m1_matrix, "Matrix") && inherits(m2_matrix, "Matrix"))) { @@ -34,7 +50,19 @@ find_variable_events <- function(m1_matrix, m2_matrix, min_row_sum = 50, n_threa } # Filter rows based on minimum row sum criteria - to_keep_events <- which(rowSums(m1_matrix) > min_row_sum & rowSums(m2_matrix) > min_row_sum) + m1_sums <- Matrix::rowSums(m1_matrix) + m2_sums <- Matrix::rowSums(m2_matrix) + to_keep_events <- which(m1_sums > min_row_sum & m2_sums > min_row_sum) + + # Check for empty results (Issue #23 from deep analysis) + if (length(to_keep_events) == 0) { + stop("No events pass the min_row_sum threshold of ", min_row_sum, + ". Consider lowering the threshold or checking your data. ", + "Current range: m1 [", min(m1_sums), "-", max(m1_sums), "], ", + "m2 [", min(m2_sums), "-", max(m2_sums), "]", + call. = FALSE) + } + m1_matrix <- m1_matrix[to_keep_events, , drop = FALSE] m2_matrix <- m2_matrix[to_keep_events, , drop = FALSE] @@ -99,8 +127,9 @@ find_variable_events <- function(m1_matrix, m2_matrix, min_row_sum = 50, n_threa #' toy_obj <- load_toy_M1_M2_object() #' #' # getting high variable genes -#' HVG_VST <- find_variable_genes(toy_obj$gene_expression, method = "vst") # vst method -#' HVG_DEV <- find_variable_genes(toy_obj$gene_expression, method = "sum_deviance") # sum_deviance method +#' HVG_VST <- find_variable_genes(toy_obj$gene_expression, method = "vst") +#' HVG_DEV <- find_variable_genes(toy_obj$gene_expression, +#' method = "sum_deviance") #' #' # Using multi-threading for faster computation (sum_deviance method only) #' HVG_DEV_MT <- find_variable_genes(toy_obj$gene_expression, diff --git a/R/general_tools.R b/R/general_tools.R index 1bee279..f61dff0 100644 --- a/R/general_tools.R +++ b/R/general_tools.R @@ -56,14 +56,34 @@ NULL #' print(pseudo_r_square_nagel) #' #' @export -get_pseudo_correlation <- function(ZDB_matrix, m1_inclusion, m2_exclusion, metric = "CoxSnell", suppress_warnings=TRUE) { +get_pseudo_correlation <- function(ZDB_matrix, m1_inclusion = NULL, m2_exclusion = NULL, metric = "CoxSnell", suppress_warnings=TRUE) { + + # Handle SplikitObject input + if (inherits(ZDB_matrix, "SplikitObject")) { + # First arg is SplikitObject, second should be ZDB_matrix + obj <- ZDB_matrix + if (is.null(m1_inclusion) || !is.matrix(m1_inclusion)) { + stop("When using SplikitObject, provide ZDB_matrix as second argument.", call. = FALSE) + } + ZDB_matrix <- m1_inclusion + m1_inclusion <- obj$m1 + m2_exclusion <- obj$m2 + if (is.null(m2_exclusion)) { + stop("SplikitObject has no M2 matrix. Call obj$makeM2() first.", call. = FALSE) + } + } + + # Check m1 and m2 are provided + if (is.null(m1_inclusion) || is.null(m2_exclusion)) { + stop("m1_inclusion and m2_exclusion are required.", call. = FALSE) + } # Validate metric parameter metric <- match.arg(metric, choices = c("CoxSnell", "Nagelkerke")) - + # Check ZDB_matrix (must be dense) if (!is.matrix(ZDB_matrix)) stop("ZDB_matrix must be a dense matrix.") - + # Check m1 and m2 (can be sparse or dense) if (!is.matrix(m1_inclusion) && !inherits(m1_inclusion, "Matrix")) { stop("m1_inclusion must be either a dense matrix or a sparse Matrix.") @@ -209,7 +229,7 @@ get_rowVar <- function(M, verbose=FALSE) { #' # Preparing the inputs #' set.seed(42) #' pc_matrix <- matrix(data = rnorm(n = 10000 * 15, sd = 2), nrow = 10000, ncol = 15) -#' cluster_numbers <- runif(n = 10000, min = 1, max = 10) |> as.integer() +#' cluster_numbers <- as.integer(runif(n = 10000, min = 1, max = 10)) #' #' # Getting the mean silhouette score #' n_threads <- parallel::detectCores() diff --git a/R/splikit-package.R b/R/splikit-package.R new file mode 100644 index 0000000..ca32ae5 --- /dev/null +++ b/R/splikit-package.R @@ -0,0 +1,9 @@ +#' @keywords internal +"_PACKAGE" + +## usethis namespace: start +#' @importFrom methods as +#' @importFrom stats na.omit setNames +#' @useDynLib splikit, .registration = TRUE +## usethis namespace: end +NULL diff --git a/R/star_solo_processing.R b/R/star_solo_processing.R index a615880..09b8c6b 100644 --- a/R/star_solo_processing.R +++ b/R/star_solo_processing.R @@ -60,6 +60,8 @@ make_junction_ab <- function(STARsolo_SJ_dirs, white_barcode_lists = NULL, sampl # Helper function to process one sample process_sj_sample <- function(STARsolo_SJ_dir, white_barcode_list, sample_id) { + # Avoid R CMD check NOTE for data.table NSE + unique_mapped <- NULL # Define paths mtx_dir <- file.path(STARsolo_SJ_dir, "raw", "matrix.mtx") feature_dir <- file.path(STARsolo_SJ_dir, "../../SJ.out.tab") @@ -586,6 +588,10 @@ make_m1 <- function(junction_ab_object, min_counts = 1, verbose = FALSE) { #' WARNING: This may cause memory errors on large datasets. #' @param multi_thread A logical flag to enable parallel processing for batched operations (default: FALSE). #' Only used when batched processing is triggered. Requires parallel package. +#' @param n_threads Number of threads for C++ parallel processing (default: 1). +#' Only used when use_cpp = TRUE. +#' @param use_cpp Logical flag to use fast C++ implementation (default: TRUE). +#' Falls back to R implementation if FALSE. #' @param verbose A logical flag for detailed progress reporting (default: FALSE). #' #' @return A sparse matrix M2 with the dummy row removed and proper adjustments made. @@ -606,7 +612,8 @@ make_m1 <- function(junction_ab_object, min_counts = 1, verbose = FALSE) { #' @export make_m2 <- function(m1_inclusion_matrix, eventdata, batch_size = 5000, memory_threshold = 2e9, force_fast = FALSE, - multi_thread = FALSE, verbose = FALSE) { + multi_thread = FALSE, n_threads = 1, use_cpp = TRUE, + verbose = FALSE) { # Input validation if (missing(m1_inclusion_matrix) || !inherits(m1_inclusion_matrix, "sparseMatrix")) { @@ -638,6 +645,35 @@ make_m2 <- function(m1_inclusion_matrix, eventdata, batch_size = 5000, cat("Starting M2 matrix creation...\n") + # Use C++ implementation if requested + if (use_cpp) { + cat("+-- Using C++ implementation for faster computation\n") + + # Convert group_id to numeric indices (0-based for C++) + unique_groups <- unique(eventdata$group_id) + group_map <- setNames(seq_along(unique_groups) - 1L, unique_groups) + group_ids_numeric <- as.integer(group_map[eventdata$group_id]) + + cat("| |-- Events: ", nrow(m1_inclusion_matrix), "\n") + cat("| |-- Cells: ", ncol(m1_inclusion_matrix), "\n") + cat("| +-- Groups: ", length(unique_groups), "\n") + + # Call C++ function + M2 <- make_m2_cpp_parallel( + M1 = methods::as(m1_inclusion_matrix, "dgCMatrix"), + group_ids = group_ids_numeric, + n_threads = n_threads + ) + + # Set row and column names + rownames(M2) <- rownames(m1_inclusion_matrix) + colnames(M2) <- colnames(m1_inclusion_matrix) + + cat("Finished M2 matrix creation.\n") + return(M2) + } + + # Fallback to R implementation # Add an index column to eventdata eventdata[, i := .I] @@ -762,10 +798,7 @@ make_m2 <- function(m1_inclusion_matrix, eventdata, batch_size = 5000, return(result) } -#' Fast M2 Processing (Internal Function) -#' -#' Implements the original fast approach using data.table operations. -#' This approach creates the full operation in memory at once. +#' @keywords internal .make_m2_fast <- function(m1_inclusion_matrix, group_ids, verbose) { cat("+-- Step 3 | Creating M2 (fast approach)\n") @@ -820,10 +853,7 @@ make_m2 <- function(m1_inclusion_matrix, eventdata, batch_size = 5000, return(M2) } -#' Batched M2 Processing (Internal Function) -#' -#' Implements the batched triplet combination approach for memory-efficient processing. -#' Processes groups in batches using lapply/mclapply and combines results efficiently. +#' @keywords internal .make_m2_batched <- function(m1_inclusion_matrix, group_ids, batch_size, unique_groups, multi_thread, verbose) { diff --git a/R/zzz.R b/R/zzz.R index 4b44563..2621dea 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -2,12 +2,12 @@ packageStartupMessage( "\n", "=======================================================================================================\n", - " Welcome to Splikit — Developed by the Computational and Statistical Genomics Laboratory\n", + " Welcome to Splikit - Developed by the Computational and Statistical Genomics Laboratory\n", " at McGill University, Montreal. Distributed under the MIT License.\n", " For documentation and examples, see the package vignette.\n", "-------------------------------------------------------------------------------------------------------\n", - " Bienvenue dans Splikit — Developpe par le Laboratoire de genomique computationnelle et statistique\n", - " de l’Universite McGill, a Montreal. Distribue sous licence MIT.\n", + " Bienvenue dans Splikit - Developpe par le Laboratoire de genomique computationnelle et statistique\n", + " de l'Universite McGill, a Montreal. Distribue sous licence MIT.\n", " Pour la documentation et des exemples, consultez la vignette du package.\n", "=======================================================================================================\n" ) diff --git a/README.md b/README.md index ccc42fe..c132dd6 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ ## **Requirements** - [R version 3.5.0](http://www.r-project.org/) or later. -- R libraries: [Rcpp](https://cran.r-project.org/web/packages/Rcpp/index.html), [RcppArmadillo](https://cran.r-project.org/web/packages/RcppArmadillo/index.html), [Matrix](https://cran.r-project.org/web/packages/Matrix/index.html), [data.table](https://cran.r-project.org/web/packages/data.table/index.html) +- R libraries: [Rcpp](https://cran.r-project.org/web/packages/Rcpp/index.html), [RcppArmadillo](https://cran.r-project.org/web/packages/RcppArmadillo/index.html), [Matrix](https://cran.r-project.org/web/packages/Matrix/index.html), [data.table](https://cran.r-project.org/web/packages/data.table/index.html), [R6](https://cran.r-project.org/web/packages/R6/index.html) ## Installation @@ -25,6 +25,38 @@ devtools::install_github("csglab/splikit") ``` ## Usage +Splikit provides two ways to work with your data: a modern **R6 class interface** (recommended) and **traditional functions** (for backward compatibility). + +### R6 Class Interface (Recommended) + +The `SplikitObject` class provides a clean, chainable interface with built-in validation: + +```r +library(splikit) + +# Load data and create SplikitObject +junction_ab <- load_toy_SJ_object() +obj <- splikit(junction_ab = junction_ab, min_counts = 1) + +# Compute M2 exclusion matrix (uses fast C++ implementation) +obj$makeM2(n_threads = 4) + +# Find highly variable splicing events +HVE <- obj$findVariableEvents(min_row_sum = 50, n_threads = 4) + +# View object summary +obj$summary() + +# Access data directly +m1_matrix <- obj$m1 +m2_matrix <- obj$m2 +event_data <- obj$eventData +``` + +### Traditional Function Interface + +The original function-based approach remains fully supported: + ```r # Create m1 matrix from a junction abundance object junction_abundance_object <- load_toy_SJ_object() @@ -37,15 +69,22 @@ m1_object <- make_m1( # Create m2 matrix from the m1 inclusion matrix m2_matrix <- make_m2( m1_inclusion_matrix = m1_object$m1_inclusion_matrix, - eventdata = m1_object$eventdata + eventdata = m1_object$eventdata, + n_threads = 4 # Uses C++ with OpenMP ) -# Perform feature selection for splicing joint measurements +# Perform feature selection for splicing events m1_matrix <- m1_object$m1_inclusion_matrix HVE_Info <- find_variable_events(m1_matrix, m2_matrix, n_threads = 32) - ``` +### Key Features + +- **High-performance C++ backend**: Core computations use RcppArmadillo with OpenMP parallelization +- **Memory-efficient sparse matrices**: Handles large single-cell datasets +- **Comprehensive validation**: Input checking with informative error messages +- **Flexible API**: Choose between R6 methods or traditional functions + ## Documentation diff --git a/_pkgdown.yml b/_pkgdown.yml index 1e8d11e..3a919b1 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -44,6 +44,11 @@ articles: - STARsolo_guide reference: +- title: R6 Object Interface + desc: Modern object-oriented interface for splicing analysis + contents: + - SplikitObject + - title: Main Functions desc: Core functions for splicing analysis contents: @@ -74,3 +79,8 @@ reference: contents: - load_toy_M1_M2_object - load_toy_SJ_object + +- title: Package + desc: Package information + contents: + - splikit diff --git a/docs/.nojekyll b/docs/.nojekyll deleted file mode 100644 index e69de29..0000000 diff --git a/inst/extdata/toy_m1_m2_obj.rds b/inst/extdata/toy_m1_m2_obj.rds index a6fa186..fc6344f 100644 Binary files a/inst/extdata/toy_m1_m2_obj.rds and b/inst/extdata/toy_m1_m2_obj.rds differ diff --git a/man/SplikitObject.Rd b/man/SplikitObject.Rd new file mode 100644 index 0000000..338e692 --- /dev/null +++ b/man/SplikitObject.Rd @@ -0,0 +1,355 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/SplikitObject.R +\name{SplikitObject} +\alias{SplikitObject} +\title{SplikitObject} +\description{ +R6 class for splicing analysis in single-cell RNA-seq data. +Provides a modern object-oriented interface to splikit functionality +while maintaining backward compatibility with existing functions. +} +\details{ +The SplikitObject encapsulates the core data structures for splicing analysis: +\itemize{ +\item \code{m1}: Inclusion matrix (sparse dgCMatrix) +\item \code{m2}: Exclusion matrix (sparse dgCMatrix) +\item \code{eventData}: Event metadata (data.table) +\item \code{geneExpression}: Optional gene expression matrix +} +} +\examples{ +\dontrun{ +# Create from junction abundance data +junction_ab <- load_toy_SJ_object() +obj <- SplikitObject$new(junction_ab = junction_ab) + +# Compute M2 and find variable events +obj$makeM2() +hve <- obj$findVariableEvents(min_row_sum = 50) + +# Or create from existing matrices +obj <- SplikitObject$new(m1 = my_m1, m2 = my_m2, eventData = my_eventdata) + +# Chain operations +results <- obj$makeM2()$findVariableEvents() +} + +} +\section{Public fields}{ +\if{html}{\out{
}} +\describe{ +\item{\code{m1}}{Inclusion matrix (dgCMatrix). Rows are events, columns are cells.} + +\item{\code{m2}}{Exclusion matrix (dgCMatrix). Same dimensions as m1.} + +\item{\code{eventData}}{Event metadata (data.table). One row per event.} + +\item{\code{geneExpression}}{Optional gene expression matrix (dgCMatrix).} + +\item{\code{metadata}}{List containing summary statistics and analysis results.} +} +\if{html}{\out{
}} +} +\section{Methods}{ +\subsection{Public methods}{ +\itemize{ +\item \href{#method-SplikitObject-new}{\code{SplikitObject$new()}} +\item \href{#method-SplikitObject-makeM2}{\code{SplikitObject$makeM2()}} +\item \href{#method-SplikitObject-findVariableEvents}{\code{SplikitObject$findVariableEvents()}} +\item \href{#method-SplikitObject-findVariableGenes}{\code{SplikitObject$findVariableGenes()}} +\item \href{#method-SplikitObject-getPseudoCorrelation}{\code{SplikitObject$getPseudoCorrelation()}} +\item \href{#method-SplikitObject-subset}{\code{SplikitObject$subset()}} +\item \href{#method-SplikitObject-setGeneExpression}{\code{SplikitObject$setGeneExpression()}} +\item \href{#method-SplikitObject-annotateEvents}{\code{SplikitObject$annotateEvents()}} +\item \href{#method-SplikitObject-summary}{\code{SplikitObject$summary()}} +\item \href{#method-SplikitObject-print}{\code{SplikitObject$print()}} +\item \href{#method-SplikitObject-deepCopy}{\code{SplikitObject$deepCopy()}} +\item \href{#method-SplikitObject-clone}{\code{SplikitObject$clone()}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-SplikitObject-new}{}}} +\subsection{Method \code{new()}}{ +Create a new SplikitObject. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{SplikitObject$new( + junction_ab = NULL, + m1 = NULL, + m2 = NULL, + eventData = NULL, + min_counts = 1, + verbose = FALSE +)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{junction_ab}}{A junction abundance object from \code{make_junction_ab()}. +If provided, m1 and eventData are computed automatically.} + +\item{\code{m1}}{An existing inclusion matrix (dgCMatrix).} + +\item{\code{m2}}{An existing exclusion matrix (dgCMatrix).} + +\item{\code{eventData}}{A data.table with event metadata.} + +\item{\code{min_counts}}{Minimum count threshold for filtering events (default: 1).} + +\item{\code{verbose}}{Print progress messages (default: FALSE).} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +A new SplikitObject instance. +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-SplikitObject-makeM2}{}}} +\subsection{Method \code{makeM2()}}{ +Compute the M2 exclusion matrix from M1 and eventData. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{SplikitObject$makeM2( + batch_size = 5000, + memory_threshold = 2e+09, + force_fast = FALSE, + multi_thread = FALSE, + n_threads = 1, + use_cpp = TRUE, + verbose = FALSE +)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{batch_size}}{Number of groups per batch for memory management (default: 5000).} + +\item{\code{memory_threshold}}{Maximum rows before switching to batched processing.} + +\item{\code{force_fast}}{Force fast processing regardless of size (default: FALSE).} + +\item{\code{multi_thread}}{Use parallel processing for batched operations (default: FALSE).} + +\item{\code{n_threads}}{Number of threads for C++ OpenMP parallelization (default: 1).} + +\item{\code{use_cpp}}{Use fast C++ implementation (default: TRUE).} + +\item{\code{verbose}}{Print progress messages (default: FALSE).} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Self (invisibly), for method chaining. +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-SplikitObject-findVariableEvents}{}}} +\subsection{Method \code{findVariableEvents()}}{ +Find highly variable splicing events. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{SplikitObject$findVariableEvents( + min_row_sum = 50, + n_threads = 1, + verbose = FALSE +)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{min_row_sum}}{Minimum row sum threshold for filtering (default: 50).} + +\item{\code{n_threads}}{Number of threads for parallel computation (default: 1).} + +\item{\code{verbose}}{Print progress messages (default: FALSE).} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +A data.table with event names and sum_deviance scores. +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-SplikitObject-findVariableGenes}{}}} +\subsection{Method \code{findVariableGenes()}}{ +Find highly variable genes from gene expression data. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{SplikitObject$findVariableGenes(method = "vst", n_threads = 1, verbose = FALSE)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{method}}{Method for variable gene selection: "vst" or "sum_deviance" (default: "vst").} + +\item{\code{n_threads}}{Number of threads for parallel computation (default: 1).} + +\item{\code{verbose}}{Print progress messages (default: FALSE).} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +A data.table with gene names and variability scores. +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-SplikitObject-getPseudoCorrelation}{}}} +\subsection{Method \code{getPseudoCorrelation()}}{ +Compute pseudo-correlation between splicing and external data. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{SplikitObject$getPseudoCorrelation( + ZDB_matrix, + metric = "CoxSnell", + suppress_warnings = TRUE +)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{ZDB_matrix}}{Dense matrix of external data (e.g., gene expression PCs). +Must have same dimensions as m1.} + +\item{\code{metric}}{R-squared metric: "CoxSnell" or "Nagelkerke" (default: "CoxSnell").} + +\item{\code{suppress_warnings}}{Suppress computation warnings (default: TRUE).} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +A data.table with event names, pseudo_correlation, and null_distribution. +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-SplikitObject-subset}{}}} +\subsection{Method \code{subset()}}{ +Subset the object by events and/or cells. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{SplikitObject$subset(events = NULL, cells = NULL)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{events}}{Event indices or names to keep.} + +\item{\code{cells}}{Cell indices or names to keep.} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Self (invisibly), for method chaining. +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-SplikitObject-setGeneExpression}{}}} +\subsection{Method \code{setGeneExpression()}}{ +Set the gene expression matrix. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{SplikitObject$setGeneExpression(gene_matrix)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{gene_matrix}}{A gene expression matrix (will be converted to dgCMatrix).} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Self (invisibly), for method chaining. +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-SplikitObject-annotateEvents}{}}} +\subsection{Method \code{annotateEvents()}}{ +Annotate events with gene information from a GTF file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{SplikitObject$annotateEvents(GTF_file)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{GTF_file}}{Path to a GTF annotation file.} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Self (invisibly), for method chaining. +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-SplikitObject-summary}{}}} +\subsection{Method \code{summary()}}{ +Get a summary of the object. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{SplikitObject$summary()}\if{html}{\out{
}} +} + +\subsection{Returns}{ +A list with object statistics. +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-SplikitObject-print}{}}} +\subsection{Method \code{print()}}{ +Print a human-readable summary of the object. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{SplikitObject$print()}\if{html}{\out{
}} +} + +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-SplikitObject-deepCopy}{}}} +\subsection{Method \code{deepCopy()}}{ +Create a deep copy of the object. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{SplikitObject$deepCopy()}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{deep}}{If TRUE, creates a deep copy of all data.} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +A new SplikitObject with copied data. +Validate input matrices and eventData +Ensure M2 is computed +Ensure matrix is sparse dgCMatrix +Standardized error reporting +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-SplikitObject-clone}{}}} +\subsection{Method \code{clone()}}{ +The objects of this class are cloneable with this method. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{SplikitObject$clone(deep = FALSE)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{deep}}{Whether to make a deep clone.} +} +\if{html}{\out{
}} +} +} +} diff --git a/man/dot-make_m2_batched.Rd b/man/dot-make_m2_batched.Rd deleted file mode 100644 index 3574df2..0000000 --- a/man/dot-make_m2_batched.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/star_solo_processing.R -\name{.make_m2_batched} -\alias{.make_m2_batched} -\title{Batched M2 Processing (Internal Function)} -\usage{ -.make_m2_batched( - m1_inclusion_matrix, - group_ids, - batch_size, - unique_groups, - multi_thread, - verbose -) -} -\description{ -Implements the batched triplet combination approach for memory-efficient processing. -Processes groups in batches using lapply/mclapply and combines results efficiently. -} diff --git a/man/dot-make_m2_fast.Rd b/man/dot-make_m2_fast.Rd deleted file mode 100644 index 8b3683c..0000000 --- a/man/dot-make_m2_fast.Rd +++ /dev/null @@ -1,12 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/star_solo_processing.R -\name{.make_m2_fast} -\alias{.make_m2_fast} -\title{Fast M2 Processing (Internal Function)} -\usage{ -.make_m2_fast(m1_inclusion_matrix, group_ids, verbose) -} -\description{ -Implements the original fast approach using data.table operations. -This approach creates the full operation in memory at once. -} diff --git a/man/find_variable_events.Rd b/man/find_variable_events.Rd index 188aa07..e38eb2b 100644 --- a/man/find_variable_events.Rd +++ b/man/find_variable_events.Rd @@ -6,7 +6,7 @@ \usage{ find_variable_events( m1_matrix, - m2_matrix, + m2_matrix = NULL, min_row_sum = 50, n_threads = 1, verbose = TRUE, diff --git a/man/find_variable_genes.Rd b/man/find_variable_genes.Rd index 0a4d49b..f296575 100644 --- a/man/find_variable_genes.Rd +++ b/man/find_variable_genes.Rd @@ -41,8 +41,9 @@ library(Matrix) toy_obj <- load_toy_M1_M2_object() # getting high variable genes -HVG_VST <- find_variable_genes(toy_obj$gene_expression, method = "vst") # vst method -HVG_DEV <- find_variable_genes(toy_obj$gene_expression, method = "sum_deviance") # sum_deviance method +HVG_VST <- find_variable_genes(toy_obj$gene_expression, method = "vst") +HVG_DEV <- find_variable_genes(toy_obj$gene_expression, + method = "sum_deviance") # Using multi-threading for faster computation (sum_deviance method only) HVG_DEV_MT <- find_variable_genes(toy_obj$gene_expression, diff --git a/man/get_pseudo_correlation.Rd b/man/get_pseudo_correlation.Rd index ff4a38f..bbd7b6d 100644 --- a/man/get_pseudo_correlation.Rd +++ b/man/get_pseudo_correlation.Rd @@ -6,8 +6,8 @@ \usage{ get_pseudo_correlation( ZDB_matrix, - m1_inclusion, - m2_exclusion, + m1_inclusion = NULL, + m2_exclusion = NULL, metric = "CoxSnell", suppress_warnings = TRUE ) diff --git a/man/get_silhouette_mean.Rd b/man/get_silhouette_mean.Rd index 08c3ba4..d0aa469 100644 --- a/man/get_silhouette_mean.Rd +++ b/man/get_silhouette_mean.Rd @@ -26,7 +26,7 @@ This process can be very slow for large matrices if single-threaded. Use multipl # Preparing the inputs set.seed(42) pc_matrix <- matrix(data = rnorm(n = 10000 * 15, sd = 2), nrow = 10000, ncol = 15) -cluster_numbers <- runif(n = 10000, min = 1, max = 10) |> as.integer() +cluster_numbers <- as.integer(runif(n = 10000, min = 1, max = 10)) # Getting the mean silhouette score n_threads <- parallel::detectCores() diff --git a/man/make_m2.Rd b/man/make_m2.Rd index 21e2b5d..d34df25 100644 --- a/man/make_m2.Rd +++ b/man/make_m2.Rd @@ -11,6 +11,8 @@ make_m2( memory_threshold = 2e+09, force_fast = FALSE, multi_thread = FALSE, + n_threads = 1, + use_cpp = TRUE, verbose = FALSE ) } @@ -31,6 +33,12 @@ WARNING: This may cause memory errors on large datasets.} \item{multi_thread}{A logical flag to enable parallel processing for batched operations (default: FALSE). Only used when batched processing is triggered. Requires parallel package.} +\item{n_threads}{Number of threads for C++ parallel processing (default: 1). +Only used when use_cpp = TRUE.} + +\item{use_cpp}{Logical flag to use fast C++ implementation (default: TRUE). +Falls back to R implementation if FALSE.} + \item{verbose}{A logical flag for detailed progress reporting (default: FALSE).} } \value{ diff --git a/man/make_m2_cpp.Rd b/man/make_m2_cpp.Rd new file mode 100644 index 0000000..d0523e5 --- /dev/null +++ b/man/make_m2_cpp.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{make_m2_cpp} +\alias{make_m2_cpp} +\title{Legacy function - redirects to memory-efficient version} +\usage{ +make_m2_cpp(M1, group_ids, n_threads = 1L) +} +\arguments{ +\item{M1}{Sparse matrix (dgCMatrix) of inclusion counts (events x cells)} + +\item{group_ids}{Integer vector of group IDs for each event} + +\item{n_threads}{Number of threads for OpenMP (default 1)} +} +\value{ +Sparse matrix M2 with same dimensions as M1 +} +\description{ +Legacy function - redirects to memory-efficient version +} +\keyword{internal} diff --git a/man/make_m2_cpp_parallel.Rd b/man/make_m2_cpp_parallel.Rd new file mode 100644 index 0000000..5670756 --- /dev/null +++ b/man/make_m2_cpp_parallel.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{make_m2_cpp_parallel} +\alias{make_m2_cpp_parallel} +\title{Memory-efficient M2 computation building CSC format directly} +\usage{ +make_m2_cpp_parallel(M1, group_ids, n_threads = 1L) +} +\arguments{ +\item{M1}{Sparse matrix (dgCMatrix) of inclusion counts (events x cells)} + +\item{group_ids}{Integer vector of group IDs for each event} + +\item{n_threads}{Number of threads for OpenMP (default 1)} +} +\value{ +Sparse matrix M2 with same dimensions as M1 +} +\description{ +This version minimizes memory usage by: +\itemize{ +\item Building CSC format directly (no triplet intermediate) +\item Using O(n_groups) workspace per column instead of dense group_sums matrix +\item Two-pass algorithm: count then fill +} +} +\details{ +Memory usage: O(nnz_output) + O(n_groups) workspace +vs previous: O(n_groups * n_cells) + O(6 * nnz_output) +} +\keyword{internal} diff --git a/man/splikit-package.Rd b/man/splikit-package.Rd new file mode 100644 index 0000000..81001f6 --- /dev/null +++ b/man/splikit-package.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/splikit-package.R +\docType{package} +\name{splikit-package} +\alias{splikit-package} +\title{splikit: A Toolkit for Analysing RNA Splicing in scRNA-Seq Data} +\description{ +\if{html}{\figure{logo.png}{options: style='float: right' alt='logo' width='120'}} + +A toolkit designed for the analysis of high-dimensional single-cell splicing data. Provides a framework to extract and work with ratio-based data structures derived from single-cell RNA sequencing experiments. Offers both a modern R6 object-oriented interface and direct matrix manipulation functions. Core functionalities are implemented in C++ via 'Rcpp' to ensure high performance and scalability on large datasets. +} +\seealso{ +Useful links: +\itemize{ + \item \url{https://csglab.github.io/splikit/} + \item \url{https://github.com/csglab/splikit} + \item Report bugs at \url{https://github.com/csglab/splikit/issues} +} + +} +\author{ +\strong{Maintainer}: Arsham Mikaeili Namini \email{arsham.mikaeilinamini@mail.mcgill.ca} (\href{https://orcid.org/0000-0002-9453-6951}{ORCID}) + +} +\keyword{internal} diff --git a/man/splikit.Rd b/man/splikit.Rd new file mode 100644 index 0000000..207cf4f --- /dev/null +++ b/man/splikit.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/SplikitObject.R +\name{splikit} +\alias{splikit} +\title{Create a SplikitObject} +\usage{ +splikit(...) +} +\arguments{ +\item{...}{Arguments passed to \code{SplikitObject$new()}.} +} +\value{ +A new SplikitObject instance. +} +\description{ +Convenience function to create a SplikitObject. +} +\examples{ +\dontrun{ +# From junction abundance +junction_ab <- load_toy_SJ_object() +obj <- splikit(junction_ab = junction_ab) + +# From existing matrices +obj <- splikit(m1 = my_m1, m2 = my_m2, eventData = my_eventdata) +} + +} diff --git a/pkgdown/favicon/apple-touch-icon.png b/pkgdown/favicon/apple-touch-icon.png new file mode 100644 index 0000000..3539cad Binary files /dev/null and b/pkgdown/favicon/apple-touch-icon.png differ diff --git a/pkgdown/favicon/favicon-96x96.png b/pkgdown/favicon/favicon-96x96.png new file mode 100644 index 0000000..95be33e Binary files /dev/null and b/pkgdown/favicon/favicon-96x96.png differ diff --git a/pkgdown/favicon/favicon.ico b/pkgdown/favicon/favicon.ico new file mode 100644 index 0000000..612df58 Binary files /dev/null and b/pkgdown/favicon/favicon.ico differ diff --git a/pkgdown/favicon/favicon.svg b/pkgdown/favicon/favicon.svg new file mode 100644 index 0000000..0122a12 --- /dev/null +++ b/pkgdown/favicon/favicon.svg @@ -0,0 +1,3 @@ + \ No newline at end of file diff --git a/pkgdown/favicon/site.webmanifest b/pkgdown/favicon/site.webmanifest new file mode 100644 index 0000000..4ebda26 --- /dev/null +++ b/pkgdown/favicon/site.webmanifest @@ -0,0 +1,21 @@ +{ + "name": "", + "short_name": "", + "icons": [ + { + "src": "/web-app-manifest-192x192.png", + "sizes": "192x192", + "type": "image/png", + "purpose": "maskable" + }, + { + "src": "/web-app-manifest-512x512.png", + "sizes": "512x512", + "type": "image/png", + "purpose": "maskable" + } + ], + "theme_color": "#ffffff", + "background_color": "#ffffff", + "display": "standalone" +} \ No newline at end of file diff --git a/pkgdown/favicon/web-app-manifest-192x192.png b/pkgdown/favicon/web-app-manifest-192x192.png new file mode 100644 index 0000000..a9cb17c Binary files /dev/null and b/pkgdown/favicon/web-app-manifest-192x192.png differ diff --git a/pkgdown/favicon/web-app-manifest-512x512.png b/pkgdown/favicon/web-app-manifest-512x512.png new file mode 100644 index 0000000..3fe7ea1 Binary files /dev/null and b/pkgdown/favicon/web-app-manifest-512x512.png differ diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index ce1a11d..569b668 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -127,6 +127,32 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// make_m2_cpp_parallel +arma::sp_mat make_m2_cpp_parallel(const arma::sp_mat& M1, const IntegerVector& group_ids, int n_threads); +RcppExport SEXP _splikit_make_m2_cpp_parallel(SEXP M1SEXP, SEXP group_idsSEXP, SEXP n_threadsSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const arma::sp_mat& >::type M1(M1SEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type group_ids(group_idsSEXP); + Rcpp::traits::input_parameter< int >::type n_threads(n_threadsSEXP); + rcpp_result_gen = Rcpp::wrap(make_m2_cpp_parallel(M1, group_ids, n_threads)); + return rcpp_result_gen; +END_RCPP +} +// make_m2_cpp +arma::sp_mat make_m2_cpp(const arma::sp_mat& M1, const IntegerVector& group_ids, int n_threads); +RcppExport SEXP _splikit_make_m2_cpp(SEXP M1SEXP, SEXP group_idsSEXP, SEXP n_threadsSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const arma::sp_mat& >::type M1(M1SEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type group_ids(group_idsSEXP); + Rcpp::traits::input_parameter< int >::type n_threads(n_threadsSEXP); + rcpp_result_gen = Rcpp::wrap(make_m2_cpp(M1, group_ids, n_threads)); + return rcpp_result_gen; +END_RCPP +} // rowVariance_cpp NumericVector rowVariance_cpp(SEXP mat); RcppExport SEXP _splikit_rowVariance_cpp(SEXP matSEXP) { @@ -149,6 +175,8 @@ static const R_CallMethodDef CallEntries[] = { {"_splikit_cppBetabinPseudoR2_mixed2", (DL_FUNC) &_splikit_cppBetabinPseudoR2_mixed2, 4}, {"_splikit_calcNBDeviancesWithThetaEstimation", (DL_FUNC) &_splikit_calcNBDeviancesWithThetaEstimation, 2}, {"_splikit_standardizeSparse_variance_vst", (DL_FUNC) &_splikit_standardizeSparse_variance_vst, 2}, + {"_splikit_make_m2_cpp_parallel", (DL_FUNC) &_splikit_make_m2_cpp_parallel, 3}, + {"_splikit_make_m2_cpp", (DL_FUNC) &_splikit_make_m2_cpp, 3}, {"_splikit_rowVariance_cpp", (DL_FUNC) &_splikit_rowVariance_cpp, 1}, {NULL, NULL, 0} }; diff --git a/src/calcDeviances.cpp b/src/calcDeviances.cpp index d4e618a..d51aa1f 100755 --- a/src/calcDeviances.cpp +++ b/src/calcDeviances.cpp @@ -1,6 +1,5 @@ // calcDeviances_ratio.cpp #include -#include #ifdef _OPENMP #include #endif @@ -16,27 +15,6 @@ arma::vec calcDeviances_ratio(const arma::sp_mat& M1, int n_cols = M1.n_cols; arma::vec dev(n_rows, arma::fill::zeros); - // Notify about OpenMP availability only once (thread-safe) - static std::atomic has_printed(false); - bool expected = false; - if (has_printed.compare_exchange_strong(expected, true)) { - if (num_threads <= 1) { -#ifdef _OPENMP - Rcpp::Rcout << "Note: OpenMP is available. " - "You can speed up this computation by passing " - "num_threads > 1 to enable multi-threading.\n"; -#else - Rcpp::Rcout << "OpenMP not detected: running in single-thread mode.\n"; -#endif - } else { -#ifdef _OPENMP - Rcpp::Rcout << "Running with " << num_threads << " threads.\n"; -#else - Rcpp::Rcout << "OpenMP not detected: running in single-thread mode.\n"; -#endif - } - } - // Set the number of threads if OpenMP is available #ifdef _OPENMP if (num_threads > 1) { diff --git a/src/deviance_gene.cpp b/src/deviance_gene.cpp index e17c0ce..410c915 100644 --- a/src/deviance_gene.cpp +++ b/src/deviance_gene.cpp @@ -1,5 +1,4 @@ #include -#include #ifdef _OPENMP #include #endif @@ -14,21 +13,6 @@ arma::vec calcNBDeviancesWithThetaEstimation(const arma::sp_mat& gene_expression int n_cols = gene_expression.n_cols; // number of cells arma::vec dev(n_rows, arma::fill::zeros); - // Notify about OpenMP availability only once (thread-safe) - static std::atomic has_printed(false); - bool expected = false; - if (has_printed.compare_exchange_strong(expected, true)) { - #ifdef _OPENMP - if (num_threads > 1) { - Rcpp::Rcout << "OpenMP is available. Using " << num_threads << " threads for NB deviance calculation.\n"; - } - #else - if (num_threads > 1) { - Rcpp::Rcout << "OpenMP is not available. Running in single-threaded mode.\n"; - } - #endif - } - #ifdef _OPENMP if (num_threads > 1) { omp_set_num_threads(num_threads); diff --git a/src/hvf_gene_expression.cpp b/src/hvf_gene_expression.cpp index fbc09a6..a119e6e 100644 --- a/src/hvf_gene_expression.cpp +++ b/src/hvf_gene_expression.cpp @@ -11,7 +11,6 @@ // - Computes per-row means and variances (including zero entries) // - Fits a loess curve to log10(var) ~ log10(mean) // - Returns standardized variances (squared z-scores) per row -// - Logs progress via Rcpp::Rcout // // Only 32-bit indexing is supported, consistent with R's native dgCMatrix. @@ -21,8 +20,6 @@ using namespace Rcpp; // [[Rcpp::export]] NumericVector standardizeSparse_variance_vst(SEXP matSEXP, bool display_progress = false) { - Rcout << "[SSVL] entering\n"; - // 1) validate & extract if (!Rf_isS4(matSEXP) || !Rf_inherits(matSEXP, "dgCMatrix")) stop("`mat` must be a dgCMatrix"); @@ -121,7 +118,6 @@ NumericVector standardizeSparse_variance_vst(SEXP matSEXP, result[r] = (ncol > 1) ? ( result[r] / (ncol - 1) ) : 0.0; } - Rcout << "[SSVL] exiting\n"; return result; } diff --git a/src/make_m2_cpp.cpp b/src/make_m2_cpp.cpp new file mode 100644 index 0000000..515e9ae --- /dev/null +++ b/src/make_m2_cpp.cpp @@ -0,0 +1,286 @@ +// [[Rcpp::depends(RcppArmadillo)]] +#include + +#ifdef _OPENMP +#include +#endif + +using namespace Rcpp; +using namespace arma; + +//' Memory-efficient M2 computation building CSC format directly +//' +//' This version minimizes memory usage by: +//' - Building CSC format directly (no triplet intermediate) +//' - Using O(n_groups) workspace per column instead of dense group_sums matrix +//' - Two-pass algorithm: count then fill +//' +//' Memory usage: O(nnz_output) + O(n_groups) workspace +//' vs previous: O(n_groups * n_cells) + O(6 * nnz_output) +//' +//' @param M1 Sparse matrix (dgCMatrix) of inclusion counts (events x cells) +//' @param group_ids Integer vector of group IDs for each event +//' @param n_threads Number of threads for OpenMP (default 1) +//' +//' @return Sparse matrix M2 with same dimensions as M1 +//' +//' @keywords internal +// [[Rcpp::export]] +arma::sp_mat make_m2_cpp_parallel(const arma::sp_mat& M1, + const IntegerVector& group_ids, + int n_threads = 1) { + + int n_events = M1.n_rows; + int n_cells = M1.n_cols; + + // Validate inputs + if (group_ids.size() != n_events) { + stop("group_ids length must match number of rows in M1"); + } + + // Find number of unique groups + int max_group = 0; + for (int i = 0; i < n_events; i++) { + if (group_ids[i] > max_group) { + max_group = group_ids[i]; + } + } + int n_groups = max_group + 1; + +#ifdef _OPENMP + if (n_threads > 1) { + omp_set_num_threads(n_threads); + } +#endif + + // Pre-compute group members + // group_members[g] = vector of event indices in group g + std::vector> group_members(n_groups); + for (int i = 0; i < n_events; i++) { + group_members[group_ids[i]].push_back(i); + } + + // ============================================================================ + // PASS 1: Count non-zeros per column + // ============================================================================ + + std::vector col_nnz(n_cells, 0); + +#ifdef _OPENMP +#pragma omp parallel if(n_threads > 1) +{ + // Thread-local workspace for group sums + std::vector group_sum_local(n_groups); + +#pragma omp for schedule(dynamic) + for (int j = 0; j < n_cells; j++) { + // Reset workspace + std::fill(group_sum_local.begin(), group_sum_local.end(), 0.0); + + // Compute group sums for this column + for (sp_mat::const_col_iterator it = M1.begin_col(j); it != M1.end_col(j); ++it) { + int row = it.row(); + group_sum_local[group_ids[row]] += *it; + } + + // Count non-zeros for this column + size_t count = 0; + for (int g = 0; g < n_groups; g++) { + double total = group_sum_local[g]; + if (total == 0) continue; + + const std::vector& members = group_members[g]; + for (int i : members) { + double m1_val = M1(i, j); + double m2_val = total - m1_val; + if (m2_val != 0) { + count++; + } + } + } + col_nnz[j] = count; + } +} +#else + // Single-threaded version + std::vector group_sum(n_groups); + for (int j = 0; j < n_cells; j++) { + std::fill(group_sum.begin(), group_sum.end(), 0.0); + + for (sp_mat::const_col_iterator it = M1.begin_col(j); it != M1.end_col(j); ++it) { + int row = it.row(); + group_sum[group_ids[row]] += *it; + } + + size_t count = 0; + for (int g = 0; g < n_groups; g++) { + double total = group_sum[g]; + if (total == 0) continue; + + const std::vector& members = group_members[g]; + for (int i : members) { + double m1_val = M1(i, j); + double m2_val = total - m1_val; + if (m2_val != 0) { + count++; + } + } + } + col_nnz[j] = count; + } +#endif + + // ============================================================================ + // Build column pointers (CSC format) + // ============================================================================ + + std::vector col_ptr(n_cells + 1); + col_ptr[0] = 0; + for (int j = 0; j < n_cells; j++) { + col_ptr[j + 1] = col_ptr[j] + col_nnz[j]; + } + size_t total_nnz = col_ptr[n_cells]; + + // Allocate output arrays (exactly sized - no waste) + std::vector row_indices(total_nnz); + std::vector values(total_nnz); + + // ============================================================================ + // PASS 2: Fill in values + // ============================================================================ + +#ifdef _OPENMP +#pragma omp parallel if(n_threads > 1) +{ + // Thread-local workspace + std::vector group_sum_local(n_groups); + +#pragma omp for schedule(dynamic) + for (int j = 0; j < n_cells; j++) { + // Reset workspace + std::fill(group_sum_local.begin(), group_sum_local.end(), 0.0); + + // Compute group sums for this column + for (sp_mat::const_col_iterator it = M1.begin_col(j); it != M1.end_col(j); ++it) { + int row = it.row(); + group_sum_local[group_ids[row]] += *it; + } + + // Fill in M2 values for this column + size_t write_idx = col_ptr[j]; + + // We need to write rows in sorted order for valid CSC format + // Collect (row, value) pairs first, then sort + std::vector> col_entries; + col_entries.reserve(col_nnz[j]); + + for (int g = 0; g < n_groups; g++) { + double total = group_sum_local[g]; + if (total == 0) continue; + + const std::vector& members = group_members[g]; + for (int i : members) { + double m1_val = M1(i, j); + double m2_val = total - m1_val; + if (m2_val != 0) { + col_entries.emplace_back(static_cast(i), m2_val); + } + } + } + + // Sort by row index (required for CSC format) + std::sort(col_entries.begin(), col_entries.end(), + [](const std::pair& a, const std::pair& b) { + return a.first < b.first; + }); + + // Write to output arrays + for (const auto& entry : col_entries) { + row_indices[write_idx] = entry.first; + values[write_idx] = entry.second; + write_idx++; + } + } +} +#else + // Single-threaded version + std::vector group_sum2(n_groups); + for (int j = 0; j < n_cells; j++) { + std::fill(group_sum2.begin(), group_sum2.end(), 0.0); + + for (sp_mat::const_col_iterator it = M1.begin_col(j); it != M1.end_col(j); ++it) { + int row = it.row(); + group_sum2[group_ids[row]] += *it; + } + + size_t write_idx = col_ptr[j]; + std::vector> col_entries; + col_entries.reserve(col_nnz[j]); + + for (int g = 0; g < n_groups; g++) { + double total = group_sum2[g]; + if (total == 0) continue; + + const std::vector& members = group_members[g]; + for (int i : members) { + double m1_val = M1(i, j); + double m2_val = total - m1_val; + if (m2_val != 0) { + col_entries.emplace_back(static_cast(i), m2_val); + } + } + } + + std::sort(col_entries.begin(), col_entries.end(), + [](const std::pair& a, const std::pair& b) { + return a.first < b.first; + }); + + for (const auto& entry : col_entries) { + row_indices[write_idx] = entry.first; + values[write_idx] = entry.second; + write_idx++; + } + } +#endif + + // ============================================================================ + // Construct sparse matrix directly from CSC components + // ============================================================================ + + // Convert to arma types + arma::uvec rowind(total_nnz); + arma::vec vals(total_nnz); + arma::uvec colptr(n_cells + 1); + + for (size_t k = 0; k < total_nnz; k++) { + rowind[k] = row_indices[k]; + vals[k] = values[k]; + } + for (int j = 0; j <= n_cells; j++) { + colptr[j] = col_ptr[j]; + } + + // Create sparse matrix from CSC components + arma::sp_mat M2(rowind, colptr, vals, n_events, n_cells); + + return M2; +} + + +//' Legacy function - redirects to memory-efficient version +//' +//' @param M1 Sparse matrix (dgCMatrix) of inclusion counts (events x cells) +//' @param group_ids Integer vector of group IDs for each event +//' @param n_threads Number of threads for OpenMP (default 1) +//' +//' @return Sparse matrix M2 with same dimensions as M1 +//' +//' @keywords internal +// [[Rcpp::export]] +arma::sp_mat make_m2_cpp(const arma::sp_mat& M1, + const IntegerVector& group_ids, + int n_threads = 1) { + // Redirect to memory-efficient parallel version + return make_m2_cpp_parallel(M1, group_ids, n_threads); +} diff --git a/src/row_variance.cpp b/src/row_variance.cpp index 7895fc9..81026f1 100644 --- a/src/row_variance.cpp +++ b/src/row_variance.cpp @@ -3,14 +3,11 @@ using namespace Rcpp; // [[Rcpp::export]] NumericVector rowVariance_cpp(SEXP mat) { - Rcout << "[rowVariance_cpp] Entering function\n"; - // Dense matrix path + // Dense numeric matrix path if (Rf_isMatrix(mat) && TYPEOF(mat) == REALSXP) { NumericMatrix M(mat); int nrow = M.nrow(), ncol = M.ncol(); - Rcout << "[rowVariance_cpp] Detected dense matrix: " - << nrow << "×" << ncol << "\n"; NumericVector out(nrow); for (int i = 0; i < nrow; ++i) { @@ -24,7 +21,26 @@ NumericVector rowVariance_cpp(SEXP mat) { out[i] = sum2 / ncol - mean * mean; } - Rcout << "[rowVariance_cpp] Dense computation complete\n"; + return out; + } + + // Dense integer matrix path (Issue #16 from deep analysis) + if (Rf_isMatrix(mat) && TYPEOF(mat) == INTSXP) { + IntegerMatrix M(mat); + int nrow = M.nrow(), ncol = M.ncol(); + + NumericVector out(nrow); + for (int i = 0; i < nrow; ++i) { + double sum = 0.0, sum2 = 0.0; + for (int j = 0; j < ncol; ++j) { + double v = static_cast(M(i, j)); + sum += v; + sum2 += v * v; + } + double mean = sum / ncol; + out[i] = sum2 / ncol - mean * mean; + } + return out; } @@ -37,8 +53,6 @@ NumericVector rowVariance_cpp(SEXP mat) { NumericVector x = M.slot("x"); int nrow = dims[0], ncol = dims[1]; - Rcout << "[rowVariance_cpp] Detected sparse dgCMatrix: " - << nrow << "×" << ncol << "\n"; NumericVector rowSum(nrow, 0.0), rowSum2(nrow, 0.0); @@ -58,7 +72,6 @@ NumericVector rowVariance_cpp(SEXP mat) { out[row] = rowSum2[row] / ncol - mean * mean; } - Rcout << "[rowVariance_cpp] Sparse computation complete\n"; return out; } diff --git a/tests/testthat/test-SplikitObject.R b/tests/testthat/test-SplikitObject.R new file mode 100644 index 0000000..3b7deb1 --- /dev/null +++ b/tests/testthat/test-SplikitObject.R @@ -0,0 +1,639 @@ +# Tests for SplikitObject R6 class +# Uses test_splikit.rds to verify old and new methods produce identical results + +test_that("SplikitObject initializes from matrices", { + # Load test data + test_data <- load_toy_M1_M2_object() + + # Create object from existing matrices + obj <- SplikitObject$new( + m1 = test_data$m1, + eventData = test_data$eventdata + ) + + expect_s3_class(obj, "SplikitObject") + expect_equal(nrow(obj$m1), 2000) + expect_equal(ncol(obj$m1), 2000) + expect_true(!is.null(obj$eventData)) + expect_true(is.null(obj$m2)) +}) + +test_that("SplikitObject validates dimension mismatches", { + test_data <- load_toy_M1_M2_object() + + # Wrong number of rows in eventData + bad_eventdata <- test_data$eventdata[1:100, ] + + expect_error( + SplikitObject$new(m1 = test_data$m1, eventData = bad_eventdata), + "same number of rows" + ) +}) + +test_that("SplikitObject validates m1/m2 dimension mismatch", { + test_data <- load_toy_M1_M2_object() + + # Create m2 with wrong dimensions + bad_m2 <- test_data$m1[1:100, ] + + expect_error( + SplikitObject$new(m1 = test_data$m1, m2 = bad_m2), + "identical dimensions" + ) +}) + +test_that("SplikitObject$makeM2 produces same result as make_m2 function", { + skip_if_not_installed("Matrix") + + test_data <- load_toy_M1_M2_object() + + # Old method + m2_old <- make_m2( + m1_inclusion_matrix = test_data$m1, + eventdata = test_data$eventdata, + verbose = FALSE + ) + + # New method via R6 + obj <- SplikitObject$new( + m1 = test_data$m1, + eventData = test_data$eventdata + ) + obj$makeM2(verbose = FALSE) + m2_new <- obj$m2 + + # Compare results + expect_equal(dim(m2_old), dim(m2_new)) + expect_equal(rownames(m2_old), rownames(m2_new)) + expect_equal(colnames(m2_old), colnames(m2_new)) + + # Check values are identical (within floating point tolerance) + expect_equal(Matrix::nnzero(m2_old), Matrix::nnzero(m2_new)) + expect_equal(sum(m2_old), sum(m2_new)) +}) + +test_that("SplikitObject$findVariableEvents produces same result as function", { + skip_if_not_installed("Matrix") + + test_data <- load_toy_M1_M2_object() + + # Compute M2 first + m2 <- make_m2( + m1_inclusion_matrix = test_data$m1, + eventdata = test_data$eventdata, + verbose = FALSE + ) + + # Old method + hve_old <- find_variable_events( + m1_matrix = test_data$m1, + m2_matrix = m2, + min_row_sum = 50, + n_threads = 1, + verbose = FALSE + ) + + # New method via R6 + obj <- SplikitObject$new( + m1 = test_data$m1, + m2 = m2, + eventData = test_data$eventdata + ) + hve_new <- obj$findVariableEvents(min_row_sum = 50, n_threads = 1, verbose = FALSE) + + # Compare results + expect_equal(nrow(hve_old), nrow(hve_new)) + expect_equal(names(hve_old), names(hve_new)) + + # Sort by events to ensure same order + hve_old <- hve_old[order(events)] + hve_new <- hve_new[order(events)] + + expect_equal(hve_old$events, hve_new$events) + expect_equal(hve_old$sum_deviance, hve_new$sum_deviance, tolerance = 1e-10) +}) + +test_that("SplikitObject$subset works correctly", { + test_data <- load_toy_M1_M2_object() + + obj <- SplikitObject$new( + m1 = test_data$m1, + eventData = test_data$eventdata + ) + + # Subset by indices + original_events <- nrow(obj$m1) + original_cells <- ncol(obj$m1) + + obj$subset(events = 1:100, cells = 1:500) + + expect_equal(nrow(obj$m1), 100) + expect_equal(ncol(obj$m1), 500) + expect_equal(nrow(obj$eventData), 100) +}) + +test_that("SplikitObject$subset validates empty results", { + test_data <- load_toy_M1_M2_object() + + obj <- SplikitObject$new( + m1 = test_data$m1, + eventData = test_data$eventdata + ) + + # Empty events + expect_error( + obj$subset(events = integer(0)), + "remove all events" + ) +}) + +test_that("SplikitObject$findVariableEvents validates threshold", { + test_data <- load_toy_M1_M2_object() + + # Compute M2 + m2 <- make_m2( + m1_inclusion_matrix = test_data$m1, + eventdata = test_data$eventdata, + verbose = FALSE + ) + + obj <- SplikitObject$new( + m1 = test_data$m1, + m2 = m2, + eventData = test_data$eventdata + ) + + # Very high threshold that removes all events + expect_error( + obj$findVariableEvents(min_row_sum = 1e9), + "No events pass" + ) +}) + +test_that("SplikitObject requires M2 for findVariableEvents", { + test_data <- load_toy_M1_M2_object() + + obj <- SplikitObject$new( + m1 = test_data$m1, + eventData = test_data$eventdata + ) + + # M2 not computed yet + expect_error( + obj$findVariableEvents(), + "M2 not computed" + ) +}) + +test_that("SplikitObject$summary returns correct information", { + test_data <- load_toy_M1_M2_object() + + m2 <- make_m2( + m1_inclusion_matrix = test_data$m1, + eventdata = test_data$eventdata, + verbose = FALSE + ) + + obj <- SplikitObject$new( + m1 = test_data$m1, + m2 = m2, + eventData = test_data$eventdata + ) + + summ <- obj$summary() + + expect_equal(summ$events, 2000) + expect_equal(summ$cells, 2000) + expect_true(summ$has_m2) + expect_false(summ$has_gene_expression) + expect_true(is.numeric(summ$sparsity_m1)) + expect_true(summ$sparsity_m1 >= 0 && summ$sparsity_m1 <= 1) +}) + +test_that("SplikitObject$print works without error", { + test_data <- load_toy_M1_M2_object() + + obj <- SplikitObject$new( + m1 = test_data$m1, + eventData = test_data$eventdata + ) + + # Should print without error + expect_output(obj$print(), "SplikitObject") + expect_output(obj$print(), "Events:") + expect_output(obj$print(), "Cells:") +}) + +test_that("SplikitObject method chaining works", { + test_data <- load_toy_M1_M2_object() + + obj <- SplikitObject$new( + m1 = test_data$m1, + eventData = test_data$eventdata + ) + + # Chain makeM2 and then check m2 is not null + result <- obj$makeM2(verbose = FALSE) + + # makeM2 should return self for chaining + expect_identical(result, obj) + expect_true(!is.null(obj$m2)) +}) + +test_that("splikit() convenience function works", { + test_data <- load_toy_M1_M2_object() + + obj <- splikit( + m1 = test_data$m1, + eventData = test_data$eventdata + ) + + expect_s3_class(obj, "SplikitObject") + expect_equal(nrow(obj$m1), 2000) +}) + +test_that("SplikitObject$setGeneExpression validates dimensions", { + test_data <- load_toy_M1_M2_object() + + obj <- SplikitObject$new( + m1 = test_data$m1, + eventData = test_data$eventdata + ) + + # Wrong number of columns + bad_gene_matrix <- Matrix::rsparsematrix(1000, 100, 0.1) + + expect_error( + obj$setGeneExpression(bad_gene_matrix), + "same number of cells" + ) +}) + +test_that("SplikitObject$getPseudoCorrelation validates dimensions", { + test_data <- load_toy_M1_M2_object() + + m2 <- make_m2( + m1_inclusion_matrix = test_data$m1, + eventdata = test_data$eventdata, + verbose = FALSE + ) + + obj <- SplikitObject$new( + m1 = test_data$m1, + m2 = m2, + eventData = test_data$eventdata + ) + + # Wrong dimensions + bad_zdb <- matrix(rnorm(100), nrow = 10, ncol = 10) + + expect_error( + obj$getPseudoCorrelation(bad_zdb), + "mismatch" + ) +}) + +test_that("SplikitObject$deepCopy creates independent copy", { + test_data <- load_toy_M1_M2_object() + + obj1 <- SplikitObject$new( + m1 = test_data$m1, + eventData = test_data$eventdata + ) + + obj2 <- obj1$deepCopy() + + # Modify obj1 + obj1$subset(events = 1:100) + + # obj2 should be unchanged + expect_equal(nrow(obj1$m1), 100) + expect_equal(nrow(obj2$m1), 2000) +}) + +test_that("SplikitObject stores results in metadata", { + test_data <- load_toy_M1_M2_object() + + m2 <- make_m2( + m1_inclusion_matrix = test_data$m1, + eventdata = test_data$eventdata, + verbose = FALSE + ) + + obj <- SplikitObject$new( + m1 = test_data$m1, + m2 = m2, + eventData = test_data$eventdata + ) + + obj$findVariableEvents(min_row_sum = 100, verbose = FALSE) + + # Check metadata was updated + expect_true("variableEvents" %in% names(obj$metadata)) + expect_true("variableEvents_params" %in% names(obj$metadata)) + expect_equal(obj$metadata$variableEvents_params$min_row_sum, 100) +}) + +test_that("SplikitObject handles sparse matrix conversion", { + test_data <- load_toy_M1_M2_object() + + # Convert to dense and back + m1_dense <- as.matrix(test_data$m1[1:100, 1:100]) + + obj <- SplikitObject$new( + m1 = m1_dense, + eventData = test_data$eventdata[1:100, ] + ) + + # Should be converted to dgCMatrix + expect_s4_class(obj$m1, "dgCMatrix") +}) + +test_that("SplikitObject$annotateEvents validates file existence", { + test_data <- load_toy_M1_M2_object() + + obj <- SplikitObject$new( + m1 = test_data$m1, + eventData = test_data$eventdata + ) + + expect_error( + obj$annotateEvents("/nonexistent/file.gtf"), + "not found" + ) +}) + +test_that("Backward compatibility: old functions still work", { + skip_if_not_installed("Matrix") + + test_data <- load_toy_M1_M2_object() + + # Old workflow should still work exactly as before + m2 <- make_m2( + m1_inclusion_matrix = test_data$m1, + eventdata = test_data$eventdata, + verbose = FALSE + ) + + expect_s4_class(m2, "dgCMatrix") + expect_equal(dim(m2), dim(test_data$m1)) + + hve <- find_variable_events( + m1_matrix = test_data$m1, + m2_matrix = m2, + min_row_sum = 50, + verbose = FALSE + ) + + expect_true(nrow(hve) > 0) + expect_true("events" %in% names(hve)) + expect_true("sum_deviance" %in% names(hve)) +}) + +# ============================================================================ +# Edge Case Tests (from deep analysis Issue #1) +# ============================================================================ + +test_that("rowVariance_cpp handles integer matrices", { + # Issue #16 from deep analysis + m_int <- matrix(1:20, nrow = 4) + m_num <- matrix(as.numeric(1:20), nrow = 4) + + result_int <- rowVariance_cpp(m_int) + result_num <- rowVariance_cpp(m_num) + + expect_equal(result_int, result_num) +}) + +test_that("rowVariance_cpp handles all-zero sparse matrix", { + m <- Matrix::Matrix(0, nrow = 10, ncol = 5, sparse = TRUE) + result <- rowVariance_cpp(m) + + expect_equal(length(result), 10) + expect_true(all(result == 0)) +}) + +test_that("get_pseudo_correlation catches dimension mismatches", { + # Issue #14 from deep analysis + skip_if_not_installed("Matrix") + + test_data <- load_toy_M1_M2_object() + + m2 <- make_m2( + m1_inclusion_matrix = test_data$m1, + eventdata = test_data$eventdata, + verbose = FALSE + ) + + # Wrong row dimension + bad_zdb <- matrix(rnorm(100 * ncol(test_data$m1)), nrow = 100, ncol = ncol(test_data$m1)) + + expect_error( + get_pseudo_correlation(bad_zdb, test_data$m1, m2), + "same number of rows" + ) +}) + +test_that("find_variable_events handles very high threshold gracefully", { + # Issue #23 from deep analysis + test_data <- load_toy_M1_M2_object() + + m2 <- make_m2( + m1_inclusion_matrix = test_data$m1, + eventdata = test_data$eventdata, + verbose = FALSE + ) + + # Threshold so high no events pass + expect_error( + find_variable_events(test_data$m1, m2, min_row_sum = 1e9, verbose = FALSE), + "No events pass" + ) +}) + +test_that("SplikitObject validates negative threshold", { + test_data <- load_toy_M1_M2_object() + + m2 <- make_m2( + m1_inclusion_matrix = test_data$m1, + eventdata = test_data$eventdata, + verbose = FALSE + ) + + obj <- SplikitObject$new( + m1 = test_data$m1, + m2 = m2, + eventData = test_data$eventdata + ) + + # Negative threshold should still work (all events pass) + result <- obj$findVariableEvents(min_row_sum = -1, verbose = FALSE) + expect_true(nrow(result) > 0) +}) + +test_that("SplikitObject handles subset by names", { + test_data <- load_toy_M1_M2_object() + + obj <- SplikitObject$new( + m1 = test_data$m1, + eventData = test_data$eventdata + ) + + # Get first 10 event names + event_names <- rownames(obj$m1)[1:10] + + obj$subset(events = event_names) + + expect_equal(nrow(obj$m1), 10) + expect_equal(rownames(obj$m1), event_names) +}) + +test_that("SplikitObject subset warns about missing names", { + test_data <- load_toy_M1_M2_object() + + obj <- SplikitObject$new( + m1 = test_data$m1, + eventData = test_data$eventdata + ) + + # Mix of valid and invalid names + mixed_names <- c(rownames(obj$m1)[1:5], "nonexistent_event_1", "nonexistent_event_2") + + expect_warning( + obj$subset(events = mixed_names), + "not found" + ) + + expect_equal(nrow(obj$m1), 5) +}) + +test_that("SplikitObject handles single event subset", { + test_data <- load_toy_M1_M2_object() + + obj <- SplikitObject$new( + m1 = test_data$m1, + eventData = test_data$eventdata + ) + + obj$subset(events = 1) + + expect_equal(nrow(obj$m1), 1) + expect_equal(nrow(obj$eventData), 1) +}) + +test_that("SplikitObject handles single cell subset", { + test_data <- load_toy_M1_M2_object() + + obj <- SplikitObject$new( + m1 = test_data$m1, + eventData = test_data$eventdata + ) + + obj$subset(cells = 1) + + expect_equal(ncol(obj$m1), 1) +}) + +test_that("make_m2 produces symmetric results for group operations", { + # Verify M2 = group_sum - M1 for each event + test_data <- load_toy_M1_M2_object() + + m2 <- make_m2( + m1_inclusion_matrix = test_data$m1, + eventdata = test_data$eventdata, + verbose = FALSE + ) + + # For any event, M1 + M2 should equal the group sum + # Check first few groups + unique_groups <- unique(test_data$eventdata$group_id)[1:5] + + for (grp in unique_groups) { + grp_events <- which(test_data$eventdata$group_id == grp) + if (length(grp_events) > 1) { + # Group sum should be the same for all events in group + for (i in grp_events) { + m1_val <- test_data$m1[i, 1] + m2_val <- m2[i, 1] + group_sum <- sum(test_data$m1[grp_events, 1]) + expect_equal(m1_val + m2_val, group_sum, info = paste("Group:", grp, "Event:", i)) + } + } + } +}) + +# ============================================================================ +# Integration Tests (from deep analysis recommendations) +# ============================================================================ + +test_that("Full R6 pipeline runs without errors", { + skip_if_not_installed("Matrix") + + test_data <- load_toy_M1_M2_object() + + # Create object + +obj <- splikit( + m1 = test_data$m1, + eventData = test_data$eventdata + ) + + # Compute M2 + obj$makeM2(verbose = FALSE) + + expect_true(!is.null(obj$m2)) + expect_equal(dim(obj$m2), dim(obj$m1)) + + # Find variable events + hve <- obj$findVariableEvents(min_row_sum = 50, verbose = FALSE) + + expect_true(nrow(hve) > 0) + expect_true("events" %in% names(hve)) + expect_true("sum_deviance" %in% names(hve)) + + # Check metadata was updated + expect_true("variableEvents" %in% names(obj$metadata)) +}) + +test_that("SplikitObject works with very small matrices", { + # Edge case: minimal viable input + m1_small <- Matrix::rsparsematrix(10, 5, 0.5) + rownames(m1_small) <- paste0("event_", 1:10) + colnames(m1_small) <- paste0("cell_", 1:5) + + eventdata_small <- data.table::data.table( + event_id = paste0("event_", 1:10), + group_id = rep(c("group1", "group2"), each = 5) + ) + + obj <- SplikitObject$new( + m1 = m1_small, + eventData = eventdata_small + ) + + expect_equal(nrow(obj$m1), 10) + expect_equal(ncol(obj$m1), 5) + + obj$makeM2(verbose = FALSE) + expect_true(!is.null(obj$m2)) +}) + +test_that("n_threads parameter is passed correctly", { + test_data <- load_toy_M1_M2_object() + + m2 <- make_m2( + m1_inclusion_matrix = test_data$m1, + eventdata = test_data$eventdata, + verbose = FALSE + ) + + obj <- SplikitObject$new( + m1 = test_data$m1, + m2 = m2, + eventData = test_data$eventdata + ) + + # Should run without error with multiple threads + result <- obj$findVariableEvents(min_row_sum = 100, n_threads = 2, verbose = FALSE) + expect_true(nrow(result) > 0) +}) diff --git a/vignettes/splikit_manual.Rmd b/vignettes/splikit_manual.Rmd index 62ce0a1..18ccda6 100644 --- a/vignettes/splikit_manual.Rmd +++ b/vignettes/splikit_manual.Rmd @@ -15,13 +15,14 @@ vignette: > 2. [Installation](#installation) 3. [Core Concepts](#core-concepts) 4. [Getting Started](#getting-started) -5. [Main Workflow Functions](#main-workflow-functions) -6. [Feature Selection Functions](#feature-selection-functions) -7. [Utility Functions](#utility-functions) -8. [Complete Workflow Example](#complete-workflow-example) -9. [Advanced Usage](#advanced-usage) -10. [Troubleshooting](#troubleshooting) -11. [Function Reference](#function-reference) +5. [R6 Object-Oriented Interface](#r6-object-oriented-interface) +6. [Main Workflow Functions](#main-workflow-functions) +7. [Feature Selection Functions](#feature-selection-functions) +8. [Utility Functions](#utility-functions) +9. [Complete Workflow Example](#complete-workflow-example) +10. [Advanced Usage](#advanced-usage) +11. [Troubleshooting](#troubleshooting) +12. [Function Reference](#function-reference) --- @@ -175,6 +176,178 @@ head(variable_events[order(-sum_deviance)]) --- +## R6 Object-Oriented Interface + +Starting with version 2.0.0, splikit introduces a modern R6 object-oriented interface through the `SplikitObject` class. This provides a more intuitive and streamlined workflow with method chaining support. + +### Creating a SplikitObject + +```r +library(splikit) + +# Load toy data for demonstration +toy_sj_data <- load_toy_SJ_object() + +# Create SplikitObject from junction abundance data +spl <- SplikitObject$new(junction_ab_object = toy_sj_data) + +# View object summary +print(spl) +``` + +### Complete Workflow with Method Chaining + +The R6 interface supports method chaining for a clean, pipeline-style workflow: + +```r +# Complete analysis pipeline with method chaining +spl <- SplikitObject$new(junction_ab_object = toy_sj_data)$ + makeM1(min_counts = 1)$ + makeM2(n_threads = 4, verbose = TRUE)$ + findVariableEvents(min_row_sum = 30, n_threads = 4) + +# Access results +m1_matrix <- spl$m1 +m2_matrix <- spl$m2 +variable_events <- spl$variableEvents +event_data <- spl$eventData +``` + +### Step-by-Step Workflow + +For more control, you can call methods individually: + +```r +# Create object +spl <- SplikitObject$new(junction_ab_object = toy_sj_data) + +# Step 1: Create M1 matrix +spl$makeM1(min_counts = 1, verbose = TRUE) +print(dim(spl$m1)) # Check dimensions + +# Step 2: Create M2 matrix with multi-threading +spl$makeM2( + n_threads = 4, # Number of OpenMP threads + use_cpp = TRUE, # Use optimized C++ implementation + verbose = TRUE +) +print(dim(spl$m2)) # Same dimensions as M1 + +# Step 3: Find highly variable events +spl$findVariableEvents( + min_row_sum = 30, + n_threads = 4 +) + +# View top variable events +top_events <- spl$variableEvents[order(-sum_deviance)][1:10] +print(top_events) +``` + +### Subsetting SplikitObjects + +SplikitObjects support flexible subsetting by events, cells, or both: + +```r +# Subset by events (rows) +top_100_events <- spl$variableEvents$events[1:100] +spl_subset <- spl$subset(events = top_100_events) + +# Subset by cells (columns) +selected_cells <- colnames(spl$m1)[1:500] +spl_cells <- spl$subset(cells = selected_cells) + +# Subset both events and cells +spl_combined <- spl$subset( + events = top_100_events, + cells = selected_cells +) + +# Check dimensions +print(dim(spl_subset$m1)) +print(dim(spl_cells$m1)) +print(dim(spl_combined$m1)) +``` + +### Working with Existing Matrices + +You can also create a SplikitObject from pre-computed matrices: + +```r +# Load pre-computed M1/M2 data +toy_m1m2 <- load_toy_M1_M2_object() + +# Create SplikitObject from existing matrices +spl <- SplikitObject$new( + m1_matrix = toy_m1m2$m1, + m2_matrix = toy_m1m2$m2 +) + +# Directly find variable events (M1 and M2 already computed) +spl$findVariableEvents(min_row_sum = 50, n_threads = 4) +``` + +### Accessing SplikitObject Fields + +The SplikitObject provides access to all data through fields: + +```r +# Core matrices +spl$m1 # Inclusion matrix (events × cells) +spl$m2 # Exclusion matrix (events × cells) + +# Metadata +spl$eventData # Event metadata data.table +spl$variableEvents # Variable events results + +# Original data +spl$junctionAbObject # Raw junction abundance data +``` + +### R6 vs Function-Based Interface Comparison + +Both interfaces produce identical results. Choose based on your workflow preference: + +**R6 Interface** (recommended for new users): +```r +spl <- SplikitObject$new(junction_ab_object = toy_sj_data)$ + makeM1(min_counts = 1)$ + makeM2(n_threads = 4)$ + findVariableEvents(min_row_sum = 30) + +m1 <- spl$m1 +m2 <- spl$m2 +hve <- spl$variableEvents +``` + +**Function-Based Interface** (traditional): +```r +m1_result <- make_m1(toy_sj_data, min_counts = 1) +m2 <- make_m2(m1_result$m1_inclusion_matrix, m1_result$event_data, n_threads = 4) +hve <- find_variable_events(m1_result$m1_inclusion_matrix, m2, min_row_sum = 30) +``` + +### Performance Features + +The SplikitObject methods support the same performance optimizations as the underlying functions: + +- **Multi-threading**: Set `n_threads` parameter for OpenMP parallelization +- **Memory-efficient M2**: Version 2.0.0 includes a completely rewritten `make_m2` implementation with ~30x memory reduction +- **C++ optimization**: Set `use_cpp = TRUE` (default) for maximum performance + +```r +# Check if OpenMP is available +check_openmp_enabled() + +# Use all available cores for large datasets +n_cores <- parallel::detectCores() - 1 + +spl$makeM2(n_threads = n_cores, use_cpp = TRUE, verbose = TRUE) +spl$findVariableEvents(n_threads = n_cores, min_row_sum = 50) +``` + +--- + ## Main Workflow Functions ### `make_junction_ab()` @@ -307,9 +480,10 @@ The M1 matrix represents inclusion evidence: **Function Signature**: ```r -make_m2(m1_inclusion_matrix, eventdata, batch_size = 5000, - memory_threshold = 2e9, force_fast = FALSE, - multi_thread = FALSE, verbose = FALSE) +make_m2(m1_inclusion_matrix, eventdata, batch_size = 5000, + memory_threshold = 2e9, force_fast = FALSE, + multi_thread = FALSE, n_threads = 1, use_cpp = TRUE, + verbose = FALSE) ``` **Parameters**: @@ -319,8 +493,17 @@ make_m2(m1_inclusion_matrix, eventdata, batch_size = 5000, - `memory_threshold`: Maximum rows before switching to batched processing (default 2e9) - `force_fast`: Force fast processing regardless of memory estimates (default FALSE) - `multi_thread`: Enable parallel processing for batched operations (default FALSE) +- `n_threads`: Number of OpenMP threads for C++ implementation (default 1) +- `use_cpp`: Use optimized C++ implementation (default TRUE, recommended) - `verbose`: Logical for detailed progress reporting (default FALSE) +**Performance Notes (v2.0.0)**: +The C++ implementation has been completely rewritten for version 2.0.0 with significant improvements: +- **~8x faster execution** through two-pass CSC format construction +- **~30x lower memory usage** by eliminating intermediate dense matrices +- Memory usage now scales with output size only: O(nnz_output) + O(n_groups) +- Previous versions used O(n_groups × n_cells) for intermediate storage + **The Exclusion Calculation Logic**: For each coordinate group, M2 values are calculated as: @@ -1147,4 +1330,4 @@ For additional support, examples, and updates, visit the package documentation a --- -*This manual was generated for splikit version 1.0.5 For the most up-to-date information, please refer to the package documentation and vignettes.* +*This manual was generated for splikit version 2.0.0. For the most up-to-date information, please refer to the package documentation and vignettes.*