diff --git a/.Rbuildignore b/.Rbuildignore index 674edbc..540d634 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -11,3 +11,5 @@ ^pkgdown$ ^\.github$ ^build_pkgdown_site\.R$ +^CLAUDE\.md$ +^\.claude$ diff --git a/DESCRIPTION b/DESCRIPTION index 21685a9..16d7de9 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,15 +1,15 @@ Package: splikit -Title: A Toolkit for Analysing RNA Splicing in Single-Cell RNA Sequencing Data +Title: Analysing RNA Splicing in scRNA-Seq Data Version: 2.0.0 Authors@R: person("Arsham", "Mikaeili Namini", , "arsham.mikaeilinamini@mail.mcgill.ca", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-9453-6951")) -Description: A toolkit designed for the analysis of high-dimensional single-cell - splicing data. Provides a framework to extract and work with ratio-based - data structures derived from single-cell RNA sequencing experiments. - Offers both a modern R6 object-oriented interface and direct matrix - manipulation functions. Core functionalities are implemented in C++ via - 'Rcpp' to ensure high performance and scalability on large datasets. +Description: Provides analysis of high-dimensional single-cell splicing data. + Offers a framework to extract and work with ratio-based data structures + derived from single-cell RNA sequencing experiments. Provides both a modern + R6 object-oriented interface and direct matrix manipulation functions. Core + functionalities are implemented in C++ via 'Rcpp' to ensure high performance + and scalability on large datasets. 
URL: https://csglab.github.io/splikit/, https://github.com/csglab/splikit BugReports: https://github.com/csglab/splikit/issues License: MIT + file LICENSE @@ -19,6 +19,6 @@ RoxygenNote: 7.3.3 Imports: Matrix, data.table, methods, stats, Rcpp, R6 LinkingTo: Rcpp, RcppArmadillo Suggests: testthat (>= 3.0.0), knitr, rmarkdown -Config/testthat/edition: 3 VignetteBuilder: knitr +Config/testthat/edition: 3 Depends: R (>= 4.1.0) diff --git a/R/SplikitObject.R b/R/SplikitObject.R index 28a1f53..bff7193 100644 --- a/R/SplikitObject.R +++ b/R/SplikitObject.R @@ -113,15 +113,14 @@ SplikitObject <- R6::R6Class("SplikitObject", #' @param batch_size Number of groups per batch for memory management (default: 5000). #' @param memory_threshold Maximum rows before switching to batched processing. #' @param force_fast Force fast processing regardless of size (default: FALSE). - #' @param multi_thread Use parallel processing for batched operations (default: FALSE). - #' @param n_threads Number of threads for C++ OpenMP parallelization (default: 1). + #' @param n_threads Number of threads for parallel processing (default: 1). #' @param use_cpp Use fast C++ implementation (default: TRUE). #' @param verbose Print progress messages (default: FALSE). #' #' @return Self (invisibly), for method chaining. 
makeM2 = function(batch_size = 5000, memory_threshold = 2e9, - force_fast = FALSE, multi_thread = FALSE, - n_threads = 1, use_cpp = TRUE, verbose = FALSE) { + force_fast = FALSE, n_threads = 1, + use_cpp = TRUE, verbose = FALSE) { if (is.null(self$m1)) { private$error("M1 matrix not initialized") @@ -136,7 +135,6 @@ SplikitObject <- R6::R6Class("SplikitObject", batch_size = batch_size, memory_threshold = memory_threshold, force_fast = force_fast, - multi_thread = multi_thread, n_threads = n_threads, use_cpp = use_cpp, verbose = verbose diff --git a/R/feature_selection.R b/R/feature_selection.R index a831e8f..74734fe 100644 --- a/R/feature_selection.R +++ b/R/feature_selection.R @@ -4,7 +4,7 @@ #' @param m2_matrix A matrix representing the exclusion matrix. Rows are events, columns are barcodes. #' @param n_threads If the module OpenPM is available for your device, the function suggests using multi-thread processing for even faster computation. #' @param min_row_sum A numeric value specifying the minimum row sum threshold for filtering events. Defaults to 50. -#' @param verbose Logical. If \code{TRUE} (default), prints progress and informational messages. +#' @param verbose Logical. If \code{TRUE}, prints progress and informational messages. Default is \code{FALSE}. #' @param ... Additional arguments to be passed. #' #' @return A \code{data.table} containing the events and their corresponding sum deviance values. @@ -19,8 +19,7 @@ #' # printing the results #' print(HVE[order(-sum_deviance)]) #' @export -find_variable_events <- function(m1_matrix, m2_matrix = NULL, min_row_sum = 50, n_threads=1, verbose=TRUE, ...) { - +find_variable_events <- function(m1_matrix, m2_matrix = NULL, min_row_sum = 50, n_threads = 1, verbose = FALSE, ...) 
{ # Handle SplikitObject input if (inherits(m1_matrix, "SplikitObject")) { @@ -72,7 +71,7 @@ find_variable_events <- function(m1_matrix, m2_matrix = NULL, min_row_sum = 50, meta <- temp_current_barcodes libraries <- unique(meta$ID) - cat("There are", length(libraries), "libraries detected...\n") + if (verbose) message("There are ", length(libraries), " libraries detected...") # Initialize deviance sum vector sum_deviances <- numeric(nrow(m1_matrix)) @@ -90,8 +89,8 @@ find_variable_events <- function(m1_matrix, m2_matrix = NULL, min_row_sum = 50, }) deviance_values <- c(deviance_values) names(deviance_values) <- rownames(M1_sub) - if(verbose) { - cat("Calculating the deviances for sample", lib, "has been completed!\n") + if (verbose) { + message("Calculating the deviances for sample ", lib, " has been completed!") } return(deviance_values) }) @@ -115,7 +114,7 @@ find_variable_events <- function(m1_matrix, m2_matrix = NULL, min_row_sum = 50, #' \code{"vst"} uses a variance-stabilizing transformation to identify variable genes. #' \code{"sum_deviance"} computes per-library deviances and combines them with a row variance metric. #' @param n_threads If OpenMP is available for your device, the function suggests using multi-thread processing for even faster computation (only for sum_deviance method). -#' @param verbose Logical. If \code{TRUE} (default), prints progress and informational messages. +#' @param verbose Logical. If \code{TRUE}, prints progress and informational messages. Default is \code{FALSE}. #' @param ... Additional arguments (currently unused). #' #' @return A \code{data.table} containing gene names (column \code{events}) and computed metrics. 
@@ -128,9 +127,9 @@ find_variable_events <- function(m1_matrix, m2_matrix = NULL, min_row_sum = 50, #' #' # getting high variable genes #' HVG_VST <- find_variable_genes(toy_obj$gene_expression, method = "vst") -#' HVG_DEV <- find_variable_genes(toy_obj$gene_expression, -#' method = "sum_deviance") -#' +#' # sum_deviance method +#' HVG_DEV <- find_variable_genes(toy_obj$gene_expression, method = "sum_deviance") +#' #' # Using multi-threading for faster computation (sum_deviance method only) #' HVG_DEV_MT <- find_variable_genes(toy_obj$gene_expression, #' method = "sum_deviance", @@ -145,7 +144,7 @@ find_variable_events <- function(m1_matrix, m2_matrix = NULL, min_row_sum = 50, #' @import Matrix #' @importClassesFrom Matrix dgCMatrix dsCMatrix dgTMatrix dsTMatrix #' @export -find_variable_genes <- function(gene_expression_matrix, method = "vst", n_threads = 1, verbose = TRUE, ...) { +find_variable_genes <- function(gene_expression_matrix, method = "vst", n_threads = 1, verbose = FALSE, ...) { # adding the vst method as the default method <- match.arg(method, choices = c("vst", "sum_deviance")) @@ -155,7 +154,7 @@ find_variable_genes <- function(gene_expression_matrix, method = "vst", n_thread } if (method == "vst") { - if(verbose) cat("The method we are using is vst (Seurat)...\n") + if (verbose) message("The method we are using is vst (Seurat)...") if (!exists("standardizeSparse_variance_vst")) { stop("The function 'standardizeSparse_variance_vst' is not available. 
Check your C++ source files.") } @@ -167,7 +166,7 @@ find_variable_genes <- function(gene_expression_matrix, method = "vst", n_thread rez <- data.table::data.table(events = rownames(gene_expression_matrix), standardize_variance = rez_vector) } else { - if(verbose) cat("The method we are using is like deviance summarion per library...\n") + if (verbose) message("The method we are using is like deviance summation per library...") # Filter rows based on minimum row sum criteria to_keep_features <- which(rowSums(gene_expression_matrix) > 0) @@ -182,7 +181,7 @@ find_variable_genes <- function(gene_expression_matrix, method = "vst", n_thread meta <- temp_current_barcodes libraries <- unique(meta$ID) - cat("There are", length(libraries), "libraries detected...\n") + if (verbose) message("There are ", length(libraries), " libraries detected...") # Initialize deviance sum vector with gene names sum_deviances <- numeric(nrow(gene_expression_matrix)) @@ -207,8 +206,8 @@ find_variable_genes <- function(gene_expression_matrix, method = "vst", n_thread }) deviance_values <- c(deviance_values) names(deviance_values) <- rownames(gene_expression_matrix_sub) - if(verbose) { - cat("Calculating the deviances for sample", lib, "has been completed!\n") + if (verbose) { + message("Calculating the deviances for sample ", lib, " has been completed!") } return(deviance_values) }) diff --git a/R/general_tools.R b/R/general_tools.R index f61dff0..50842f2 100644 --- a/R/general_tools.R +++ b/R/general_tools.R @@ -1,7 +1,12 @@ #' @useDynLib splikit, .registration = TRUE #' @importFrom Rcpp evalCpp +#' @importFrom methods as +#' @importFrom stats na.omit NULL +# Declare global variables used in data.table operations to avoid R CMD check NOTEs +utils::globalVariables(c("unique_mapped", "i", "j", "x_1", "x_tot", "x_2", "group_id", "ID")) + #' Compute Pseudo-Correlation Using Beta-Binomial Model #' #' This function calculates a pseudo R²-like correlation metric using a beta-binomial model #' @param 
+18,8 @@ NULL #' @param m1_inclusion A numeric matrix (dense or sparse) of the same number of rows as `ZDB_matrix`, representing inclusion features. #' @param m2_exclusion A numeric matrix (dense or sparse) of the same number of rows as `ZDB_matrix`, representing exclusion features. #' @param metric Character string specifying which R² metric to compute. Options are "CoxSnell" (default) or "Nagelkerke". -#' @param suppress_warnings Logical. If \code{TRUE} (default), suppresses warnings during any warnings triggered during -#' computation (e.g., due to ill-conditioned inputs) +#' @param suppress_warnings Logical. If \code{TRUE} (default), suppresses warnings during computation (e.g., due to ill-conditioned inputs). +#' @param verbose Logical. If \code{TRUE}, prints progress and informational messages. Default is \code{FALSE}. #' #' @return A `data.table` with the following columns: #' \describe{ @@ -56,9 +61,9 @@ NULL #' print(pseudo_r_square_nagel) #' #' @export -get_pseudo_correlation <- function(ZDB_matrix, m1_inclusion = NULL, m2_exclusion = NULL, metric = "CoxSnell", suppress_warnings=TRUE) { +get_pseudo_correlation <- function(ZDB_matrix, m1_inclusion = NULL, m2_exclusion = NULL, metric = "CoxSnell", suppress_warnings = TRUE, verbose = FALSE) { - # Handle SplikitObject input + # Handle SplikitObject input if (inherits(ZDB_matrix, "SplikitObject")) { # First arg is SplikitObject, second should be ZDB_matrix obj <- ZDB_matrix @@ -99,81 +104,87 @@ get_pseudo_correlation <- function(ZDB_matrix, m1_inclusion = NULL, m2_exclusion stop("m2_exclusion must have the same number of rows as ZDB_matrix.") } - cat("Input dimension check passed. 
Proceeding with computation.\n") - if(suppress_warnings){suppressWarnings({ - - # Determine which C++ function to call based on matrix types - is_m1_sparse <- inherits(m1_inclusion, "Matrix") - is_m2_sparse <- inherits(m2_exclusion, "Matrix") - - if (!is_m1_sparse && !is_m2_sparse) { - # Both dense - ensure they are matrices - correls <- cppBetabinPseudoR2(Z = ZDB_matrix, + if (verbose) message("Input dimension check passed. Proceeding with computation.") + + # Define the computation function + run_computation <- function() { + # Determine which C++ function to call based on matrix types + is_m1_sparse <- inherits(m1_inclusion, "Matrix") + is_m2_sparse <- inherits(m2_exclusion, "Matrix") + + if (!is_m1_sparse && !is_m2_sparse) { + # Both dense - ensure they are matrices + correls <- cppBetabinPseudoR2(Z = ZDB_matrix, + m1 = as.matrix(m1_inclusion), + m2 = as.matrix(m2_exclusion), + metric = metric) + } else if (is_m1_sparse && is_m2_sparse) { + # Both sparse + correls <- cppBetabinPseudoR2_sparse(Z = ZDB_matrix, + m1 = m1_inclusion, + m2 = m2_exclusion, + metric = metric) + } else if (is_m1_sparse && !is_m2_sparse) { + # m1 sparse, m2 dense + correls <- cppBetabinPseudoR2_mixed1(Z = ZDB_matrix, + m1 = m1_inclusion, + m2 = as.matrix(m2_exclusion), + metric = metric) + } else { + # m1 dense, m2 sparse + correls <- cppBetabinPseudoR2_mixed2(Z = ZDB_matrix, + m1 = as.matrix(m1_inclusion), + m2 = m2_exclusion, + metric = metric) + } + + if (is.null(rownames(ZDB_matrix))) { + warning("ZDB_matrix has no row names; assigning default event names.") + events <- paste0("Event_", seq_len(nrow(ZDB_matrix))) + } else { + events <- rownames(ZDB_matrix) + } + + # Calculate null distribution with the same metric and matrix types + if (!is_m1_sparse && !is_m2_sparse) { + null_dist <- cppBetabinPseudoR2(Z = ZDB_matrix[, sample.int(ncol(ZDB_matrix))], m1 = as.matrix(m1_inclusion), m2 = as.matrix(m2_exclusion), metric = metric) - } else if (is_m1_sparse && is_m2_sparse) { - # Both 
sparse - correls <- cppBetabinPseudoR2_sparse(Z = ZDB_matrix, + } else if (is_m1_sparse && is_m2_sparse) { + null_dist <- cppBetabinPseudoR2_sparse(Z = ZDB_matrix[, sample.int(ncol(ZDB_matrix))], m1 = m1_inclusion, m2 = m2_exclusion, metric = metric) - } else if (is_m1_sparse && !is_m2_sparse) { - # m1 sparse, m2 dense - correls <- cppBetabinPseudoR2_mixed1(Z = ZDB_matrix, + } else if (is_m1_sparse && !is_m2_sparse) { + null_dist <- cppBetabinPseudoR2_mixed1(Z = ZDB_matrix[, sample.int(ncol(ZDB_matrix))], m1 = m1_inclusion, m2 = as.matrix(m2_exclusion), metric = metric) - } else { - # m1 dense, m2 sparse - correls <- cppBetabinPseudoR2_mixed2(Z = ZDB_matrix, + } else { + null_dist <- cppBetabinPseudoR2_mixed2(Z = ZDB_matrix[, sample.int(ncol(ZDB_matrix))], m1 = as.matrix(m1_inclusion), m2 = m2_exclusion, metric = metric) - } - - if (is.null(rownames(ZDB_matrix))) { - warning("ZDB_matrix has no row names; assigning default event names.") - events <- paste0("Event_", seq_len(nrow(ZDB_matrix))) - } else { - events <- rownames(ZDB_matrix) - } - - # Calculate null distribution with the same metric and matrix types - if (!is_m1_sparse && !is_m2_sparse) { - null_dist <- cppBetabinPseudoR2(Z = ZDB_matrix[, sample.int(ncol(ZDB_matrix))], - m1 = as.matrix(m1_inclusion), - m2 = as.matrix(m2_exclusion), - metric = metric) - } else if (is_m1_sparse && is_m2_sparse) { - null_dist <- cppBetabinPseudoR2_sparse(Z = ZDB_matrix[, sample.int(ncol(ZDB_matrix))], - m1 = m1_inclusion, - m2 = m2_exclusion, - metric = metric) - } else if (is_m1_sparse && !is_m2_sparse) { - null_dist <- cppBetabinPseudoR2_mixed1(Z = ZDB_matrix[, sample.int(ncol(ZDB_matrix))], - m1 = m1_inclusion, - m2 = as.matrix(m2_exclusion), - metric = metric) - } else { - null_dist <- cppBetabinPseudoR2_mixed2(Z = ZDB_matrix[, sample.int(ncol(ZDB_matrix))], - m1 = as.matrix(m1_inclusion), - m2 = m2_exclusion, - metric = metric) - } - - results <- data.table::data.table( - event = events, - pseudo_correlation = correls, 
- null_distribution = null_dist - - ) - }) + } + + data.table::data.table( + event = events, + pseudo_correlation = correls, + null_distribution = null_dist + ) + } + + # Run computation with or without warning suppression + if (suppress_warnings) { + results <- suppressWarnings(run_computation()) + } else { + results <- run_computation() } results <- na.omit(results) - cat("Computation completed successfully.\n") + if (verbose) message("Computation completed successfully.") return(results) } @@ -184,7 +195,7 @@ get_pseudo_correlation <- function(ZDB_matrix, m1_inclusion = NULL, m2_exclusion #' or a sparse dgCMatrix, via a single Rcpp entry point. Logs progress messages #' to the R console. #' @param M A numeric matrix (base R matrix) or a sparse matrix of class \code{"dgCMatrix"}. -#' @param verbose Logical. If \code{TRUE} (default), prints progress and informational messages. +#' @param verbose Logical. If \code{TRUE}, prints progress and informational messages. Default is \code{FALSE}. #' @return A numeric vector of length \code{nrow(M)} containing the variance of each row. #' @details #' Dispatches in C++ between dense and sparse implementations to avoid unnecessary @@ -203,13 +214,13 @@ get_pseudo_correlation <- function(ZDB_matrix, m1_inclusion = NULL, m2_exclusion #' Only 32-bit integer indices are supported, due to limitations in R's internal matrix representations. #' This function will not work with matrices that exceed the 32-bit integer indexing range. #' @export -get_rowVar <- function(M, verbose=FALSE) { - if (! 
(is.matrix(M) || inherits(M, "dgCMatrix")) ) { +get_rowVar <- function(M, verbose = FALSE) { + if (!(is.matrix(M) || inherits(M, "dgCMatrix"))) { stop("`M` must be either a dense numeric matrix or a dgCMatrix.") } - if(verbose) message("[get_rowVar] Starting computation") + if (verbose) message("[get_rowVar] Starting computation") result <- rowVariance_cpp(M) - if(verbose) message("[get_rowVar] Computation finished") + if (verbose) message("[get_rowVar] Computation finished") result } @@ -220,6 +231,7 @@ get_rowVar <- function(M, verbose=FALSE) { #' @param X A numeric matrix where rows are observations and columns are features. #' @param cluster_assignments An integer vector of cluster assignments, which must be the same length as the number of rows in \code{X}. #' @param n_threads Number of threads to use for parallel processing. +#' @param verbose Logical. If \code{TRUE}, prints progress and informational messages. Default is \code{FALSE}. #' #' @note This process can be very slow for large matrices if single-threaded. Use multiple threads to take advantage of parallel computation for significantly faster results. 
#' @@ -237,13 +249,13 @@ get_rowVar <- function(M, verbose=FALSE) { #' print(score) #' #' @export -get_silhouette_mean <- function(X, cluster_assignments, n_threads = 1) { +get_silhouette_mean <- function(X, cluster_assignments, n_threads = 1, verbose = FALSE) { stopifnot(is.matrix(X), is.numeric(X)) stopifnot(is.integer(cluster_assignments) || is.numeric(cluster_assignments)) stopifnot(nrow(X) == length(cluster_assignments)) - cat("[silhouette_avg] Starting computation...\n") - cat(sprintf("[silhouette_avg] Using %d threads...", n_threads), "\n") + if (verbose) message("[silhouette_avg] Starting computation...") + if (verbose) message("[silhouette_avg] Using ", n_threads, " threads...") score <- silhouette_avg(X, as.integer(cluster_assignments), as.integer(n_threads)) return(score) diff --git a/R/star_solo_processing.R b/R/star_solo_processing.R index 09b8c6b..8c5e431 100644 --- a/R/star_solo_processing.R +++ b/R/star_solo_processing.R @@ -41,7 +41,7 @@ make_junction_ab <- function(STARsolo_SJ_dirs, white_barcode_lists = NULL, sampl # Automatically set use_internal_whitelist to FALSE if white_barcode_lists is provided if (!is.null(white_barcode_lists)) { use_internal_whitelist <- FALSE - if(verbose) cat("External whitelist provided: automatically disabling internal whitelist\n") + if (verbose) message("External whitelist provided: automatically disabling internal whitelist") } # If white_barcode_lists is NULL, create a list of NULL values @@ -69,12 +69,12 @@ make_junction_ab <- function(STARsolo_SJ_dirs, white_barcode_lists = NULL, sampl internal_whitelist_dir <- file.path(STARsolo_SJ_dir, "..", "Gene", "filtered", "barcodes.tsv") # Debug: Print paths if verbose - if(verbose) { - cat("|-- Debug paths for sample:", sample_id, "\n") - cat(" Matrix file:", mtx_dir, "\n") - cat(" Feature file:", feature_dir, "\n") - cat(" Barcodes file:", barcodes_dir, "\n") - cat(" Internal whitelist:", internal_whitelist_dir, "\n") + if (verbose) { + message("|-- Debug paths for 
sample: ", sample_id) + message(" Matrix file: ", mtx_dir) + message(" Feature file: ", feature_dir) + message(" Barcodes file: ", barcodes_dir) + message(" Internal whitelist: ", internal_whitelist_dir) } # Check for required files @@ -92,18 +92,18 @@ make_junction_ab <- function(STARsolo_SJ_dirs, white_barcode_lists = NULL, sampl } # Read splicing data with error handling - if(verbose) cat("|-- Processing sample: ", sample_id, "\n") + if (verbose) message("|-- Processing sample: ", sample_id) tryCatch({ mtx <- Matrix::readMM(mtx_dir) - if(verbose) cat(" Matrix dimensions: ", nrow(mtx), "x", ncol(mtx), "\n") + if (verbose) message(" Matrix dimensions: ", nrow(mtx), "x", ncol(mtx)) }, error = function(e) { stop("Error reading matrix file for sample ", sample_id, ": ", e$message, call. = FALSE) }) tryCatch({ raw_brc <- data.table::fread(barcodes_dir, header = FALSE, showProgress = FALSE) - if(verbose) cat(" Barcodes read: ", nrow(raw_brc), "\n") + if (verbose) message(" Barcodes read: ", nrow(raw_brc)) }, error = function(e) { stop("Error reading barcodes file for sample ", sample_id, ": ", e$message, call. = FALSE) }) @@ -115,7 +115,7 @@ make_junction_ab <- function(STARsolo_SJ_dirs, white_barcode_lists = NULL, sampl col.names = c('chr', 'start', 'end', 'strand', "intron_motif", 'is_annot', 'unique_mapped'), showProgress = FALSE ) - if(verbose) cat(" Features read: ", nrow(feature), "\n") + if (verbose) message(" Features read: ", nrow(feature)) }, error = function(e) { stop("Error reading feature file for sample ", sample_id, ": ", e$message, call. 
= FALSE) }) @@ -136,8 +136,8 @@ make_junction_ab <- function(STARsolo_SJ_dirs, white_barcode_lists = NULL, sampl if (file.exists(internal_whitelist_dir)) { tryCatch({ white_barcode_list <- data.table::fread(internal_whitelist_dir, header = FALSE, showProgress = FALSE)$V1 - if(verbose) cat(" |-- Using STARsolo internal whitelist for sample: ", sample_id, - " (", length(white_barcode_list), " barcodes)\n") + if (verbose) message(" |-- Using STARsolo internal whitelist for sample: ", sample_id, + " (", length(white_barcode_list), " barcodes)") }, error = function(e) { stop("Error reading internal whitelist for sample ", sample_id, ": ", e$message, call. = FALSE) }) @@ -173,8 +173,8 @@ make_junction_ab <- function(STARsolo_SJ_dirs, white_barcode_lists = NULL, sampl } eliminated_junctions <- old_junction_numbers - nrow(feature) - if(verbose && eliminated_junctions > 0) { - cat(" |-- Eliminated ", eliminated_junctions, " junctions due to only multi-mapped records\n") + if (verbose && eliminated_junctions > 0) { + message(" |-- Eliminated ", eliminated_junctions, " junctions due to only multi-mapped records") } } @@ -197,10 +197,10 @@ make_junction_ab <- function(STARsolo_SJ_dirs, white_barcode_lists = NULL, sampl stop("All barcodes were removed after trimming for sample: ", sample_id, call. 
= FALSE) } - if(verbose) cat("| |-- Trimmed junction abundance matrix for sample:", sample_id, - "(", final_barcodes, " barcodes remaining)\n") + if (verbose) message("| |-- Trimmed junction abundance matrix for sample: ", sample_id, + " (", final_barcodes, " barcodes remaining)") } else { - if(verbose) cat("| |-- No barcode filtration applied for sample: ", sample_id, "\n") + if (verbose) message("| |-- No barcode filtration applied for sample: ", sample_id) } # Append sample_id to column names @@ -220,7 +220,7 @@ make_junction_ab <- function(STARsolo_SJ_dirs, white_barcode_lists = NULL, sampl stringsAsFactors = FALSE ) - if(verbose) cat("| +-- Finished processing sample: ", sample_id, "\n") + if (verbose) message("| +-- Finished processing sample: ", sample_id) return(list(result = m1, summary = summary_row)) } @@ -238,8 +238,10 @@ make_junction_ab <- function(STARsolo_SJ_dirs, white_barcode_lists = NULL, sampl summary_table <- do.call(rbind, lapply(results, function(x) x$summary)) # Print summary table - cat("\nSummary of Processed Samples in M1 matrix:\n") - print(summary_table) + if (verbose) { + message("\nSummary of Processed Samples in M1 matrix:") + print(summary_table) + } # Always return a named list, even for a single sample names(final_results) <- unlist(sample_ids) @@ -298,7 +300,7 @@ make_m1 <- function(junction_ab_object, min_counts = 1, verbose = FALSE) { stop("`min_counts` must be a single non-negative number.", call. = FALSE) } - cat("Starting M1 matrix creation...\n") + if (verbose) message("Starting M1 matrix creation...") # Extract and combine all eventdata all_in_one_eventdata <- tryCatch({ @@ -320,11 +322,11 @@ make_m1 <- function(junction_ab_object, min_counts = 1, verbose = FALSE) { stop("`eventdata` must contain columns: ", paste(missing_cols, collapse = ", "), call. 
= FALSE) } - cat("Combined eventdata from", length(junction_ab_object), "samples\n") + if (verbose) message("Combined eventdata from ", length(junction_ab_object), " samples") # Remove sample_id and deduplicate events all_junctions <- unique(all_in_one_eventdata$row_names_mtx) - cat("Found", length(all_junctions), "unique junctions\n") + if (verbose) message("Found ", length(all_junctions), " unique junctions") # Create a copy and remove sample_id if it exists temp_eventdata <- data.table::copy(all_in_one_eventdata) @@ -340,7 +342,7 @@ make_m1 <- function(junction_ab_object, min_counts = 1, verbose = FALSE) { data.table::setDT(temp_eventdata) } - cat("Creating coordinate groups...\n") + if (verbose) message("Creating coordinate groups...") # Group by start and end coordinates tryCatch({ @@ -360,8 +362,8 @@ make_m1 <- function(junction_ab_object, min_counts = 1, verbose = FALSE) { temp_eventdata_grouped <- temp_eventdata if (verbose) { - cat("Start coordinate groups:", max(temp_eventdata_grouped$start_cor_group_id, na.rm = TRUE), "\n") - cat("End coordinate groups:", max(temp_eventdata_grouped$end_cor_group_id, na.rm = TRUE), "\n") + message("Start coordinate groups: ", max(temp_eventdata_grouped$start_cor_group_id, na.rm = TRUE)) + message("End coordinate groups: ", max(temp_eventdata_grouped$end_cor_group_id, na.rm = TRUE)) } # Create event data for start coordinate groups @@ -394,8 +396,8 @@ make_m1 <- function(junction_ab_object, min_counts = 1, verbose = FALSE) { new = c("raw_row_names_mtx", "row_names_mtx")) } - cat("Start coordinate alternative events:", nrow(temp_eventdata_grouped_start), "\n") - cat("End coordinate alternative events:", nrow(temp_eventdata_grouped_end), "\n") + if (verbose) message("Start coordinate alternative events: ", nrow(temp_eventdata_grouped_start)) + if (verbose) message("End coordinate alternative events: ", nrow(temp_eventdata_grouped_end)) # Combine start and end groups if (nrow(temp_eventdata_grouped_start) == 0 && 
nrow(temp_eventdata_grouped_end) == 0) { @@ -426,7 +428,7 @@ make_m1 <- function(junction_ab_object, min_counts = 1, verbose = FALSE) { new = c("group_id", "group_count")) } - cat("Combined eventdata has", nrow(eventdata), "alternative splicing events\n") + if (verbose) message("Combined eventdata has ", nrow(eventdata), " alternative splicing events") # Create a table for all events all_events <- eventdata[, .(raw_row_names_mtx, row_names_mtx)] @@ -434,7 +436,7 @@ make_m1 <- function(junction_ab_object, min_counts = 1, verbose = FALSE) { # Process and merge all junction abundance matrices process_junction_ab_matrices <- function(junction_ab_object, all_events, eventdata, min_counts, verbose) { - cat("Processing junction abundance matrices...\n") + if (verbose) message("Processing junction abundance matrices...") # Initialize the final merged matrix m1_raw <- Matrix::sparseMatrix( @@ -445,7 +447,7 @@ make_m1 <- function(junction_ab_object, min_counts = 1, verbose = FALSE) { ) for (j in seq_along(junction_ab_object)) { - if (verbose) cat("Processing sample", j, "of", length(junction_ab_object), "\n") + if (verbose) message("Processing sample ", j, " of ", length(junction_ab_object)) tryCatch({ # Extract current matrix and events @@ -525,7 +527,7 @@ make_m1 <- function(junction_ab_object, min_counts = 1, verbose = FALSE) { # Process all matrices m1 <- process_junction_ab_matrices(junction_ab_object, all_events, eventdata, min_counts, verbose) - cat("Applying count threshold filtering...\n") + if (verbose) message("Applying count threshold filtering...") # Filter based on rowSums threshold row_sums <- Matrix::rowSums(m1) @@ -539,15 +541,15 @@ make_m1 <- function(junction_ab_object, min_counts = 1, verbose = FALSE) { m1_filtered <- m1[events_to_keep, , drop = FALSE] eventdata_filtered <- eventdata[events_to_keep, ] - cat("Filtered from", nrow(m1), "to", nrow(m1_filtered), "events\n") - cat("Events removed:", sum(!events_to_keep), "\n") + if (verbose) message("Filtered 
from ", nrow(m1), " to ", nrow(m1_filtered), " events") + if (verbose) message("Events removed: ", sum(!events_to_keep)) # Verify matrix row order if (!identical(rownames(m1_filtered), eventdata_filtered$row_names_mtx)) { m1_filtered <- m1_filtered[eventdata_filtered$row_names_mtx, , drop = FALSE] } - cat("Finished processing M1.\n") + if (verbose) message("Finished processing M1.") # Return results with summary result <- list( @@ -563,11 +565,13 @@ make_m1 <- function(junction_ab_object, min_counts = 1, verbose = FALSE) { ) ) - cat("\nSummary:\n") - cat(" Input junctions:", result$summary$total_events_input, "\n") - cat(" Alternative splicing events:", result$summary$alternative_events_found, "\n") - cat(" Events passing threshold:", result$summary$events_passing_threshold, "\n") - cat(" Total cells:", result$summary$total_cells, "\n") + if (verbose) { + message("\nSummary:") + message(" Input junctions: ", result$summary$total_events_input) + message(" Alternative splicing events: ", result$summary$alternative_events_found) + message(" Events passing threshold: ", result$summary$events_passing_threshold) + message(" Total cells: ", result$summary$total_cells) + } return(result) } @@ -586,10 +590,8 @@ make_m1 <- function(junction_ab_object, min_counts = 1, verbose = FALSE) { #' summary before switching to batched processing (default: 2e9, which is ~93% of 2^31). #' @param force_fast A logical flag to force fast processing regardless of size estimates (default: FALSE). #' WARNING: This may cause memory errors on large datasets. -#' @param multi_thread A logical flag to enable parallel processing for batched operations (default: FALSE). -#' Only used when batched processing is triggered. Requires parallel package. -#' @param n_threads Number of threads for C++ parallel processing (default: 1). -#' Only used when use_cpp = TRUE. +#' @param n_threads Number of threads for parallel processing (default: 1). +#' Used for C++ implementation and batched R processing. 
Values > 1 require parallel package for batched mode. #' @param use_cpp Logical flag to use fast C++ implementation (default: TRUE). #' Falls back to R implementation if FALSE. #' @param verbose A logical flag for detailed progress reporting (default: FALSE). @@ -612,8 +614,7 @@ make_m1 <- function(junction_ab_object, min_counts = 1, verbose = FALSE) { #' @export make_m2 <- function(m1_inclusion_matrix, eventdata, batch_size = 5000, memory_threshold = 2e9, force_fast = FALSE, - multi_thread = FALSE, n_threads = 1, use_cpp = TRUE, - verbose = FALSE) { + n_threads = 1, use_cpp = TRUE, verbose = FALSE) { # Input validation if (missing(m1_inclusion_matrix) || !inherits(m1_inclusion_matrix, "sparseMatrix")) { @@ -634,29 +635,32 @@ make_m2 <- function(m1_inclusion_matrix, eventdata, batch_size = 5000, if (!is.logical(force_fast) || length(force_fast) != 1) { stop("Error: 'force_fast' must be a logical value (TRUE or FALSE).", call. = FALSE) } - if (!is.logical(multi_thread) || length(multi_thread) != 1) { - stop("Error: 'multi_thread' must be a logical value (TRUE or FALSE).", call. = FALSE) + if (!is.numeric(n_threads) || length(n_threads) != 1 || n_threads < 1) { + stop("Error: 'n_threads' must be a positive integer.", call. = FALSE) } + n_threads <- as.integer(n_threads) - # Check for parallel package if multi_thread is requested - if (multi_thread && !requireNamespace("parallel", quietly = TRUE)) { - stop("Error: 'parallel' package is required for multi_thread=TRUE but is not installed.", call. = FALSE) + # Check for parallel package if multi-threading is requested + if (n_threads > 1 && !requireNamespace("parallel", quietly = TRUE)) { + stop("Error: 'parallel' package is required for n_threads > 1 but is not installed.", call. 
= FALSE) } - cat("Starting M2 matrix creation...\n") + if (verbose) message("Starting M2 matrix creation...") # Use C++ implementation if requested if (use_cpp) { - cat("+-- Using C++ implementation for faster computation\n") + if (verbose) message("+-- Using C++ implementation for faster computation") # Convert group_id to numeric indices (0-based for C++) unique_groups <- unique(eventdata$group_id) group_map <- setNames(seq_along(unique_groups) - 1L, unique_groups) group_ids_numeric <- as.integer(group_map[eventdata$group_id]) - cat("| |-- Events: ", nrow(m1_inclusion_matrix), "\n") - cat("| |-- Cells: ", ncol(m1_inclusion_matrix), "\n") - cat("| +-- Groups: ", length(unique_groups), "\n") + if (verbose) { + message("| |-- Events: ", nrow(m1_inclusion_matrix)) + message("| |-- Cells: ", ncol(m1_inclusion_matrix)) + message("| +-- Groups: ", length(unique_groups)) + } # Call C++ function M2 <- make_m2_cpp_parallel( @@ -669,7 +673,7 @@ make_m2 <- function(m1_inclusion_matrix, eventdata, batch_size = 5000, rownames(M2) <- rownames(m1_inclusion_matrix) colnames(M2) <- colnames(m1_inclusion_matrix) - cat("Finished M2 matrix creation.\n") + if (verbose) message("Finished M2 matrix creation.") return(M2) } @@ -687,7 +691,7 @@ make_m2 <- function(m1_inclusion_matrix, eventdata, batch_size = 5000, stop("Error creating dummy row: ", e$message, call. 
= FALSE) }) - cat("+-- Step 1 | Modifying the m1_inclusion_matrix\n") + if (verbose) message("+-- Step 1 | Modifying the m1_inclusion_matrix") # Add dummy group to group_ids dummy_group <- data.table::data.table(i = nrow(m1_inclusion_matrix), group_id = "dummy") @@ -701,19 +705,23 @@ make_m2 <- function(m1_inclusion_matrix, eventdata, batch_size = 5000, unique_groups <- unique(group_ids$group_id) num_groups <- length(unique_groups) - cat("|-- Dataset Information:\n") - cat("| |-- Number of barcodes: ", formatC(num_barcodes, format = "d", big.mark = ","), "\n") - cat("| |-- Number of events: ", formatC(num_events, format = "d", big.mark = ","), "\n") - cat("| +-- Number of groups: ", formatC(num_groups, format = "d", big.mark = ","), "\n") + if (verbose) { + message("|-- Dataset Information:") + message("| |-- Number of barcodes: ", formatC(num_barcodes, format = "d", big.mark = ",")) + message("| |-- Number of events: ", formatC(num_events, format = "d", big.mark = ",")) + message("| +-- Number of groups: ", formatC(num_groups, format = "d", big.mark = ",")) + } # Estimate memory requirements for the fast approach - cat("+-- Step 2 | Analyzing memory requirements\n") + if (verbose) message("+-- Step 2 | Analyzing memory requirements") # Get the number of non-zero elements in the sparse matrix nnz_elements <- Matrix::nnzero(m1_inclusion_matrix) - cat("| |-- Non-zero elements in matrix: ", formatC(nnz_elements, format = "d", big.mark = ","), "\n") - cat("| |-- Memory threshold: ", formatC(memory_threshold, format = "d", big.mark = ","), " rows\n") + if (verbose) { + message("| |-- Non-zero elements in matrix: ", formatC(nnz_elements, format = "d", big.mark = ",")) + message("| |-- Memory threshold: ", formatC(memory_threshold, format = "d", big.mark = ","), " rows") + } # Estimate the size of operations based on matrix sparsity and group structure # This is a conservative estimate based on the summary size and potential Cartesian products @@ -739,16 +747,16 @@ 
make_m2 <- function(m1_inclusion_matrix, eventdata, batch_size = 5000, # Handle NA or overflow cases if (is.na(estimated_operations) || is.infinite(estimated_operations)) { estimated_operations <- Inf - cat("| |-- WARNING: Calculation overflow detected - dataset is very large\n") + if (verbose) message("| |-- WARNING: Calculation overflow detected - dataset is very large") } if (verbose) { - cat("| |-- Maximum group size: ", formatC(max_group_size, format = "d", big.mark = ","), "\n") - cat("| |-- Average group size: ", round(avg_group_size, 2), "\n") + message("| |-- Maximum group size: ", formatC(max_group_size, format = "d", big.mark = ",")) + message("| |-- Average group size: ", round(avg_group_size, 2)) if (is.finite(estimated_operations)) { - cat("| |-- Estimated operation size: ", formatC(estimated_operations, format = "d", big.mark = ","), "\n") + message("| |-- Estimated operation size: ", formatC(estimated_operations, format = "d", big.mark = ",")) } else { - cat("| |-- Estimated operation size: Very large (overflow detected)\n") + message("| |-- Estimated operation size: Very large (overflow detected)") } } @@ -761,61 +769,77 @@ make_m2 <- function(m1_inclusion_matrix, eventdata, batch_size = 5000, use_batched_approach <- should_use_batched && !force_fast if (force_fast && should_use_batched) { - if (is.finite(estimated_operations)) { - cat("| |-- WARNING: force_fast=TRUE but estimated size (", - formatC(estimated_operations, format = "d", big.mark = ","), - ") exceeds threshold!\n") - } else { - cat("| |-- WARNING: force_fast=TRUE but dataset size suggests overflow risk!\n") + if (verbose) { + if (is.finite(estimated_operations)) { + message("| |-- WARNING: force_fast=TRUE but estimated size (", + formatC(estimated_operations, format = "d", big.mark = ","), + ") exceeds threshold!") + } else { + message("| |-- WARNING: force_fast=TRUE but dataset size suggests overflow risk!") + } + message("| |-- Proceeding with fast approach as requested - monitor 
memory usage") } - cat("| |-- Proceeding with fast approach as requested - monitor memory usage\n") } if (use_batched_approach) { - if (is.finite(estimated_operations)) { - cat("| |-- Estimated size exceeds memory threshold!\n") - } else { - cat("| |-- Dataset size suggests high overflow risk!\n") + if (verbose) { + if (is.finite(estimated_operations)) { + message("| |-- Estimated size exceeds memory threshold!") + } else { + message("| |-- Dataset size suggests high overflow risk!") + } + message("| +-- Automatically switching to batched processing approach") } - cat("| +-- Automatically switching to batched processing approach\n") # Call the batched processing function result <- .make_m2_batched(m1_inclusion_matrix, group_ids, batch_size, - unique_groups, multi_thread, verbose) + unique_groups, n_threads, verbose) } else { - if (force_fast) { - cat("| |-- Using fast processing approach (forced by user)\n") - } else { - cat("| |-- Memory requirements within limits\n") - cat("| +-- Using fast processing approach\n") + if (verbose) { + if (force_fast) { + message("| |-- Using fast processing approach (forced by user)") + } else { + message("| |-- Memory requirements within limits") + message("| +-- Using fast processing approach") + } } # Call the fast processing function result <- .make_m2_fast(m1_inclusion_matrix, group_ids, verbose) } - cat("Finished M2 matrix creation.\n") + if (verbose) message("Finished M2 matrix creation.") return(result) } -#' @keywords internal +#' Fast M2 Processing (Internal Function) +#' +#' Implements the original fast approach using data.table operations. +#' This approach creates the full operation in memory at once. +#' +#' @param m1_inclusion_matrix The M1 inclusion matrix. +#' @param group_ids Data table with group IDs. +#' @param verbose Logical for verbose output. +#' +#' @return A sparse M2 matrix. 
+#' @noRd .make_m2_fast <- function(m1_inclusion_matrix, group_ids, verbose) { - cat("+-- Step 3 | Creating M2 (fast approach)\n") + if (verbose) message("+-- Step 3 | Creating M2 (fast approach)") tryCatch({ # Convert m1_inclusion_matrix to data.table m1 <- summary(m1_inclusion_matrix) |> data.table::as.data.table() data.table::setnames(m1, "x", "x_1") - if (verbose) cat("| |-- Converted matrix to data.table format\n") + if (verbose) message("| |-- Converted matrix to data.table format") # Merge group information m1 <- merge(m1, group_ids, by = "i") m1[, x_tot := sum(x_1), .(group_id, j)] m_tot <- m1[, .(group_id, j, x_tot)] |> unique() - if (verbose) cat("| |-- Calculated group totals\n") + if (verbose) message("| |-- Calculated group totals") # Filter and merge relevant data m_tot <- m_tot[x_tot > 0] @@ -824,12 +848,12 @@ make_m2 <- function(m1_inclusion_matrix, eventdata, batch_size = 5000, m_tot[is.na(x_1), x_1 := 0] m_tot[, x_2 := x_tot - x_1] - if (verbose) cat("| |-- Created final calculation table\n") + if (verbose) message("| |-- Created final calculation table") # Create sparse matrix for M2_train M2_train <- m_tot[, Matrix::sparseMatrix(i = i, j = j, x = x_2)] - if (verbose) cat("| +-- Generated M2 sparse matrix\n") + if (verbose) message("| +-- Generated M2 sparse matrix") }, error = function(e) { if (grepl("2\\^31", e$message) || grepl("vecseq", e$message)) { @@ -840,7 +864,7 @@ make_m2 <- function(m1_inclusion_matrix, eventdata, batch_size = 5000, } }) - cat("+-- Step 4 | Finalizing M2 creation\n") + if (verbose) message("+-- Step 4 | Finalizing M2 creation") # Set row and column names rownames(M2_train) <- rownames(m1_inclusion_matrix) @@ -849,26 +873,39 @@ make_m2 <- function(m1_inclusion_matrix, eventdata, batch_size = 5000, # Remove dummy row from M2_train M2 <- M2_train[-nrow(M2_train), ] - cat("+-- All done!\n") + if (verbose) message("+-- All done!") return(M2) } -#' @keywords internal +#' Batched M2 Processing (Internal Function) +#' +#' 
Implements the batched triplet combination approach for memory-efficient processing. +#' Processes groups in batches using lapply/mclapply and combines results efficiently. +#' +#' @param m1_inclusion_matrix The M1 inclusion matrix. +#' @param group_ids Data table with group IDs. +#' @param batch_size Number of groups per batch. +#' @param unique_groups Vector of unique group identifiers. +#' @param n_threads Number of threads for parallel processing. +#' @param verbose Logical for verbose output. +#' +#' @return A sparse M2 matrix. +#' @noRd .make_m2_batched <- function(m1_inclusion_matrix, group_ids, batch_size, - unique_groups, multi_thread, verbose) { + unique_groups, n_threads, verbose) { - cat("+-- Step 3 | Creating M2 (batched processing approach)\n") + if (verbose) message("+-- Step 3 | Creating M2 (batched processing approach)") # Prepare batch processing n_groups <- length(unique_groups) n_batches <- ceiling(n_groups / batch_size) - cat("| |-- Processing ", n_groups, " groups in ", n_batches, " batches\n") + if (verbose) message("| |-- Processing ", n_groups, " groups in ", n_batches, " batches") - if (multi_thread) { - cat("| |-- Using parallel processing with mclapply\n") + if (n_threads > 1) { + if (verbose) message("| |-- Using parallel processing with mclapply (", n_threads, " threads)") } else { - cat("| |-- Using sequential processing with lapply\n") + if (verbose) message("| |-- Using sequential processing with lapply") } # Store target dimensions for final sparse matrix creation @@ -885,7 +922,7 @@ make_m2 <- function(m1_inclusion_matrix, eventdata, batch_size = 5000, m1[, x_tot := sum(x_1), .(group_id, j)] group_cols <- unique(m1[x_tot > 0, .(group_id, j, x_tot)]) - if (verbose) cat("| |-- Pre-calculated group totals for all groups\n") + if (verbose) message("| |-- Pre-calculated group totals for all groups") }, error = function(e) { stop("Error in initial data preparation for batched processing: ", e$message, call. 
= FALSE) @@ -899,8 +936,8 @@ make_m2 <- function(m1_inclusion_matrix, eventdata, batch_size = 5000, current_groups <- unique_groups[start_idx:end_idx] if (verbose) { - cat("| |---- Processing batch ", batch_idx, " of ", n_batches, - " (groups ", start_idx, " to ", end_idx, ")\n") + message("| |---- Processing batch ", batch_idx, " of ", n_batches, + " (groups ", start_idx, " to ", end_idx, ")") } # Get subset for current batch @@ -930,8 +967,8 @@ make_m2 <- function(m1_inclusion_matrix, eventdata, batch_size = 5000, # Process all batches using lapply or mclapply batch_triplets <- tryCatch({ - if (multi_thread) { - parallel::mclapply(1:n_batches, process_batch, mc.cores = parallel::detectCores()) + if (n_threads > 1) { + parallel::mclapply(1:n_batches, process_batch, mc.cores = n_threads) } else { lapply(1:n_batches, process_batch) } @@ -940,7 +977,7 @@ make_m2 <- function(m1_inclusion_matrix, eventdata, batch_size = 5000, }) # Combine all triplets and create final sparse matrix - cat("| +---- Combining all triplets and creating final sparse matrix\n") + if (verbose) message("| +---- Combining all triplets and creating final sparse matrix") tryCatch({ # Combine all triplet data.tables @@ -966,7 +1003,7 @@ make_m2 <- function(m1_inclusion_matrix, eventdata, batch_size = 5000, stop("Error combining triplets and creating final matrix: ", e$message, call. = FALSE) }) - cat("+-- Step 4 | Finalizing M2 creation\n") + if (verbose) message("+-- Step 4 | Finalizing M2 creation") # Set row and column names rownames(M2_train) <- rownames(m1_inclusion_matrix) @@ -975,7 +1012,7 @@ make_m2 <- function(m1_inclusion_matrix, eventdata, batch_size = 5000, # Remove the dummy row M2 <- M2_train[-nrow(M2_train), ] - cat("+-- All done!\n") + if (verbose) message("+-- All done!") return(M2) } @@ -1076,6 +1113,7 @@ make_eventdata_plus <- function(eventdata, GTF_file_direction) { #' @param use_internal_whitelist Logical (default \code{TRUE}). 
If \code{TRUE} and \code{whitelist_barcodes} is \code{NULL}, #' the function will attempt to use the default filtered barcode list from the input directory. #' If \code{FALSE}, no internal filtration will be applied unless a whitelist is explicitly provided. +#' @param verbose Logical. If \code{TRUE}, prints progress and informational messages. Default is \code{FALSE}. #' #' @return #' If a single sample is provided, returns a sparse matrix of class \code{"dgCMatrix"} with genes as rows and barcodes as columns. @@ -1094,7 +1132,7 @@ make_eventdata_plus <- function(eventdata, GTF_file_direction) { #' Requires the \pkg{Matrix} package for sparse matrix handling and potentially \pkg{data.table} for efficient I/O. #' #' @export -make_gene_count <- function(expression_dirs, sample_ids, whitelist_barcodes = NULL, use_internal_whitelist = TRUE) { +make_gene_count <- function(expression_dirs, sample_ids, whitelist_barcodes = NULL, use_internal_whitelist = TRUE, verbose = FALSE) { # Handle single sample input by converting to lists if (!is.list(expression_dirs)) expression_dirs <- list(expression_dirs) @@ -1127,7 +1165,7 @@ make_gene_count <- function(expression_dirs, sample_ids, whitelist_barcodes = NU if (!file.exists(expression_features_dir)) stop("No features file found for sample: ", sample_id, call. 
= FALSE) # Read gene expression data - cat("|-- Processing gene expression data for sample: ", sample_id, "\n") + if (verbose) message("|-- Processing gene expression data for sample: ", sample_id) g_mtx <- Matrix::readMM(expression_matrix_dir) g_brc <- data.table::fread(expression_barcodes_dir, header = FALSE, showProgress = FALSE)$V1 g_feature <- data.table::fread(expression_features_dir, header = FALSE, showProgress = FALSE) @@ -1138,10 +1176,10 @@ make_gene_count <- function(expression_dirs, sample_ids, whitelist_barcodes = NU # Apply barcode filtration if (!is.null(whitelist_barcode)) { - cat("| |-- Applying provided whitelist for sample: ", sample_id, "\n") + if (verbose) message("| |-- Applying provided whitelist for sample: ", sample_id) } else if (use_internal_whitelist && file.exists(filtered_barcodes_dir)) { whitelist_barcode <- data.table::fread(filtered_barcodes_dir, header = FALSE, showProgress = FALSE)$V1 - cat("| |-- Using filtered barcodes for sample: ", sample_id, "\n") + if (verbose) message("| |-- Using filtered barcodes for sample: ", sample_id) } if (!is.null(whitelist_barcode)) { @@ -1156,10 +1194,10 @@ make_gene_count <- function(expression_dirs, sample_ids, whitelist_barcodes = NU stop("All barcodes were removed after filtration for sample: ", sample_id, call. 
= FALSE) } - cat("| |-- Filtered barcodes applied for sample: ", sample_id, - " (Remaining barcodes: ", final_barcodes, ")\n") + if (verbose) message("| |-- Filtered barcodes applied for sample: ", sample_id, + " (Remaining barcodes: ", final_barcodes, ")") } else { - cat("| |-- No barcode filtration applied for sample: ", sample_id, "\n") + if (verbose) message("| |-- No barcode filtration applied for sample: ", sample_id) } # Append sample_id to column names @@ -1175,7 +1213,7 @@ make_gene_count <- function(expression_dirs, sample_ids, whitelist_barcodes = NU # Return processed matrix and summary g_mtx <- as(g_mtx, "CsparseMatrix") - cat("| +-- Finished processing gene expression data for sample: ", sample_id, "\n") + if (verbose) message("| +-- Finished processing gene expression data for sample: ", sample_id) return(list(gene_expression = g_mtx, summary = summary_row)) } @@ -1193,8 +1231,10 @@ make_gene_count <- function(expression_dirs, sample_ids, whitelist_barcodes = NU summary_table <- do.call(rbind, lapply(results, function(x) x$summary)) # Print summary table - cat("\nSummary of Processed Samples:\n") - print(summary_table) + if (verbose) { + message("\nSummary of Processed Samples:") + print(summary_table) + } # Return results if (length(final_results) == 1) { @@ -1229,6 +1269,8 @@ make_gene_count <- function(expression_dirs, sample_ids, whitelist_barcodes = NU #' @param merge_counts Logical (default \code{FALSE}). If \code{TRUE}, spliced and unspliced matrices across all samples are merged #' into two combined matrices (one for spliced, one for unspliced). If \code{FALSE}, the results are returned per sample. #' +#' @param verbose Logical. If \code{TRUE}, prints progress and informational messages. Default is \code{FALSE}. 
+#' #' @return #' A list containing processed gene expression matrices: #' \itemize{ @@ -1256,7 +1298,7 @@ make_gene_count <- function(expression_dirs, sample_ids, whitelist_barcodes = NU #' #' @export -make_velo_count <- function(velocyto_dirs, sample_ids, whitelist_barcodes = NULL, use_internal_whitelist = TRUE, merge_counts = FALSE) { +make_velo_count <- function(velocyto_dirs, sample_ids, whitelist_barcodes = NULL, use_internal_whitelist = TRUE, merge_counts = FALSE, verbose = FALSE) { # Handle single sample input by converting to lists if (!is.list(velocyto_dirs)) velocyto_dirs <- list(velocyto_dirs) @@ -1291,7 +1333,7 @@ make_velo_count <- function(velocyto_dirs, sample_ids, whitelist_barcodes = NULL if (!file.exists(features_dir)) stop("No features file found for sample: ", sample_id, call. = FALSE) # Read Velocyto data - cat("|-- Processing Velocyto data for sample: ", sample_id, "\n") + if (verbose) message("|-- Processing Velocyto data for sample: ", sample_id) spliced_mtx <- Matrix::readMM(spliced_dir) unspliced_mtx <- Matrix::readMM(unspliced_dir) stab_barcode <- data.table::fread(barcodes_dir, header = FALSE, showProgress = FALSE)$V1 @@ -1305,10 +1347,10 @@ make_velo_count <- function(velocyto_dirs, sample_ids, whitelist_barcodes = NULL # Apply barcode filtration if (!is.null(whitelist_barcode)) { - cat("| |-- Applying provided whitelist for sample: ", sample_id, "\n") + if (verbose) message("| |-- Applying provided whitelist for sample: ", sample_id) } else if (use_internal_whitelist && file.exists(filtered_barcodes_dir)) { whitelist_barcode <- data.table::fread(filtered_barcodes_dir, header = FALSE, showProgress = FALSE)$V1 - cat("| |-- Using filtered barcodes for sample: ", sample_id, "\n") + if (verbose) message("| |-- Using filtered barcodes for sample: ", sample_id) } if (!is.null(whitelist_barcode)) { @@ -1325,10 +1367,10 @@ make_velo_count <- function(velocyto_dirs, sample_ids, whitelist_barcodes = NULL stop("All barcodes were removed after 
filtration for sample: ", sample_id, call. = FALSE) } - cat("| |-- Filtered barcodes applied for sample: ", sample_id, - " (Remaining barcodes: ", final_barcodes, ")\n") + if (verbose) message("| |-- Filtered barcodes applied for sample: ", sample_id, + " (Remaining barcodes: ", final_barcodes, ")") } else { - cat("| |-- No barcode filtration applied for sample: ", sample_id, "\n") + if (verbose) message("| |-- No barcode filtration applied for sample: ", sample_id) } # Append sample ID to column names @@ -1344,7 +1386,7 @@ make_velo_count <- function(velocyto_dirs, sample_ids, whitelist_barcodes = NULL ) # Return processed matrices and summary - cat("| +-- Finished processing Velocyto data for sample: ", sample_id, "\n") + if (verbose) message("| +-- Finished processing Velocyto data for sample: ", sample_id) return(list( spliced = as(spliced_mtx, "CsparseMatrix"), unspliced = as(unspliced_mtx, "CsparseMatrix"), @@ -1369,8 +1411,10 @@ make_velo_count <- function(velocyto_dirs, sample_ids, whitelist_barcodes = NULL spliced_combined <- do.call(cbind, lapply(results, function(x) x$spliced)) unspliced_combined <- do.call(cbind, lapply(results, function(x) x$unspliced)) - cat("\nSummary of Merged Velocyto Counts:\n") - print(summary_table) + if (verbose) { + message("\nSummary of Merged Velocyto Counts:") + print(summary_table) + } return(list( spliced = spliced_combined, @@ -1379,8 +1423,10 @@ make_velo_count <- function(velocyto_dirs, sample_ids, whitelist_barcodes = NULL } # Print summary table if not merging - cat("\nSummary of Processed Samples:\n") - print(summary_table) + if (verbose) { + message("\nSummary of Processed Samples:") + print(summary_table) + } # Return individual results if not merging final_results <- lapply(results, function(x) list(spliced = x$spliced, unspliced = x$unspliced)) diff --git a/R/zzz.R b/R/zzz.R index 2621dea..81f42da 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -2,11 +2,11 @@ packageStartupMessage( "\n", 
"=======================================================================================================\n", - " Welcome to Splikit - Developed by the Computational and Statistical Genomics Laboratory\n", + " Welcome to Splikit -- Developed by the Computational and Statistical Genomics Laboratory\n", " at McGill University, Montreal. Distributed under the MIT License.\n", " For documentation and examples, see the package vignette.\n", "-------------------------------------------------------------------------------------------------------\n", - " Bienvenue dans Splikit - Developpe par le Laboratoire de genomique computationnelle et statistique\n", + " Bienvenue dans Splikit -- Developpe par le Laboratoire de genomique computationnelle et statistique\n", " de l'Universite McGill, a Montreal. Distribue sous licence MIT.\n", " Pour la documentation et des exemples, consultez la vignette du package.\n", "=======================================================================================================\n" diff --git a/README.md b/README.md index 3f9038a..c2b381d 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ ## **Requirements** -- R version 3.5.0 or later. +- [R version 4.1.0](https://www.R-project.org/) or later. - R libraries: [Rcpp](https://CRAN.R-project.org/package=Rcpp), [RcppArmadillo](https://CRAN.R-project.org/package=RcppArmadillo), [Matrix](https://CRAN.R-project.org/package=Matrix), [data.table](https://CRAN.R-project.org/package=data.table), [R6](https://CRAN.R-project.org/package=R6) diff --git a/man/SplikitObject.Rd b/man/SplikitObject.Rd index 338e692..f656a86 100644 --- a/man/SplikitObject.Rd +++ b/man/SplikitObject.Rd @@ -115,7 +115,6 @@ Compute the M2 exclusion matrix from M1 and eventData. batch_size = 5000, memory_threshold = 2e+09, force_fast = FALSE, - multi_thread = FALSE, n_threads = 1, use_cpp = TRUE, verbose = FALSE @@ -131,9 +130,7 @@ Compute the M2 exclusion matrix from M1 and eventData. 
\item{\code{force_fast}}{Force fast processing regardless of size (default: FALSE).} -\item{\code{multi_thread}}{Use parallel processing for batched operations (default: FALSE).} - -\item{\code{n_threads}}{Number of threads for C++ OpenMP parallelization (default: 1).} +\item{\code{n_threads}}{Number of threads for parallel processing (default: 1).} \item{\code{use_cpp}}{Use fast C++ implementation (default: TRUE).} diff --git a/man/find_variable_events.Rd b/man/find_variable_events.Rd index e38eb2b..edfe966 100644 --- a/man/find_variable_events.Rd +++ b/man/find_variable_events.Rd @@ -9,7 +9,7 @@ find_variable_events( m2_matrix = NULL, min_row_sum = 50, n_threads = 1, - verbose = TRUE, + verbose = FALSE, ... ) } @@ -22,7 +22,7 @@ find_variable_events( \item{n_threads}{If the module OpenPM is available for your device, the function suggests using multi-thread processing for even faster computation.} -\item{verbose}{Logical. If \code{TRUE} (default), prints progress and informational messages.} +\item{verbose}{Logical. If \code{TRUE}, prints progress and informational messages. Default is \code{FALSE}.} \item{...}{Additional arguments to be passed.} } diff --git a/man/find_variable_genes.Rd b/man/find_variable_genes.Rd index f296575..ddde121 100644 --- a/man/find_variable_genes.Rd +++ b/man/find_variable_genes.Rd @@ -8,7 +8,7 @@ find_variable_genes( gene_expression_matrix, method = "vst", n_threads = 1, - verbose = TRUE, + verbose = FALSE, ... ) } @@ -21,7 +21,7 @@ find_variable_genes( \item{n_threads}{If OpenMP is available for your device, the function suggests using multi-thread processing for even faster computation (only for sum_deviance method).} -\item{verbose}{Logical. If \code{TRUE} (default), prints progress and informational messages.} +\item{verbose}{Logical. If \code{TRUE}, prints progress and informational messages. 
Default is \code{FALSE}.} \item{...}{Additional arguments (currently unused).} } @@ -42,8 +42,8 @@ toy_obj <- load_toy_M1_M2_object() # getting high variable genes HVG_VST <- find_variable_genes(toy_obj$gene_expression, method = "vst") -HVG_DEV <- find_variable_genes(toy_obj$gene_expression, - method = "sum_deviance") +# sum_deviance method +HVG_DEV <- find_variable_genes(toy_obj$gene_expression, method = "sum_deviance") # Using multi-threading for faster computation (sum_deviance method only) HVG_DEV_MT <- find_variable_genes(toy_obj$gene_expression, diff --git a/man/get_pseudo_correlation.Rd b/man/get_pseudo_correlation.Rd index 227c9ac..97d0d14 100644 --- a/man/get_pseudo_correlation.Rd +++ b/man/get_pseudo_correlation.Rd @@ -9,7 +9,8 @@ get_pseudo_correlation( m1_inclusion = NULL, m2_exclusion = NULL, metric = "CoxSnell", - suppress_warnings = TRUE + suppress_warnings = TRUE, + verbose = FALSE ) } \arguments{ @@ -21,8 +22,9 @@ get_pseudo_correlation( \item{metric}{Character string specifying which R² metric to compute. Options are "CoxSnell" (default) or "Nagelkerke".} -\item{suppress_warnings}{Logical. If \code{TRUE} (default), suppresses warnings during any warnings triggered during -computation (e.g., due to ill-conditioned inputs)} +\item{suppress_warnings}{Logical. If \code{TRUE} (default), suppresses warnings during computation (e.g., due to ill-conditioned inputs).} + +\item{verbose}{Logical. If \code{TRUE}, prints progress and informational messages. 
Default is \code{FALSE}.} } \value{ A \code{data.table} with the following columns: @@ -49,8 +51,6 @@ m1_inclusion <- m1_obj$m1_inclusion_matrix eventdata <- m1_obj$event_data m2_exclusion <- make_m2(m1_inclusion, eventdata) -\donttest{ - # creating a dummy ZDB ZDB_matrix <- matrix(rnorm(n = (nrow(m1_inclusion) * ncol(m1_inclusion)), sd = 7), nrow = nrow(m1_inclusion), @@ -64,5 +64,12 @@ m2_dense <- as.matrix(m2_exclusion) pseudo_r_square_cox <- get_pseudo_correlation(ZDB_matrix, m1_dense, m2_dense) print(pseudo_r_square_cox) - } +# Example with sparse matrices (more memory efficient) +pseudo_r_square_sparse <- get_pseudo_correlation(ZDB_matrix, m1_inclusion, m2_exclusion) + +# Example using Nagelkerke R² instead of Cox-Snell +pseudo_r_square_nagel <- get_pseudo_correlation(ZDB_matrix, m1_inclusion, m2_exclusion, + metric = "Nagelkerke") +print(pseudo_r_square_nagel) + } diff --git a/man/get_rowVar.Rd b/man/get_rowVar.Rd index 2cf1ad1..3dc7180 100644 --- a/man/get_rowVar.Rd +++ b/man/get_rowVar.Rd @@ -9,7 +9,7 @@ get_rowVar(M, verbose = FALSE) \arguments{ \item{M}{A numeric matrix (base R matrix) or a sparse matrix of class \code{"dgCMatrix"}.} -\item{verbose}{Logical. If \code{TRUE} (default), prints progress and informational messages.} +\item{verbose}{Logical. If \code{TRUE}, prints progress and informational messages. Default is \code{FALSE}.} } \value{ A numeric vector of length \code{nrow(M)} containing the variance of each row. 
diff --git a/man/get_silhouette_mean.Rd b/man/get_silhouette_mean.Rd index d0aa469..866c993 100644 --- a/man/get_silhouette_mean.Rd +++ b/man/get_silhouette_mean.Rd @@ -4,7 +4,7 @@ \alias{get_silhouette_mean} \title{Compute Average Silhouette Width with Logging} \usage{ -get_silhouette_mean(X, cluster_assignments, n_threads = 1) +get_silhouette_mean(X, cluster_assignments, n_threads = 1, verbose = FALSE) } \arguments{ \item{X}{A numeric matrix where rows are observations and columns are features.} @@ -12,6 +12,8 @@ get_silhouette_mean(X, cluster_assignments, n_threads = 1) \item{cluster_assignments}{An integer vector of cluster assignments, which must be the same length as the number of rows in \code{X}.} \item{n_threads}{Number of threads to use for parallel processing.} + +\item{verbose}{Logical. If \code{TRUE}, prints progress and informational messages. Default is \code{FALSE}.} } \value{ A single numeric value: the average silhouette score. diff --git a/man/make_gene_count.Rd b/man/make_gene_count.Rd index 05ca7b0..476c1fb 100644 --- a/man/make_gene_count.Rd +++ b/man/make_gene_count.Rd @@ -8,7 +8,8 @@ make_gene_count( expression_dirs, sample_ids, whitelist_barcodes = NULL, - use_internal_whitelist = TRUE + use_internal_whitelist = TRUE, + verbose = FALSE ) } \arguments{ @@ -25,6 +26,8 @@ barcode file (e.g., \code{barcodes.tsv} or \code{barcodes_filtered.tsv}) if avai \item{use_internal_whitelist}{Logical (default \code{TRUE}). If \code{TRUE} and \code{whitelist_barcodes} is \code{NULL}, the function will attempt to use the default filtered barcode list from the input directory. If \code{FALSE}, no internal filtration will be applied unless a whitelist is explicitly provided.} + +\item{verbose}{Logical. If \code{TRUE}, prints progress and informational messages. Default is \code{FALSE}.} } \value{ If a single sample is provided, returns a sparse matrix of class \code{"dgCMatrix"} with genes as rows and barcodes as columns. 
diff --git a/man/make_m2.Rd b/man/make_m2.Rd index d34df25..1d4d810 100644 --- a/man/make_m2.Rd +++ b/man/make_m2.Rd @@ -10,7 +10,6 @@ make_m2( batch_size = 5000, memory_threshold = 2e+09, force_fast = FALSE, - multi_thread = FALSE, n_threads = 1, use_cpp = TRUE, verbose = FALSE @@ -30,11 +29,8 @@ summary before switching to batched processing (default: 2e9, which is ~93\% of \item{force_fast}{A logical flag to force fast processing regardless of size estimates (default: FALSE). WARNING: This may cause memory errors on large datasets.} -\item{multi_thread}{A logical flag to enable parallel processing for batched operations (default: FALSE). -Only used when batched processing is triggered. Requires parallel package.} - -\item{n_threads}{Number of threads for C++ parallel processing (default: 1). -Only used when use_cpp = TRUE.} +\item{n_threads}{Number of threads for parallel processing (default: 1). +Used for C++ implementation and batched R processing. Values > 1 require parallel package for batched mode.} \item{use_cpp}{Logical flag to use fast C++ implementation (default: TRUE). Falls back to R implementation if FALSE.} diff --git a/man/make_velo_count.Rd b/man/make_velo_count.Rd index 2db80bc..fc70856 100644 --- a/man/make_velo_count.Rd +++ b/man/make_velo_count.Rd @@ -9,7 +9,8 @@ make_velo_count( sample_ids, whitelist_barcodes = NULL, use_internal_whitelist = TRUE, - merge_counts = FALSE + merge_counts = FALSE, + verbose = FALSE ) } \arguments{ @@ -29,6 +30,8 @@ raw matrix will be used unless a whitelist is explicitly provided.} \item{merge_counts}{Logical (default \code{FALSE}). If \code{TRUE}, spliced and unspliced matrices across all samples are merged into two combined matrices (one for spliced, one for unspliced). If \code{FALSE}, the results are returned per sample.} + +\item{verbose}{Logical. If \code{TRUE}, prints progress and informational messages. 
Default is \code{FALSE}.} } \value{ A list containing processed gene expression matrices: diff --git a/man/splikit-package.Rd b/man/splikit-package.Rd index 81001f6..b6cfa6b 100644 --- a/man/splikit-package.Rd +++ b/man/splikit-package.Rd @@ -3,11 +3,11 @@ \docType{package} \name{splikit-package} \alias{splikit-package} -\title{splikit: A Toolkit for Analysing RNA Splicing in scRNA-Seq Data} +\title{splikit: Analysing RNA Splicing in scRNA-Seq Data} \description{ \if{html}{\figure{logo.png}{options: style='float: right' alt='logo' width='120'}} -A toolkit designed for the analysis of high-dimensional single-cell splicing data. Provides a framework to extract and work with ratio-based data structures derived from single-cell RNA sequencing experiments. Offers both a modern R6 object-oriented interface and direct matrix manipulation functions. Core functionalities are implemented in C++ via 'Rcpp' to ensure high performance and scalability on large datasets. +Provides analysis of high-dimensional single-cell splicing data. Offers a framework to extract and work with ratio-based data structures derived from single-cell RNA sequencing experiments. Provides both a modern R6 object-oriented interface and direct matrix manipulation functions. Core functionalities are implemented in C++ via 'Rcpp' to ensure high performance and scalability on large datasets. 
} \seealso{ Useful links: diff --git a/src/cpp_pseudoR2.cpp b/src/cpp_pseudoR2.cpp index 3520c32..98b1edc 100644 --- a/src/cpp_pseudoR2.cpp +++ b/src/cpp_pseudoR2.cpp @@ -71,7 +71,9 @@ List fit_logistic_regression(const vec& x, const vec& m1, const vec& m2) { vec XtWz = X.t() * W * z; vec beta_new; - bool solved = solve(beta_new, XtWX, XtWz); + // no_approx: solve() returns false instead of falling back to an approximate + // solution when XtWX is ill-conditioned; warnings are handled in R via suppressWarnings() + bool solved = solve(beta_new, XtWX, XtWz, solve_opts::no_approx); if (!solved) { return List::create(Named("beta") = NumericVector::create(NA_REAL, NA_REAL), Named("deviance") = NA_REAL,