RaredonLab · huangyaqing-123 · Aug 9, 2025 · Dec 23, 2024 · Jan 2, 2025 · Jan 2, 2025
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -0,0 +1,5 @@
+^.*\.Rproj$
+^\.Rproj\.user$
+^LICENSE\.md$
+^doc$
+^Meta$
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,6 @@
+# DS Store
+.DS_Store
+
 # History files
 .Rhistory
 .Rapp.history
@@ -39,11 +42,10 @@ vignettes/*.pdf
 # R Environment Variables
 .Renviron
 
-# pkgdown site
-docs/
-
 # translation temp files
 po/*~
 
 # RStudio Connect folder
 rsconnect/
+inst/doc
+/Meta/
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,35 @@
+Package: PathwayEmbed
+Title: Tools for Pathway-Level Embedding and Visualization in Single-Cell Data
+Version: 0.0.0.9000
+Authors@R: 
+    person("Yaqing", "Huang", email = "yaqing.huang@yale.edu", role = c("aut", "cre"))
+Description: Provides tools for analyzing and visualizing pathway-level activity 
+    in single-cell RNA-seq data. Includes functions for computing cell-wise pathway scores, 
+    visualizing transduction states, calculating activation percentages, 
+    and integrating pathway data with Seurat objects.
+License: MIT + file LICENSE
+Encoding: UTF-8
+Roxygen: list(markdown = TRUE)
+RoxygenNote: 7.3.2
+Depends: 
+    R (>= 3.5)
+Imports:
+    readxl,
+    Seurat,
+    RColorBrewer,
+    ggplot2,
+    cowplot,
+    dplyr,
+    matrixStats,
+    viridis,
+    stats,
+    effsize,
+    tidyverse,
+    purrr
+Suggests: 
+    knitr,
+    rmarkdown,
+    testthat (>= 3.0.0)
+Config/testthat/edition: 3
+LazyData: true
+VignetteBuilder: knitr
diff --git a/LICENSE b/LICENSE
@@ -1,21 +1,2 @@
-MIT License
-
-Copyright (c) 2024 Raredon Lab
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
+YEAR: 2025
+COPYRIGHT HOLDER: Raredon Lab
diff --git a/LICENSE.md b/LICENSE.md
@@ -0,0 +1,21 @@
+# MIT License
+
+Copyright (c) 2025 Raredon Lab
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,25 @@
+# Generated by roxygen2: do not edit by hand
+
+export(CalculatePercentage)
+export(ComputeCellData)
+export(LoadPathway)
+export(PathwayMaxMin)
+export(PlotPathway)
+export(PreparePlotData)
+import(RColorBrewer)
+import(Seurat)
+import(cowplot)
+import(ggplot2)
+import(matrixStats)
+import(readxl)
+import(tidyverse)
+import(viridis)
+importFrom(dplyr,"%>%")
+importFrom(dplyr,bind_rows)
+importFrom(effsize,cohen.d)
+importFrom(matrixStats,rowMaxs)
+importFrom(matrixStats,rowMins)
+importFrom(purrr,map)
+importFrom(stats,cmdscale)
+importFrom(stats,dist)
+importFrom(stats,na.omit)
diff --git a/PathwayEmbed.Rproj b/PathwayEmbed.Rproj
@@ -0,0 +1,23 @@
+Version: 1.0
+ProjectId: 0c111876-39b0-460f-a888-db107bec1084
+
+RestoreWorkspace: No
+SaveWorkspace: No
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
+
+AutoAppendNewline: Yes
+StripTrailingWhitespace: Yes
+LineEndingConversion: Posix
+
+BuildType: Package
+PackageUseDevtools: Yes
+PackageInstallArgs: --no-multiarch --with-keep.source
+PackageRoxygenize: rd,collate,namespace
diff --git a/R/CalculatePercentage.R b/R/CalculatePercentage.R
@@ -0,0 +1,63 @@
+#' CalculatePercentage
+#'
+#' Calculates the percentage of cells in the ON (`scale > 0`) and OFF
+#' (`scale < 0`) activation states within each group defined by `group_var`.
+#' If exactly two groups are present, Cohen's d effect size between their
+#' `scale` values (ignoring `NA`s) is also computed and attached to both groups.
+#'
+#' Rows whose grouping value is `NA` are ignored. Cells whose `scale` is `NA`
+#' or exactly 0 count toward a group's total but are neither ON nor OFF.
+#'
+#' @name CalculatePercentage
+#' @importFrom dplyr bind_rows
+#' @importFrom effsize cohen.d
+#' @importFrom stats na.omit
+#' @param to.plot A data frame containing at least a `scale` column and a grouping column.
+#' @param group_var A string specifying the grouping variable (e.g., "genotype", "treatment").
+#'
+#' @return A data frame with one row per group: `group`, `percentage_on`,
+#'   `percentage_off`, and `cohens_d` (only when there are exactly two groups).
+#' @examples
+#' data(fake_to_plot)
+#' CalculatePercentage(fake_to_plot, "genotype")
+#' @export
+CalculatePercentage <- function(to.plot, group_var) {
+  # Make sure there is scale data
+  stopifnot("scale" %in% names(to.plot))
+
+  scale_values <- to.plot[["scale"]]
+  group_values <- to.plot[[group_var]]
+  if (is.null(group_values)) {
+    stop("Grouping column '", group_var, "' not found in `to.plot`.", call. = FALSE)
+  }
+  # Distinct, non-missing group labels
+  groups <- unique(na.omit(group_values))
+
+  # `scale` values of one group. which() drops NA comparisons, so rows with a
+  # missing group label are never counted as members of any group.
+  scale_of <- function(g) scale_values[which(group_values == g)]
+
+  results <- list()
+  for (g in groups) {
+    group_scale <- scale_of(g)
+    total <- length(group_scale)
+
+    # Percentages of ON / OFF cells, relative to all cells in the group
+    results[[as.character(g)]] <- list(
+      percentage_on = round(100 * sum(group_scale > 0, na.rm = TRUE) / total, 2),
+      percentage_off = round(100 * sum(group_scale < 0, na.rm = TRUE) / total, 2)
+    )
+  }
+
+  # With exactly two groups, report Cohen's d (effect size) between them.
+  # |d| < 0.2 negligible; 0.2 - 0.5 small; 0.5 - 0.8 medium; > 0.8 large
+  if (length(groups) == 2) {
+    effect <- cohen.d(scale_of(groups[1]), scale_of(groups[2]), na.rm = TRUE)
+    for (g in groups) {
+      results[[as.character(g)]]$cohens_d <- effect$estimate
+    }
+  }
+
+  # One row per group; the list names become the `group` column
+  bind_rows(results, .id = "group")
+}
diff --git a/R/ComputeCellData.R b/R/ComputeCellData.R
@@ -0,0 +1,145 @@
+#' ComputeCellData
+#'
+#' Computes a per-cell pathway state for single-cell RNA-seq data. Cells are
+#' shuffled and split into batches; within each batch, pairwise distances
+#' between the columns returned by `PathwayMaxMin()` and the cells'
+#' pathway-gene expression are computed, reduced to one dimension with
+#' classical multidimensional scaling (MDS), and min-max normalized to [0, 1].
+#'
+#' Batch assignment is random (`sample()`); call `set.seed()` beforehand for
+#' reproducible batches. Normalization is done independently in each batch,
+#' and every batch contributes rows for the `PathwayMaxMin()` columns in
+#' addition to its cells.
+#'
+#' @name ComputeCellData
+#' @import Seurat
+#' @importFrom matrixStats rowMins rowMaxs
+#' @importFrom stats dist cmdscale
+#' @importFrom dplyr %>%
+#' @importFrom purrr map
+#' @import tidyverse
+#' @import viridis
+#'
+#' @param x A `Seurat` object containing single-cell RNA sequencing data.
+#' @param pathway A `character` string specifying the pathway name. This should match a pathway used by `LoadPathway()`.
+#' @param distance.method A `character` string specifying the distance metric to use. Default is `"manhattan"`.
+#' Options include: `"manhattan"`, `"euclidean"`, `"canberra"`, `"binary"`, `"minkowski"`
+#' @param batch.size An `integer` specifying the number of cells to process per batch. Default is 1000.
+#' @param scale.data A `logical` indicating whether to use scaled data (`scale.data = TRUE`) or normalized data. Default is `TRUE`.
+#'
+#' @return A data frame of MDS results (`V1`) with normalized values
+#'   (`normalized`), row-bound across batches, suitable for thresholding or
+#'   visualization.
+#'
+#' @examples
+#' data(fake_test_object)
+#' ComputeCellData(fake_test_object, pathway = "Wnt", distance.method = "manhattan", batch.size = 2000)
+#'
+#' @export
+ComputeCellData <- function(x, pathway, distance.method = "manhattan",
+                            batch.size = 1000, scale.data = TRUE) {
+
+  # Resolve defaults up front. An explicit NULL is treated like a missing value.
+  if (missing(distance.method) || is.null(distance.method)) {
+    message("Parameter 'distance.method' is missing or NULL. Setting default distance.method to 'manhattan'.")
+    distance.method <- "manhattan"
+  }
+  if (missing(batch.size) || is.null(batch.size)) {
+    message("Parameter 'batch.size' is missing or NULL. Setting default batch size to 1000.")
+    batch.size <- 1000
+  }
+  # Reject sizes that cannot define a batch (non-numeric, NA, zero, negative)
+  if (!is.numeric(batch.size) || length(batch.size) != 1L ||
+      is.na(batch.size) || batch.size < 1) {
+    stop("'batch.size' must be a single positive number.", call. = FALSE)
+  }
+
+  # Get pathway data; its first column is used as the pathway gene names
+  pathwaydata <- LoadPathway(pathway)
+  pathway_genes <- c(pathwaydata[[1]])
+
+  # Use only genes present in Seurat object
+  valid_names <- intersect(pathway_genes, rownames(x))
+  if (length(valid_names) == 0) {
+    stop("No valid pathway genes found in the Seurat object.")
+  }
+  # Scale the pathway genes; the updated object is also passed to PathwayMaxMin()
+  x <- ScaleData(x, features = valid_names)
+
+  # Extract expression data from the desired slot
+  slot_use <- if (scale.data) "scale.data" else "data"
+  expr_data <- GetAssayData(x, assay = "RNA", slot = slot_use)[valid_names, , drop = FALSE]
+
+  # Pathway max and min
+  pathway.stat <- PathwayMaxMin(x, pathway)
+
+  # Shuffle the cell names and split them into batches of `batch.size`
+  shuffled_cell_id <- sample(colnames(expr_data))
+  batches <- split(shuffled_cell_id, ceiling(seq_along(shuffled_cell_id) / batch.size))
+
+  # One result slot per batch; skipped batches stay NULL and are ignored by rbind
+  batch_results <- vector("list", length(batches))
+
+  for (i in seq_along(batches)) {
+
+    message("Processing batch ", i)
+
+    # Subset this batch's cells (columns) and convert to a data frame
+    batch_expr <- expr_data[, batches[[i]], drop = FALSE]
+    temp.data.batch <- as.data.frame(batch_expr)
+
+    # Merge along columns
+    pathwaytempdata <- cbind(pathway.stat, temp.data.batch)
+
+    # Check for enough cells (columns)
+    if (ncol(pathwaytempdata) < 2) {
+      warning("Batch ", i, " does not have enough cells for distance calculation. Skipping...")
+      next
+    }
+
+    # Distance calculation
+    message("Computing distance...")
+    d <- dist(t(pathwaytempdata), method = distance.method)
+    # "manhattan" is sum of absolute differences (city block distance), good for sparse data (gene expression)
+    # "euclidean" is straight-line distance, is useful for PCA clustering
+    # "canberra" is weighted distance, is also good for sparse data and when values have very different scales
+    # "binary" is distance based on presence/absence (0/1)
+    # "minkowski" is generalization of euclidean & manhattan, tunable using p parameter
+    # choose "manhattan" as it works well for high-dimensional data and less sensitive to large outliers than euclidean distance
+
+    # MDS
+    message("Running MDS ...")
+    fit <- cmdscale(d, eig = TRUE, k = 1)
+    message("MDS finished")
+
+    # Normalize the MDS values
+    temp.data.mds <- as.data.frame(fit$points)
+    colnames(temp.data.mds) <- "V1"
+    V1_min <- min(temp.data.mds$V1, na.rm = TRUE)
+    V1_max <- max(temp.data.mds$V1, na.rm = TRUE)
+
+    if (V1_max == V1_min) {
+      temp.data.mds$normalized <- 0
+    } else {
+      temp.data.mds$normalized <- (temp.data.mds$V1 - V1_min) / (V1_max - V1_min)
+    }
+
+    # Store result
+    batch_results[[i]] <- temp.data.mds
+
+    # Report
+    message("Batch ", i, " processed with ", ncol(batch_expr), " cells")
+  }
+
+  final_mds <- do.call(rbind, batch_results)  # Merge all batch MDS results
+
+  return(final_mds)
+}
+
+# using sample
+# barcode list (randomization)
+# list of data chunk
+# make these list independent
+# short loop
+# lapply, sapply (list-wide operations)
+# https://www.r-bloggers.com/2022/03/complete-tutorial-on-using-apply-functions-in-r/
diff --git a/R/LoadPathway.R b/R/LoadPathway.R
@@ -0,0 +1,27 @@
+#' LoadPathway
+#'
+#' This function reads pathway data from the package's built-in Excel file
+#' (`extdata/Pathway_Embedding.xlsx`); the sheet named by `pathway` is read.
+#'
+#' @name LoadPathway
+#' @param pathway A `character` string specifying the pathway name (sheet name).
+#' @return A data frame with pathway data, as read from the pathway's sheet.
+#' @examples
+#' LoadPathway("Wnt")
+#' @import readxl
+#' @export
+LoadPathway <- function(pathway) {
+  file_path <- system.file("extdata", "Pathway_Embedding.xlsx", package = "PathwayEmbed")
+  if (!nzchar(file_path)) {
+    stop("Pathway data file not found. Ensure the package is installed correctly.")
+  }
+
+  # Fail early, listing the valid names, when the requested sheet does not exist
+  sheets <- readxl::excel_sheets(file_path)
+  if (is.character(pathway) && length(pathway) == 1L && !pathway %in% sheets) {
+    stop("Unknown pathway '", pathway, "'. Available pathways: ",
+         paste(sheets, collapse = ", "), call. = FALSE)
+  }
+
+  readxl::read_excel(file_path, sheet = pathway)
+}