diff --git a/R/Likelihood_functions.R b/R/Likelihood_functions.R index d42261a..39ddd4d 100644 --- a/R/Likelihood_functions.R +++ b/R/Likelihood_functions.R @@ -1,60 +1,60 @@ -#Likelihood functions for calling methylation states from bsseq objects generated from read.bedMethyl +#Likelihood functions for calling methylation status from MethylCounts objects generated from read.bedMethyl #estimateErrorRate estimateErrorRate <- function( - BSseq, - minCov = 10, - maxCov = 100, - minRatio = 0.8, - plotErrorProfile = FALSE) { - - # Input validation of BSseq - if (!inherits(BSseq, "BSseq")) { - stop("Error: The input object must be of class 'BSseq'.") - } - if (ncol(BSseq) != 1) { - stop("Error: The input 'BSseq' must have exactly one column.") - } - if (!all(c("D", "Cov") %in% names(assays(BSseq)))) { - stop("Error: The 'BSseq' must contain 'D' and 'Cov' assays.") - } - - # Input validation for thresholds - if (minCov < 0 || maxCov < 0) { - stop("Error: Both 'minCov' and 'maxCov' must be non-negative.") - } - if (maxCov <= minCov) { - stop("Error: 'maxCov' must be greater than 'minCov'.") - } - if (minRatio < 0 || minRatio > 1) { - stop("Error: 'minRatio' must be between 0 and 1.") - } - - # Access assay data safely - cov <- assays(BSseq)[["Cov"]] - d <- assays(BSseq)[["D"]] - - # Calculate CpG-to-non-CpG ratio and filter sites - ratio <- cov / (cov + d) - sites <- getCoverage(BSseq) >= minCov & - getCoverage(BSseq) <= maxCov & - ratio >= minRatio - - # Plot the error profile if requested - if (plotErrorProfile) { - if (any(sites)) { - p_all <- hist(ratio, breaks = 100, plot = FALSE) - p_used <- hist(ratio[sites], breaks = (1 - minRatio) * 100, plot = FALSE) - plot(p_all, xlim = c(0, 1), xlab = "CpG to non-CpG ratio", main = "Error Profile") - plot(p_used, col = "black", xlim = c(0, 1), add = TRUE) - } else { - warning("No sites satisfy the filtering criteria. No error profile will be plotted.") + MethylCounts, + minCov = 10, + maxCov = 100, + minRatio = 0.8, + plotErrorProfile = FALSE) { + + # Input validation of MethylCounts + if (!inherits(MethylCounts, "MethylCounts")) { + stop("Error: The input object must be of class 'MethylCounts'.") } - } - - # Calculate and return the error rate - error_rate <- sum(d[sites]) / (sum(d[sites]) + sum(cov[sites])) - return(error_rate) + if (ncol(MethylCounts) != 1) { + stop("Error: The input 'MethylCounts' must have exactly one column.") + } + if (!all(c("M", "U", "D") %in% names(assays(MethylCounts)))) { + stop("Error: The 'MethylCounts' must contain 'M', 'U', and 'D' assays.") + } + + # Input validation for thresholds + if (minCov < 0 || maxCov < 0) { + stop("Error: Both 'minCov' and 'maxCov' must be non-negative.") + } + if (maxCov <= minCov) { + stop("Error: 'maxCov' must be greater than 'minCov'.") + } + if (minRatio < 0 || minRatio > 1) { + stop("Error: 'minRatio' must be between 0 and 1.") + } + + # Access assay data safely + cov <- getMethylCounts(MethylCounts, type="Cov") + d <- getMethylCounts(MethylCounts, type="D") + + # Calculate CpG-to-non-CpG ratio and filter sites + ratio <- cov / (cov + d) + sites <- cov >= minCov & + cov <= maxCov & + ratio >= minRatio + + # Plot the error profile if requested + if (plotErrorProfile) { + if (any(sites)) { + p_all <- hist(ratio, breaks = 100, plot = FALSE) + p_used <- hist(ratio[sites], breaks = (1 - minRatio) * 100, plot = FALSE) + plot(p_all, xlim = c(0, 1), xlab = "CpG to non-CpG ratio", main = "Error Profile") + plot(p_used, col = "black", xlim = c(0, 1), add = TRUE) + } else { + warning("No sites satisfy the filtering criteria. No error profile will be plotted.") + } + } + + # Calculate and return the error rate + error_rate <- sum(d[sites]) / (sum(d[sites]) + sum(cov[sites])) + return(error_rate) } #genotype_likelihood @@ -76,54 +76,55 @@ estimateErrorRate <- function( return(likelihood) } + #.getScaledLikelihoods #internal function to calculate the scaled likelihoods for each genotype .getScaledLikelihoods <- function( - BSseq, - e = NULL) { - # Input validation of BSseq - if (!inherits(BSseq, "BSseq")) { - stop("Error: The input object must be of class 'BSseq'.") - } - if (ncol(BSseq) != 1) { - stop("Error: The input 'BSseq' must have exactly one column.") - } - if (!all(c("D", "Cov") %in% names(assays(BSseq)))) { - stop("Error: The 'BSseq' must contain 'D' and 'Cov' assays.") - } - # Input validation of e - if (!is.null(e) && (e < 0 || e > 1)) { - stop("Error: 'e' must be between 0 and 1.") - } - if (is.null(e)) { - e <- estimateErrorRate(BSseq) - } - cov <- assays(BSseq)[["Cov"]] - d <- assays(BSseq)[["D"]] - - l <- matrix(0, nrow = length(cov), ncol = 3) - l[, 1] <- .genotype_likelihood(g = 0, e = e, cov = cov, D = d) - l[, 2] <- .genotype_likelihood(g = 1, e = e, cov = cov, D = d) - l[, 3] <- .genotype_likelihood(g = 2, e = e, cov = cov, D = d) - - SL <- l / rowSums(l) - return(SL) + MethylCounts, + e = NULL) { + # Input validation of BSseq + if (!inherits(MethylCounts, "MethylCounts")) { + stop("Error: The input object must be of class 'MethylCounts'.") + } + if (ncol(MethylCounts) != 1) { + stop("Error: The input 'MethylCounts' must have exactly one column.") + } + if (!all(c("M", "U", "D") %in% names(assays(MethylCounts)))) { + stop("Error: The 'MethylCounts' must contain 'M', 'U', and 'D' assays.") + } + # Input validation of e + if (!is.null(e) && (e < 0 || e > 1)) { + stop("Error: 'e' must be between 0 and 1.") + } + if (is.null(e)) { + e <- estimateErrorRate(MethylCounts) + } + cov <- getMethylCounts(MethylCounts, type="Cov") + d <- getMethylCounts(MethylCounts, type="D") + + l <- matrix(0, nrow = length(cov), ncol = 3) + l[, 1] <- .genotype_likelihood(g = 0, e = e, cov = cov, D = d) + l[, 2] <- .genotype_likelihood(g = 1, e = e, cov = cov, D = d) + l[, 3] <- .genotype_likelihood(g = 2, e = e, cov = cov, D = d) + + SL <- l / rowSums(l) + return(SL) } getCpGs <- function( - BSseq, - type = c("homozygous", "heterozygous", "allCpG", "nonCpG"), - threshold = 0.99, + MethylCounts, + type = c("homozygous", "heterozygous", "allCpG", "nonCpG"), + threshold = 0.99, e = NULL) { - # Input validation of BSseq - if (!inherits(BSseq, "BSseq")) { - stop("Error: The input object must be of class 'BSseq'.") + # Input validation of MethylCounts + if (!inherits(MethylCounts, "MethylCounts")) { + stop("Error: The input object must be of class 'MethylCounts'.") } - if (ncol(BSseq) != 1) { - stop("Error: The input 'BSseq' must have exactly one column.") + if (ncol(MethylCounts) != 1) { + stop("Error: The input 'MethylCounts' must have exactly one column.") } - if (!all(c("D", "Cov") %in% names(assays(BSseq)))) { - stop("Error: The 'BSseq' must contain 'D' and 'Cov' assays.") + if (!all(c("M", "U", "D") %in% names(assays(MethylCounts)))) { + stop("Error: The 'MethylCounts' must contain 'M', 'U', and 'D' assays.") } # Input validation of type if (!type %in% c("homozygous", "heterozygous", "allCpG", "nonCpG")) { @@ -138,51 +139,51 @@ getCpGs <- function( stop("Error: 'e' must be between 0 and 1.") } # Get scaled likelihoods - SL <- .getScaledLikelihoods(BSseq, e) - + SL <- .getScaledLikelihoods(MethylCounts, e) + # Determine the CpG sites based on the type loci <- switch(type, "homozygous" = which(SL[, 1] > threshold), "heterozygous" = which(SL[, 2] > threshold), "allCpG" = which(rowSums(SL[, 1:2]) > threshold), "nonCpG" = which(SL[, 3] > threshold)) - + return(loci) } getCpGMatrix <- function( - BSseq, - e = NULL, + MethylCounts, + e = NULL, allCpG = FALSE) { - - # Input validation of BSseq - if (!inherits(BSseq, "BSseq")) { - stop("Error: The input object must be of class 'BSseq'.") - } - if (!all(c("D", "Cov") %in% names(assays(BSseq)))) { - stop("Error: The 'BSseq' must contain 'D' and 'Cov' assays.") - } - + + # Input validation of MethylCounts + if (!inherits(MethylCounts, "MethylCounts")) { + stop("Error: The input object must be of class 'MethylCounts'.") + } + if (!all(c("M", "U", "D") %in% names(assays(MethylCounts)))) { + stop("Error: The 'MethylCounts' must contain 'M', 'U', and 'D' assays.") + } + # Input validation of e if (!is.null(e)) { - if (!is.numeric(e) || (length(e) != ncol(BSseq))) { - stop("Error: 'e' must be NULL or a numeric vector with the same length as the number of columns in 'BSseq'.") + if (!is.numeric(e) || (length(e) != ncol(MethylCounts))) { + stop("Error: 'e' must be NULL or a numeric vector with the same length as the number of columns in 'MethylCounts'.") } if (any(e < 0 | e > 1)) { stop("Error: All elements of 'e' must be between 0 and 1.") } } - + # Initialize CpG matrix - G <- matrix(nrow = nrow(BSseq), ncol = ncol(BSseq), byrow = FALSE) - - for (i in 1:ncol(BSseq)) { + G <- matrix(nrow = nrow(MethylCounts), ncol = ncol(MethylCounts), byrow = FALSE) + + for (i in 1:ncol(MethylCounts)) { # Use the provided error rate or estimate it - e_i <- if (is.null(e)) estimateErrorRate(BSseq[, i]) else e[i] - + e_i <- if (is.null(e)) estimateErrorRate(MethylCounts[, i]) else e[i] + # Get scaled likelihoods - SL_i <- .getScaledLikelihoods(BSseq[, i], e_i) - + SL_i <- .getScaledLikelihoods(MethylCounts[, i], e_i) + # Calculate CpG matrix if (allCpG) { G[, i] <- ifelse(SL_i[, 3] >= 1/3, 2, 0) @@ -190,43 +191,43 @@ getCpGMatrix <- function( G[, i] <- max.col(SL_i, ties.method = "last") - 1 } } - + return(G) } getMaxLikelihoodMatrix <- function( - BSseq, - e = NULL, + MethylCounts, + e = NULL, allCpG = FALSE) { - - # Input validation of BSseq - if (!inherits(BSseq, "BSseq")) { - stop("Error: The input object must be of class 'BSseq'.") - } - if (!all(c("D", "Cov") %in% names(assays(BSseq)))) { - stop("Error: The 'BSseq' must contain 'D' and 'Cov' assays.") - } - + + # Input validation of MethylCounts + if (!inherits(MethylCounts, "MethylCounts")) { + stop("Error: The input object must be of class 'MethylCounts'.") + } + if (!all(c("M", "U", "D") %in% names(assays(MethylCounts)))) { + stop("Error: The 'MethylCounts' must contain 'M', 'U', and 'D' assays.") + } + # Input validation of e if (!is.null(e)) { - if (!is.numeric(e) || (length(e) != ncol(BSseq))) { - stop("Error: 'e' must be NULL or a numeric vector with the same length as the number of columns in 'BSseq'.") + if (!is.numeric(e) || (length(e) != ncol(MethylCounts))) { + stop("Error: 'e' must be NULL or a numeric vector with the same length as the number of columns in 'MethylCounts'.") } if (any(e < 0 | e > 1)) { stop("Error: All elements of 'e' must be between 0 and 1.") } } - + # Initialize quality matrix - Q <- matrix(nrow = nrow(BSseq), ncol = ncol(BSseq), byrow = FALSE) - - for (i in 1:ncol(BSseq)) { + Q <- matrix(nrow = nrow(MethylCounts), ncol = ncol(MethylCounts), byrow = FALSE) + + for (i in 1:ncol(MethylCounts)) { # Use the provided error rate or estimate it - e_i <- if (is.null(e)) estimateErrorRate(BSseq[, i]) else e[i] - + e_i <- if (is.null(e)) estimateErrorRate(MethylCounts[, i]) else e[i] + # Get scaled likelihoods - SL_i <- .getScaledLikelihoods(BSseq[, i], e_i) - + SL_i <- .getScaledLikelihoods(MethylCounts[, i], e_i) + # Calculate quality matrix if (allCpG) { Q[, i] <- ifelse(SL_i[, 3] >= 1/3, SL_i[, 3], (1 - SL_i[, 3])) @@ -234,7 +235,7 @@ getMaxLikelihoodMatrix <- function( Q[, i] <- DelayedMatrixStats::rowMaxs(SL_i) } } - + return(Q) } diff --git a/R/MethylCounts.R b/R/MethylCounts.R index 6ea9cf7..69a40ab 100644 --- a/R/MethylCounts.R +++ b/R/MethylCounts.R @@ -143,12 +143,18 @@ MethylCounts <- function(M = NULL, U = NULL, D = NULL, H = NULL, ## Move to BSseq-utils? getMethylCounts <- function(MethylCounts, - type = c("M", "U", "H", "D", "gr", "parameters"), + type = c("M", "U", "H", "D", "Cov", "gr", "parameters"), withDimnames = TRUE) { type <- match.arg(type) if (type %in% c("M", "U", "H", "D")) { return(assay(MethylCounts, type, withDimnames = withDimnames)) } + if (type == "Cov") { + M<-assay(MethylCounts, "M", withDimnames = withDimnames) + U<-assay(MethylCounts, "U", withDimnames = withDimnames) + H<-assay(MethylCounts, "H", withDimnames = withDimnames) + return(M+U+H) + } if (type == "parameters") { return(MethylCounts@parameters) } diff --git a/man/estimateErrorRate.Rd b/man/estimateErrorRate.Rd index a4f2a76..afd0558 100644 --- a/man/estimateErrorRate.Rd +++ b/man/estimateErrorRate.Rd @@ -1,30 +1,34 @@ \name{estimateErrorRate} \alias{estimateErrorRate} \title{ - Estimate CpG-specific error rate from BSseq object. + Estimate sample specific error rate for CpG calls in a MethylCounts object. } \description{ - This function estimates the CpG-specific error rate from a single sample BSseq object generated using \code{read.bedMethyl}. + This function estimates the CpG-specific error rate from a single sample MethylCounts object generated using \code{read.bedMethyl}. } \usage{ -estimateErrorRate(BSseq, minCov = 10, maxCov = 100, minRatio = 0.8, plotErrorProfile = FALSE) + estimateErrorRate(MethylCounts, + minCov = 10, + maxCov = 100, + minRatio = 0.8, + plotErrorProfile = FALSE) } \arguments{ - \item{BSseq}{A single sample object of class \code{BSseq}.} + \item{MethylCounts}{A single sample object of class \code{MethylCounts}.} \item{minCov}{A non-negative integer specifying the minimum coverage required for CpG loci to be considered.} \item{maxCov}{A non-negative integer specifying the maximum coverage allowed for CpG loci to be considered.} \item{minRatio}{A numeric value between 0 and 1 specifying the minimum ratio of CpG sites to non-CpG sites for a loci to be considered.} \item{plotErrorProfile}{A logical value indicating whether to plot the CpG to non-CpG ratio distribution for the filtered sites.} } \value{ - A numeric value representing the estimated CpG-specific error rate the BSseq object. + A numeric value representing the estimated CpG-specific error rate the MethylCounts object. } \author{ Søren Blikdal Hansen (soren.blikdal.hansen@sund.ku.dk) } \seealso{ - \code{\linkS4class{BSseq}} for the \code{BSseq} class, - \code{\link{read.bedMethyl}} for details on reading data into a \code{BSseq} object. + \code{\linkS4class{MethylCounts}} for the \code{MethylCounts} class, + \code{\link{read.bedMethyl}} for details on reading data into a \code{MethylCounts} object. } \examples{ # Example input files @@ -34,12 +38,13 @@ infiles <- c(system.file("extdata/HG002_nanopore_test.bedMethyl.gz", package = "bsseq")) # Run the function to import data -bsseq <- read.bedMethyl(files = infiles, - colData = DataFrame(row.names = c("test_nanopore", +mc <- read.bedMethyl(files = infiles, + colData = DataFrame(row.names = c("test_nanopore", "test_pacbio")), strandCollapse = TRUE, + output = "MethylCounts", verbose = TRUE) # Estimate error rate -estimateErrorRate(bsseq[, 1], plotErrorProfile = FALSE) +estimateErrorRate(mc[, 1], plotErrorProfile = FALSE) } diff --git a/man/getCpGMatrix.Rd b/man/getCpGMatrix.Rd index 3271595..98e69c4 100644 --- a/man/getCpGMatrix.Rd +++ b/man/getCpGMatrix.Rd @@ -1,31 +1,33 @@ \name{getCpGMatrix} \alias{getCpGMatrix} \title{ - Generate a matrix of the most likely CpG status for a multi-sample BSseq object. + Generate a matrix of the most likely CpG status for a multi-sample MethylCounts object. } \description{ - This function generates a matrix of the most likely CpG status for all loci and samples in a BSseq object. Each element of the matrix represents the most likely CpG status (0 for homozygous CpG, 1 for heterozygous CpG, and 2 for non-CpG or if allCpG = TRUE 0 for homozygous or heterozygous CpG, and 2 for non-CpG) for a specific locus and sample. + This function generates a matrix of the most likely CpG status for all loci and samples in a MethylCounts object. Each element of the matrix represents the most likely CpG status (0 for homozygous CpG, 1 for heterozygous CpG, and 2 for non-CpG or if allCpG = TRUE 0 for homozygous or heterozygous CpG, and 2 for non-CpG) for a specific locus and sample. } \usage{ -getCpGMatrix(BSseq, e = NULL, allCpG = FALSE) + getCpGMatrix(MethylCounts, + e = NULL, + allCpG = FALSE) } \arguments{ - \item{BSseq}{An object of class \code{BSseq}.} + \item{MethylCounts}{An object of class \code{MethylCounts}.} \item{e}{An optional numeric vector representing error rates for each sample. If \code{NULL}, the error rate for each sample is estimated using \code{\link{estimateErrorRate}}.} \item{allCpG}{A logical value indicating whether to classify loci as allCpG (i.e. combine homozygous or heterozygous CpG) and non-CpG based on their likelihoods. Should be the same for \code{getCpGMatrix} and \code{getMaxLikelihoodMatrix} } } \value{ - A numeric matrix where each row represents a locus, and each column represents a sample, and the values correspond to the CpG status (same order as the BSseq object in input). + A numeric matrix where each row represents a locus, and each column represents a sample, and the values correspond to the CpG status (same order as the MethylCounts object in input). } \author{ Søren Blikdal Hansen (soren.blikdal.hansen@sund.ku.dk) } \seealso{ - \code{\linkS4class{BSseq}} for the \code{BSseq} class, - \code{\link{read.bedMethyl}} for details on reading data into a \code{BSseq} object, + \code{\linkS4class{MethylCounts}} for the \code{MethylCounts} class, + \code{\link{read.bedMethyl}} for details on reading data into a \code{MethylCounts} object, \code{\link{estimateErrorRate}} for estimating the CpG-specific error rate. \code{\link{getCpGs}} for filtering a single-sample BSseg object. - \code{\link{getMaxLikelihoodMatrix}} for generating a matrix with the maximum scaled likelihoods matching the CpGMatrix. + \code{\link{getMaxLikelihoodMatrix}} for generating a matrix with the maximum scaled likelihoods matching the CpGMatrix. } \examples{ @@ -36,27 +38,31 @@ infiles <- c(system.file("extdata/HG002_nanopore_test.bedMethyl.gz", package = "bsseq")) # Run the function to import data -bsseq <- read.bedMethyl(files = infiles, - colData = DataFrame(row.names = c("test_nanopore", +mc <- read.bedMethyl(files = infiles, + colData = DataFrame(row.names = c("test_nanopore", "test_pacbio")), strandCollapse = TRUE, + output = "MethylCounts", verbose = TRUE) # Single samples can be filtered using the getCpGs function -bsseq_nano <- bsseq[, 1] -bsseq_nano_99All_filtered <- bsseq[getCpGs(bsseq_nano, - type = "allCpG", threshold = 0.99)] +mc_nano <- mc[, 1] +mc_nano_99All_filtered <- mc_nano[getCpGs(mc_nano, + type = "allCpG", + threshold = 0.99)] -bsseq_pacbio <- bsseq[, 2] -bsseq_pacbio_99All_filtered <- bsseq[getCpGs(bsseq_pacbio, - type = "allCpG", threshold = 0.99)] +mc_pacbio <- mc[, 2] +mc_pacbio_99All_filtered <- mc_pacbio[getCpGs(mc_pacbio, + type = "allCpG", + threshold = 0.99)] # For filtering multiple samples, we can use a CpGMatrix and a MaxLikelihoodMatrix -# Construct the CpGMatrix and getMaxLikelihoodMatrix for the bsseq object -CpGMatrix <- getCpGMatrix(bsseq, allCpG = TRUE) -MaxLikelihoodMatrix <- getMaxLikelihoodMatrix(bsseq, allCpG = TRUE) +# Construct the CpGMatrix and getMaxLikelihoodMatrix for the MethylCounts object + +CpGMatrix <- getCpGMatrix(mc, allCpG = TRUE) +MaxLikelihoodMatrix <- getMaxLikelihoodMatrix(mc, allCpG = TRUE) # Filter for allCpG loci with a likelihood > 0.99 in both samples -bsseq_combined_99All_filtered <- bsseq[which(rowAlls(CpGMatrix == 0) - & rowMins(MaxLikelihoodMatrix) > 0.99)] +mc_combined_99All_filtered <- mc[which(rowAlls(CpGMatrix == 0) & + rowMins(MaxLikelihoodMatrix) > 0.99)] } diff --git a/man/getCpGs.Rd b/man/getCpGs.Rd index c95eaa3..79a9723 100644 --- a/man/getCpGs.Rd +++ b/man/getCpGs.Rd @@ -1,16 +1,19 @@ \name{getCpGs} \alias{getCpGs} \title{ - Get CpG (and non-CpG) loci from a single sample BSseq object. + Get CpG (and non-CpG) loci from a single sample MethylCounts object. } \description{ - This function identifies CpG (and non-CpG) loci from a single sample BSseq object using scaled likelihoods computed from the count of CpG sites and Non-CpG sites mapped to the loci, based on the specified type, and minimum scaled likelihood (threshold). The function only work for BSseq objects generated using read.bedMethyl. + This function identifies CpG (and non-CpG) loci from a single sample MethylCounts object using scaled likelihoods computed from the count of CpG sites and Non-CpG sites mapped to the loci, based on the specified type, and minimum scaled likelihood (threshold). The function only work for MethylCounts objects generated using read.bedMethyl. } \usage{ -getCpGs(BSseq, type = c("homozygous", "heterozygous", "allCpG", "nonCpG"), threshold = 0.99, e = NULL) + getCpGs(MethylCounts, + type = c("homozygous", "heterozygous", "allCpG", "nonCpG"), + threshold = 0.99, + e = NULL) } \arguments{ - \item{BSseq}{An single sample object of class \code{BSseq}.} + \item{MethylCounts}{An single sample object of class \code{MethylCounts}.} \item{type}{A character string specifying the type of loci to extract. Must be one of \code{"homozygous"}, \code{"heterozygous"}, \code{"allCpG"}, or \code{"nonCpG"}.} \item{threshold}{A numeric value between 0 and 1 specifying the minimum likelihood threshold required for loci to be included.} \item{e}{An optional numeric value representing the error rate. If \code{NULL}, the error rate is estimated using \code{\link{estimateErrorRate}}.} @@ -22,8 +25,8 @@ getCpGs(BSseq, type = c("homozygous", "heterozygous", "allCpG", "nonCpG"), thres Søren Blikdal Hansen (soren.blikdal.hansen@sund.ku.dk) } \seealso{ - \code{\linkS4class{BSseq}} for the \code{BSseq} class, - \code{\link{read.bedMethyl}} for details on reading data into a \code{BSseq} object, + \code{\linkS4class{MethylCounts}} for the \code{MethylCounts} class, + \code{\link{read.bedMethyl}} for details on reading data into a \code{MethylCounts} object, \code{\link{estimateErrorRate}} for estimating the CpG-specific error rate. } \examples{ @@ -34,19 +37,20 @@ infiles <- c(system.file("extdata/HG002_nanopore_test.bedMethyl.gz", package = "bsseq")) # Run the function to import data -bsseq <- read.bedMethyl(files = infiles, - colData = DataFrame(row.names = c("test_nanopore", +mc <- read.bedMethyl(files = infiles, + colData = DataFrame(row.names = c("test_nanopore", "test_pacbio")), strandCollapse = TRUE, + output = "MethylCounts", verbose = TRUE) # Filter CpG sites for the Nanopore dataset -bsseq_nano <- bsseq[, 1] -bsseq_nano_99All_filtered <- bsseq[getCpGs(bsseq_nano, +mc_nano <- mc[, 1] +mc_nano_99All_filtered <- mc_nano[getCpGs(mc_nano, type = "allCpG", threshold = 0.99)] # Filter CpG sites for the PacBio dataset -bsseq_pacbio <- bsseq[, 2] -bsseq_pacbio_99All_filtered <- bsseq[getCpGs(bsseq_pacbio, +mc_pacbio <- mc[, 2] +mc_pacbio_99All_filtered <- mc_pacbio[getCpGs(mc_pacbio, type = "allCpG", threshold = 0.99)] } diff --git a/man/getMaxLikelihoodMatrix.Rd b/man/getMaxLikelihoodMatrix.Rd index 6e7a04b..88c7298 100644 --- a/man/getMaxLikelihoodMatrix.Rd +++ b/man/getMaxLikelihoodMatrix.Rd @@ -1,16 +1,16 @@ \name{getMaxLikelihoodMatrix} \alias{getMaxLikelihoodMatrix} \title{ - Generate a matrix of the scaled likelihood of most likely CpG status for a multi-sample BSseq object. + Generate a matrix of the scaled likelihood of most likely CpG status for a multi-sample MethylCounts object. } \description{ - This function generates a matrix of the scaled likelihoods for most likely CpG status for a multi-sample BSseq object. Each element of the matrix represents the scaled likelihood of the most likely CpG status for the locus in the sample. If no data is available for a locus in a sample, the entry in the CpGMatrix is 2 (nonCpG) and the corresponding MaxLikelihood is 1/3. + This function generates a matrix of the scaled likelihoods for most likely CpG status for a multi-sample MethylCounts object. Each element of the matrix represents the scaled likelihood of the most likely CpG status for the locus in the sample. If no data is available for a locus in a sample, the entry in the CpGMatrix is 2 (nonCpG) and the corresponding MaxLikelihood is 1/3. } \usage{ -getMaxLikelihoodMatrix(BSseq, e = NULL, allCpG = FALSE) +getMaxLikelihoodMatrix(MethylCounts, e = NULL, allCpG = FALSE) } \arguments{ - \item{BSseq}{An object of class \code{BSseq}.} + \item{MethylCounts}{An object of class \code{MethylCounts}.} \item{e}{An optional numeric vector representing error rates for each sample. If \code{NULL}, the error rate for each sample is estimated using \code{\link{estimateErrorRate}}.} \item{allCpG}{A logical value indicating whether to classify loci as allCpG and non-CpG loci and sum the scaled likelihood of homozygous CpG and heterozygous CpG. Should be the same for \code{getMaxLikelihoodMatrix} and \code{getCpGMatrix} } } @@ -21,12 +21,13 @@ getMaxLikelihoodMatrix(BSseq, e = NULL, allCpG = FALSE) Søren Blikdal Hansen (soren.blikdal.hansen@sund.ku.dk) } \seealso{ - \code{\linkS4class{BSseq}} for the \code{BSseq} class, - \code{\link{read.bedMethyl}} for details on reading data into a \code{BSseq} object, + \code{\linkS4class{MethylCounts}} for the \code{MethylCounts} class, + \code{\link{read.bedMethyl}} for details on reading data into a \code{MethylCounts} object, \code{\link{estimateErrorRate}} for estimating the CpG-specific error rate. \code{\link{getCpGs}} for filtering a single-sample BSseg object. \code{\link{getCpGMatrix}} for generating a matrix with the most likely CpG status matching the getMaxLikelihoodMatrix. } + \examples{ # Example input files infiles <- c(system.file("extdata/HG002_nanopore_test.bedMethyl.gz", @@ -35,27 +36,31 @@ infiles <- c(system.file("extdata/HG002_nanopore_test.bedMethyl.gz", package = "bsseq")) # Run the function to import data -bsseq <- read.bedMethyl(files = infiles, - colData = DataFrame(row.names = c("test_nanopore", +mc <- read.bedMethyl(files = infiles, + colData = DataFrame(row.names = c("test_nanopore", "test_pacbio")), strandCollapse = TRUE, + output = "MethylCounts", verbose = TRUE) # Single samples can be filtered using the getCpGs function -bsseq_nano <- bsseq[, 1] -bsseq_nano_99All_filtered <- bsseq[getCpGs(bsseq_nano, - type = "allCpG", threshold = 0.99)] +mc_nano <- mc[, 1] +mc_nano_99All_filtered <- mc_nano[getCpGs(mc_nano, + type = "allCpG", + threshold = 0.99)] -bsseq_pacbio <- bsseq[, 2] -bsseq_pacbio_99All_filtered <- bsseq[getCpGs(bsseq_pacbio, - type = "allCpG", threshold = 0.99)] +mc_pacbio <- mc[, 2] +mc_pacbio_99All_filtered <- mc_pacbio[getCpGs(mc_pacbio, + type = "allCpG", + threshold = 0.99)] # For filtering multiple samples, we can use a CpGMatrix and a MaxLikelihoodMatrix -# Construct the CpGMatrix and QualityMatrix for the bsseq object -CpGMatrix <- getCpGMatrix(bsseq, allCpG = TRUE) -MaxLikelihoodMatrix <- getMaxLikelihoodMatrix(bsseq, allCpG = TRUE) +# Construct the CpGMatrix and getMaxLikelihoodMatrix for the MethylCounts object + +CpGMatrix <- getCpGMatrix(mc, allCpG = TRUE) +MaxLikelihoodMatrix <- getMaxLikelihoodMatrix(mc, allCpG = TRUE) # Filter for allCpG loci with a likelihood > 0.99 in both samples -bsseq_combined_99All_filtered <- bsseq[which(rowAlls(CpGMatrix == 0) - & rowMins(MaxLikelihoodMatrix) > 0.99)] +mc_combined_99All_filtered <- mc[which(rowAlls(CpGMatrix == 0) & + rowMins(MaxLikelihoodMatrix) > 0.99)] } diff --git a/tests/testthat/Test_likelihood_functions.R b/tests/testthat/Test_likelihood_functions.R new file mode 100644 index 0000000..037a451 --- /dev/null +++ b/tests/testthat/Test_likelihood_functions.R @@ -0,0 +1,37 @@ +context("MethylCounts") + +test_that("Likelihood functions works for bedMethyl files", { + +infiles <- c(system.file("extdata/HG002_nanopore_test.bedMethyl.gz", + package = "bsseq"), + system.file("extdata/HG002_pacbio_test.bedMethyl.gz", + package = "bsseq")) + +mc <- read.bedMethyl(files = infiles, + colData = DataFrame(row.names = c("test_nanopore","test_pacbio")), + rmZeroCov = TRUE, + strandCollapse = TRUE, + output = "MethylCounts", + verbose = FALSE) + +# Filter CpG sites for the Nanopore dataset and test number of loci +mc_nano <- mc[, 1] +mc_nano_99All_filtered <- mc_nano[getCpGs(mc_nano, + type = "allCpG", threshold = 0.99)] +expect_equal(dim(mc_nano_99All_filtered), c(828L, 1L)) + +# Filter CpG sites for the PacBio dataset and test number of loci +mc_pacbio <- mc[, 2] +mc_pacbio_99All_filtered <- mc_pacbio[getCpGs(mc_pacbio, + type = "allCpG", threshold = 0.99)] +expect_equal(dim(mc_pacbio_99All_filtered), c(827L, 1L)) + +CpGMatrix <- getCpGMatrix(mc, allCpG = TRUE) +MaxLikelihoodMatrix <- getMaxLikelihoodMatrix(mc, allCpG = TRUE) + +# Filter for allCpG loci with a likelihood > 0.99 in both samples +mc_combined_99All_filtered <- mc[which(rowAlls(CpGMatrix == 0) + & rowMins(MaxLikelihoodMatrix) > 0.99)] +expect_equal(dim(mc_combined_99All_filtered), c(827L, 2L)) +}) + diff --git a/tests/testthat/test_read.bedMethyl.R b/tests/testthat/test_read.bedMethyl.R index 1b48ff0..3dab613 100644 --- a/tests/testthat/test_read.bedMethyl.R +++ b/tests/testthat/test_read.bedMethyl.R @@ -1,23 +1,143 @@ context("read.bedMethyl") -test_that("read.bedMethyl() works for bedMethyl file", { - infiles <- c(system.file("extdata/HG002_nanopore_test.bedMethyl.gz", +test_that("read.bedMethyl() works for bedMethyl files", { +#load test data + infiles <- c(system.file("extdata/HG002_nanopore_test.bedMethyl.gz", package = "bsseq"), - system.file("extdata/HG002_pacbio_test.bedMethyl.gz", + system.file("extdata/HG002_pacbio_test.bedMethyl.gz", package = "bsseq")) - bsseq <- read.bedMethyl(files = infiles, + +#read in files as bsseq object (Default: rmZeroCov=T, strandCollapse=T) + bs <- read.bedMethyl(files = infiles, colData = DataFrame(row.names = c("test_nanopore","test_pacbio")), rmZeroCov = TRUE, strandCollapse = TRUE, - verbose = FALSE) - expect_is(bsseq, "BSseq") - expect_equal(nrow(bsseq), 1111L) - expect_true(all(strand(bsseq) == "*")) - #test of get CpGs - bsseq_nano<-bsseq[,1] - expect_equal(length(getCpGs(bsseq_nano, type="allCpG")), 828L) - CpGMatrix<-getCpGMatrix(bsseq, allCpG = TRUE) - MaxLikelihoodMatrix<-getMaxLikelihoodMatrix(bsseq, allCpG = TRUE) - expect_equal(length(bsseq[which(rowAlls(CpGMatrix == 0) - & rowMins(MaxLikelihoodMatrix) > 0.99)]), 827L) + verbose = FALSE, + output = "BSseq" + ) + +#test class + expect_is(bs, "BSseq") +#test dimensions + expect_equal(dim(bs), c(1111L, 2L)) +#test strand collapse + expect_true(all(strand(bs) == "*")) +#test rm zeroCov + cov <- getCoverage(bs, type="Cov") + expect_false(any(rowAlls(cov == 0))) +#assay sums + expect_equal(sum(getBSseq(bs, type="M")), 26632L) + expect_equal(sum(getBSseq(bs, type="Cov")), 35708L) +getMeth(bs, type = "raw") + +#read in files as bsseq object (rmZeroCov=F, strandCollapse=T) +bs <- read.bedMethyl(files = infiles, + colData = DataFrame( + row.names = c("test_nanopore","test_pacbio")), + rmZeroCov = FALSE, + strandCollapse = TRUE, + verbose = FALSE, + output = "BSseq" +) + +#test class +expect_is(bs, "BSseq") +#test dimensions +expect_equal(dim(bs), c(1111L, 2L)) +#test strand collapse +expect_true(all(strand(bs) == "*")) +#test rm zeroCov +cov <- getCoverage(bs, type="Cov") +expect_false(any(rowAlls(cov == 0))) +#assay sums +expect_equal(sum(getBSseq(bs, type="M")), 26632L) +expect_equal(sum(getBSseq(bs, type="Cov")), 35708L) + +#read in files as bsseq object (rmZeroCov=T, strandCollapse=F) +bs <- read.bedMethyl(files = infiles, + colData = DataFrame( + row.names = c("test_nanopore","test_pacbio")), + rmZeroCov = TRUE, + strandCollapse = FALSE, + verbose = FALSE, + output = "BSseq" + ) + + #test class + expect_is(bs, "BSseq") + #test dimensions + expect_equal(dim(bs), c(1939L, 2L)) + #test rm zeroCov + cov <- getCoverage(bs, type="Cov") + expect_false(any(rowAlls(cov == 0))) + #assay sums + expect_equal(sum(getBSseq(bs, type="M")), 26632L) + expect_equal(sum(getBSseq(bs, type="Cov")), 35708L) + + #read in files as MethylCount object (Default: rmZeroCov=T, strandCollapse=T) + mc <- read.bedMethyl(files = infiles, + colData = DataFrame( + row.names = c("test_nanopore","test_pacbio")), + rmZeroCov = TRUE, + strandCollapse = TRUE, + verbose = FALSE, + output = "MethylCounts" + ) + + #test class + expect_is(mc, "MethylCounts") + #test dimensions + expect_equal(dim(mc), c(1111L, 2L)) + #test strand collapse + expect_true(all(strand(mc) == "*")) + #test rm zeroCov + cov <- getMethylCounts(mc, type="Cov") + expect_false(any(rowAlls(cov == 0))) + #assay sums + expect_equal(sum(getMethylCounts(mc, type="M")), 26632L) + expect_equal(sum(getMethylCounts(mc, type="Cov")), 35708L) + + #read in files as MethylCount object (rmZeroCov=F, strandCollapse=T) + mc <- read.bedMethyl(files = infiles, + colData = DataFrame( + row.names = c("test_nanopore","test_pacbio")), + rmZeroCov = FALSE, + strandCollapse = TRUE, + verbose = FALSE, + output = "MethylCounts" + ) + + #test class + expect_is(mc, "MethylCounts") + #test dimensions + expect_equal(dim(mc), c(1111L, 2L)) + #test strand collapse + expect_true(all(strand(mc) == "*")) + #test rm zeroCov + cov <- getMethylCounts(mc, type="Cov") + expect_false(any(rowAlls(cov == 0))) + #assay sums + expect_equal(sum(getMethylCounts(mc, type="M")), 26632L) + expect_equal(sum(getMethylCounts(mc, type="Cov")), 35708L) + + #read in files as MethylCount object (rmZeroCov=T, strandCollapse=F) + mc <- read.bedMethyl(files = infiles, + colData = DataFrame( + row.names = c("test_nanopore","test_pacbio")), + rmZeroCov = TRUE, + strandCollapse = FALSE, + verbose = FALSE, + output = "MethylCounts" + ) + + #test class + expect_is(mc, "MethylCounts") + #test dimensions + expect_equal(dim(mc), c(1939L, 2L)) + #test rm zeroCov + cov <- getMethylCounts(mc, type="Cov") + expect_false(any(rowAlls(cov == 0))) + #assay sums + expect_equal(sum(getMethylCounts(mc, type="M")), 26632L) + expect_equal(sum(getMethylCounts(mc, type="Cov")), 35708L) })