From 4c0d982a82b16669e43b69299803f26ed246b345 Mon Sep 17 00:00:00 2001 From: SrenBlikdal Date: Tue, 4 Nov 2025 11:37:13 +0100 Subject: [PATCH 1/5] Update R/read.bedMethyl.R from SrenBlikdal/MOD-lik-update to fix strandCollapse = F bug --- R/read.bedMethyl.R | 286 +++++++++++++++++++++++---------------------- 1 file changed, 146 insertions(+), 140 deletions(-) diff --git a/R/read.bedMethyl.R b/R/read.bedMethyl.R index e6c3654..a24d61a 100644 --- a/R/read.bedMethyl.R +++ b/R/read.bedMethyl.R @@ -1,34 +1,38 @@ -## +# read.bedMethyl files into a extended BSseq object .readbedMethylAsFWGRanges <- function(file, rmZeroCov = TRUE, - strandCollapse = TRUE, sort = TRUE, + strandCollapse = TRUE, sort = TRUE, nThread = 1L, verbose = FALSE) { stopifnot(isTRUEorFALSE(rmZeroCov)) stopifnot(isTRUEorFALSE(strandCollapse)) stopifnot(isTRUEorFALSE(sort)) M <- H <- C <- DE <- DI <- NO <- mod <- NULL if (rmZeroCov) { - dt <- .readbedMethylAsDT(file = file, - col_spec = "BSseq", + dt <- .readbedMethylAsDT(file = file, + col_spec = "BSseq", check = TRUE, verbose = verbose) - if (strandCollapse && !is.null(dt[["strand"]]) && - !dt[, all(strand == "*")]) { + if (strandCollapse && !is.null(dt[["strand"]]) && + !dt[, all(strand == "*")]) { dt[strand == "-", `:=`(start, start - 1L)][, `:=`(strand, NULL)] - dt <- unique(dt[, list(M = sum(M), H = sum(H), C = sum(C), D = sum(DE + DI + NO)), + dt <- unique(dt[, list(M = sum(M), H = sum(H), C = sum(C), D = sum(DE + DI + NO)), by = c("seqnames", "start")]) } - dt <- dt[(M + H + C) > 0][, `:=`(c("M", "H", "C"), - list(NULL, NULL, NULL))] + dt <- dt[(M + H + C) > 0][, `:=`(c("M", "H", "C"), + list(NULL, NULL, NULL))] } else { - dt <- .readbedMethylAsDT(file = file, - col_spec = "BSseq", - check = TRUE, - nThread = nThread, + dt <- .readbedMethylAsDT(file = file, + col_spec = "BSseq", + check = TRUE, + nThread = nThread, verbose = verbose) - if (strandCollapse && !is.null(dt[["strand"]]) && !dt[,all(strand == "*")]) - { + if (strandCollapse && !is.null(dt[["strand"]]) && + !dt[, all(strand == "*")]) { dt[strand == "-", `:=`(start, start - 1L)][, `:=`(strand, NULL)] - dt <- data.table:::funique(dt) + dt <- unique(dt[, list(M = sum(M), H = sum(H), C = sum(C), D = sum(DE + DI + NO)), + by = c("seqnames", "start")]) + } else { + dt <- unique(dt[, list(M = sum(M), H = sum(H), C = sum(C), D = sum(DE + DI + NO)), + by = c("seqnames", "start", "strand")]) } } if (sort) { @@ -41,7 +45,7 @@ seqnames <- Rle(dt[["seqnames"]]) dt[, `:=`(seqnames, NULL)] seqinfo <- Seqinfo(seqnames = levels(seqnames)) - ranges <- .FWIRanges(start = dt[["start"]], width = 1L) #Check how this compares with bed format + ranges <- .FWIRanges(start = dt[["start"]], width = 1L) dt[, `:=`(start, NULL)] mcols <- make_zero_col_DFrame(length(ranges)) if (is.null(dt[["strand"]])) { @@ -51,10 +55,10 @@ dt[, `:=`(strand, NULL)] } fwgranges <- .FWGRanges( - seqnames = seqnames, - ranges = ranges, - strand = strand, - seqinfo = seqinfo, + seqnames = seqnames, + ranges = ranges, + strand = strand, + seqinfo = seqinfo, elementMetadata = mcols) if (sort) { fwgranges <- sort(sortSeqlevels(fwgranges)) @@ -63,11 +67,11 @@ } ## Right now this function use GRanges instead of FWGRanges -.constructFWGRangesFrombedMethylFiles <- function(files, - rmZeroCov, +.constructFWGRangesFrombedMethylFiles <- function(files, + rmZeroCov, strandCollapse, - verbose, - nThread, + verbose, + nThread, BPPARAM) { subverbose <- max(as.integer(verbose) - 1L, 0L) if (verbose) { @@ -75,27 +79,27 @@ message(writeLines(c(files)), sep = "\n") } loci_from_first_file <- .readbedMethylAsFWGRanges( - file = files[[1L]], - rmZeroCov = rmZeroCov, - strandCollapse = strandCollapse, - nThread = nThread, + file = files[[1L]], + rmZeroCov = rmZeroCov, + strandCollapse = strandCollapse, + nThread = nThread, verbose = subverbose) if (is(BPPARAM, "SnowParam") && bpprogressbar(BPPARAM)) { bptasks(BPPARAM) <- length(files) - 1L } - list_of_loci_from_other_files_not_in_first_file <- bplapply(files[-1L], + list_of_loci_from_other_files_not_in_first_file <- bplapply(files[-1L], function(file, loci_from_first_file) { loci_from_this_file <- .readbedMethylAsFWGRanges( - file = file, - rmZeroCov = rmZeroCov, - strandCollapse = strandCollapse, + file = file, + rmZeroCov = rmZeroCov, + strandCollapse = strandCollapse, verbose = subverbose) subsetByOverlaps( x = loci_from_this_file, ranges = loci_from_first_file, - type = "equal", + type = "equal", invert = TRUE) - }, loci_from_first_file = loci_from_first_file, + }, loci_from_first_file = loci_from_first_file, BPPARAM = BPPARAM) loci_non_found_in_first_file <- unique( do.call(c, list_of_loci_from_other_files_not_in_first_file)) @@ -106,7 +110,7 @@ ## .readbedMethylAsDT <- function(file, col_spec = c("all", "BSseq", "GRanges"), - check = FALSE, + check = FALSE, showProgress = FALSE, nThread = 1L, verbose = FALSE) { @@ -115,9 +119,9 @@ stopifnot(isTRUEorFALSE(check)) stopifnot(isTRUEorFALSE(showProgress)) fread_verbose <- as.logical(max(verbose - 1L, 0L)) - + header <- FALSE - colClasses <- c("factor", "integer", "NULL", "factor", "NULL", + colClasses <- c("factor", "integer", "NULL", "factor", "NULL", "factor", "NULL", "NULL", "NULL", "NULL", "NULL", "integer", "integer", "integer", "integer", "NULL", "integer", "integer") @@ -130,26 +134,27 @@ if (verbose) { message("[.readbedMethylAsDT] Reading file ...") } - x <- fread(input = file, - sep = "\t", + x <- fread(input = file, + sep = "\t", header = header, - verbose = fread_verbose, - drop = drop, - colClasses = colClasses, - col.names = col.names, - quote = "", - showProgress = showProgress, + verbose = fread_verbose, + drop = drop, + colClasses = colClasses, + col.names = col.names, + quote = "", + showProgress = showProgress, nThread = nThread) - + if (!is.null(x[["strand"]])) { + x[strand == ".", strand := "*"] x[, `:=`(strand, strand(strand))] } setkey(x, mod) - x<-x[.("m")] # remove all other mods except 5mC (they are still included in the count) + x<-x[.("m")] # remove all other mods except 5mC (they are still included in the count) if (verbose) { message("Reading in 5mC and 5hmC") } - + M <- H <- C <- DE <- DI <- NO <- mod <- . <- NULL if (check && all(c("M", "H", "C") %in% colnames(x))) { if (verbose) { @@ -157,7 +162,7 @@ } valid <- x[, isTRUE(all(M >= 0L & C >= 0L & H >= 0L & DE >= 0L & DI >= 0L & NO >= 0L))] if (!valid) { - stop("[.readbedMethylAsDT] Invalid counts detected.\n", + stop("[.readbedMethylAsDT] Invalid counts detected.\n", "'M', 'H' and 'C' columns should be non-negative integers.") } else { @@ -178,22 +183,25 @@ ## .constructCountsFromSinglebedMethylFile <- function(b, files, loci, strandCollapse, grid, M_sink, H_sink, C_sink, D_sink, - Cov_sink, sink_lock, + Cov_sink, sink_lock, nThread, verbose) { subverbose <- max(as.integer(verbose) - 1L, 0L) M <- H <- C <- DE <- DI <- NO <- mod <- NULL file <- files[b] if (verbose) { - message("[.constructCountsFrombedMethylFileAndFWGRanges] Extracting ", + message("[.constructCountsFrombedMethylFileAndFWGRanges] Extracting ", "counts from ", "'", file, "'") } - dt <- .readbedMethylAsDT(file = file, col_spec = "BSseq", check = TRUE, + dt <- .readbedMethylAsDT(file = file, col_spec = "BSseq", check = TRUE, nThread = nThread, verbose = subverbose) - if (strandCollapse && !is.null(dt[["strand"]]) && + if (strandCollapse && !is.null(dt[["strand"]]) && !dt[, all(strand == "*")]) { dt[strand == "-", `:=`(start, start - 1L)][, `:=`(strand, NULL)] - dt <- unique(dt[, list(M = sum(M), H = sum(H), C = sum(C), D = sum(DE + DI + NO)), + dt <- unique(dt[, list(M = sum(M), H = sum(H), C = sum(C), D = sum(DE + DI + NO)), by = c("seqnames", "start")]) + } else { + dt <- unique(dt[, list(M = sum(M), H = sum(H), C = sum(C), D = sum(DE + DI + NO)), + by = c("seqnames", "start", "strand")]) } seqnames <- Rle(dt[["seqnames"]]) dt[, `:=`(seqnames, NULL)] @@ -210,9 +218,9 @@ } loci_from_this_sample <- .FWGRanges( seqnames = seqnames, - ranges = ranges, - strand = strand, - seqinfo = seqinfo, + ranges = ranges, + strand = strand, + seqinfo = seqinfo, elementMetadata = mcols) ol <- findOverlaps(loci_from_this_sample, loci, type = "equal") M <- matrix(rep(0L, length(loci)), ncol = 1) @@ -259,71 +267,71 @@ else if (BACKEND == "HDF5Array") { h5_path <- file.path(dir, "assays.h5") M_sink <- HDF5RealizationSink( - dim = ans_dim, - dimnames = NULL, - type = "integer", - filepath = h5_path, - name = "M", - chunkdim = chunkdim, + dim = ans_dim, + dimnames = NULL, + type = "integer", + filepath = h5_path, + name = "M", + chunkdim = chunkdim, level = level) on.exit(close(M_sink), add = TRUE) H_sink <- HDF5RealizationSink( - dim = ans_dim, - dimnames = NULL, - type = "integer", - filepath = h5_path, - name = "H", - chunkdim = chunkdim, + dim = ans_dim, + dimnames = NULL, + type = "integer", + filepath = h5_path, + name = "H", + chunkdim = chunkdim, level = level) on.exit(close(H_sink), add = TRUE) C_sink <- HDF5RealizationSink( - dim = ans_dim, - dimnames = NULL, - type = "integer", - filepath = h5_path, - name = "C", - chunkdim = chunkdim, + dim = ans_dim, + dimnames = NULL, + type = "integer", + filepath = h5_path, + name = "C", + chunkdim = chunkdim, level = level) on.exit(close(C_sink), add = TRUE) D_sink <- HDF5RealizationSink( - dim = ans_dim, - dimnames = NULL, - type = "integer", - filepath = h5_path, - name = "D", - chunkdim = chunkdim, + dim = ans_dim, + dimnames = NULL, + type = "integer", + filepath = h5_path, + name = "D", + chunkdim = chunkdim, level = level) on.exit(close(D_sink), add = TRUE) Cov_sink <- HDF5RealizationSink( - dim = ans_dim, - dimnames = NULL, - type = "integer", - filepath = h5_path, - name = "Cov", - chunkdim = chunkdim, + dim = ans_dim, + dimnames = NULL, + type = "integer", + filepath = h5_path, + name = "Cov", + chunkdim = chunkdim, level = level) on.exit(close(Cov_sink), add = TRUE) sink_lock <- ipcid() on.exit(ipcremove(sink_lock), add = TRUE) } else { M_sink <- DelayedArray::AutoRealizationSink( - dim = ans_dim, + dim = ans_dim, type = "integer") on.exit(close(M_sink), add = TRUE) H_sink <- DelayedArray::AutoRealizationSink( - dim = ans_dim, + dim = ans_dim, type = "integer") on.exit(close(H_sink), add = TRUE) C_sink <- DelayedArray::AutoRealizationSink( - dim = ans_dim, + dim = ans_dim, type = "integer") on.exit(close(C_sink), add = TRUE) D_sink <- DelayedArray::AutoRealizationSink( - dim = ans_dim, + dim = ans_dim, type = "integer") on.exit(close(D_sink), add = TRUE) Cov_sink <- DelayedArray::AutoRealizationSink( - dim = ans_dim, + dim = ans_dim, type = "integer") on.exit(close(Cov_sink), add = TRUE) sink_lock <- ipcid() @@ -332,20 +340,20 @@ if (is(BPPARAM, "SnowParam") && bpprogressbar(BPPARAM)) { bptasks(BPPARAM) <- length(grid) } - counts <- bptry(bplapply(X = seq_along(grid), - FUN = .constructCountsFromSinglebedMethylFile, - files = files, - loci = loci, - strandCollapse = strandCollapse, - grid = grid, - M_sink = M_sink, + counts <- bptry(bplapply(X = seq_along(grid), + FUN = .constructCountsFromSinglebedMethylFile, + files = files, + loci = loci, + strandCollapse = strandCollapse, + grid = grid, + M_sink = M_sink, H_sink = H_sink, C_sink = C_sink, D_sink = D_sink, - Cov_sink = Cov_sink, - sink_lock = sink_lock, - verbose = subverbose, - nThread = nThread, + Cov_sink = Cov_sink, + sink_lock = sink_lock, + verbose = subverbose, + nThread = nThread, BPPARAM = BPPARAM)) if (!all(bpok(counts))) { stop(".constructbedMethylCounts() encountered errors for these files:\n ", @@ -372,17 +380,17 @@ return(list(M = M, H = H, C = C, D = D, Cov = Cov)) } -## +## read.bedMethyl <- function (files, loci = NULL, colData = NULL, - rmZeroCov = TRUE, + rmZeroCov = TRUE, strandCollapse = TRUE, BPPARAM = bpparam(), - BACKEND = NULL, + BACKEND = NULL, dir = tempfile("BSseq"), replace = FALSE, - chunkdim = NULL, + chunkdim = NULL, level = NULL, nThread = 1L, verbose = getOption("verbose")) { @@ -391,7 +399,7 @@ read.bedMethyl <- function (files, } file_exists <- file.exists(files) if (!isTRUE(all(file_exists))) { - stop("These files cannot be found:\n ", paste(files[!file_exists], + stop("These files cannot be found:\n ", paste(files[!file_exists], collapse = "\n ")) } if (!is.null(loci)) { @@ -415,22 +423,22 @@ read.bedMethyl <- function (files, setAutoRealizationBackend(BACKEND) if (!.areBackendsInMemory(BACKEND)) { if (!.isSingleMachineBackend(BPPARAM)) { - stop("The parallelisation strategy must use a single machine ", - "when using an on-disk realization backend.\n", + stop("The parallelisation strategy must use a single machine ", + "when using an on-disk realization backend.\n", "See help(\"read.bismark\") for details.", call. = FALSE) } } else { if (!is.null(BACKEND)) { - stop("The '", BACKEND, "' realization backend is not supported.", - "\n See help(\"read.bismark\") for details.", + stop("The '", BACKEND, "' realization backend is not supported.", + "\n See help(\"read.bismark\") for details.", call. = FALSE) } } if (identical(BACKEND, "HDF5Array")) { if (!isSingleString(dir)) { - stop(wmsg("'dir' must be a single string specifying the path to ", - "the directory where to save the BSseq object (the ", + stop(wmsg("'dir' must be a single string specifying the path to ", + "the directory where to save the BSseq object (the ", "directory will be created).")) } if (!isTRUEorFALSE(replace)) { @@ -450,12 +458,12 @@ read.bedMethyl <- function (files, message("[read.bedMethyl] Parsing files and constructing valid loci ...") } loci <- .constructFWGRangesFrombedMethylFiles( - files = files, - rmZeroCov = rmZeroCov, - strandCollapse = strandCollapse, - verbose = verbose, - nThread = nThread, - BPPARAM = BPPARAM) + files = files, + rmZeroCov = rmZeroCov, + strandCollapse = strandCollapse, + verbose = verbose, + nThread = nThread, + BPPARAM = BPPARAM) ptime2 <- proc.time() stime <- (ptime2 - ptime1)[3] if (verbose) { @@ -480,16 +488,16 @@ read.bedMethyl <- function (files, } if (rmZeroCov) { if (verbose) { - message("[read.bedMethyl] Parsing files to identify elements of ", + message("[read.bedMethyl] Parsing files to identify elements of ", "'loci' with non-zero coverage ...") } ptime1 <- proc.time() loci_from_files <- .constructFWGRangesFrombedMethylFiles( - files = files, - rmZeroCov = rmZeroCov, - strandCollapse = strandCollapse, - verbose = subverbose, - nThread = nThread, + files = files, + rmZeroCov = rmZeroCov, + strandCollapse = strandCollapse, + verbose = subverbose, + nThread = nThread, BPPARAM = BPPARAM) loci <- subsetByOverlaps(loci, loci_from_files, type = "equal") ptime2 <- proc.time() @@ -501,19 +509,19 @@ read.bedMethyl <- function (files, } ptime1 <- proc.time() if (verbose) { - message("[read.bedMethyl] Parsing files and constructing", + message("[read.bedMethyl] Parsing files and constructing", "'M', 'H', 'C', 'D' and 'Cov' ", "matrices ...") } counts <- .constructbedMethylCounts( - files = files, - loci = loci, - strandCollapse = strandCollapse, - BPPARAM = BPPARAM, - BACKEND = BACKEND, - dir = dir, - chunkdim = chunkdim, - level = level, - nThread = nThread, + files = files, + loci = loci, + strandCollapse = strandCollapse, + BPPARAM = BPPARAM, + BACKEND = BACKEND, + dir = dir, + chunkdim = chunkdim, + level = level, + nThread = nThread, verbose = subverbose) ptime2 <- proc.time() stime <- (ptime2 - ptime1)[3] @@ -523,7 +531,7 @@ read.bedMethyl <- function (files, if (verbose) { message("[read.bedMethyl] Constructing BSseq object ... ") } - se <- SummarizedExperiment(assays = counts, rowRanges = as(loci, + se <- SummarizedExperiment(assays = counts, rowRanges = as(loci, "GRanges"), colData = colData) bsseq <- new2("BSseq", se, check = FALSE) if (!is.null(BACKEND) && BACKEND == "HDF5Array") { @@ -533,5 +541,3 @@ read.bedMethyl <- function (files, } bsseq } - - From e6f80196cdfd971c7c65199c1dfb5e05960841ed Mon Sep 17 00:00:00 2001 From: SrenBlikdal Date: Tue, 4 Nov 2025 11:49:36 +0100 Subject: [PATCH 2/5] Change C for canonincal to U for unmethylated in R/read.bedMethyl --- R/read.bedMethyl.R | 48 +++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/R/read.bedMethyl.R b/R/read.bedMethyl.R index a24d61a..5eb1cba 100644 --- a/R/read.bedMethyl.R +++ b/R/read.bedMethyl.R @@ -5,7 +5,7 @@ stopifnot(isTRUEorFALSE(rmZeroCov)) stopifnot(isTRUEorFALSE(strandCollapse)) stopifnot(isTRUEorFALSE(sort)) - M <- H <- C <- DE <- DI <- NO <- mod <- NULL + M <- H <- U <- DE <- DI <- NO <- mod <- NULL if (rmZeroCov) { dt <- .readbedMethylAsDT(file = file, col_spec = "BSseq", @@ -14,10 +14,10 @@ if (strandCollapse && !is.null(dt[["strand"]]) && !dt[, all(strand == "*")]) { dt[strand == "-", `:=`(start, start - 1L)][, `:=`(strand, NULL)] - dt <- unique(dt[, list(M = sum(M), H = sum(H), C = sum(C), D = sum(DE + DI + NO)), + dt <- unique(dt[, list(M = sum(M), H = sum(H), U= sum(U), D = sum(DE + DI + NO)), by = c("seqnames", "start")]) } - dt <- dt[(M + H + C) > 0][, `:=`(c("M", "H", "C"), + dt <- dt[(M + H + U) > 0][, `:=`(c("M", "H", "U"), list(NULL, NULL, NULL))] } else { dt <- .readbedMethylAsDT(file = file, @@ -28,10 +28,10 @@ if (strandCollapse && !is.null(dt[["strand"]]) && !dt[, all(strand == "*")]) { dt[strand == "-", `:=`(start, start - 1L)][, `:=`(strand, NULL)] - dt <- unique(dt[, list(M = sum(M), H = sum(H), C = sum(C), D = sum(DE + DI + NO)), + dt <- unique(dt[, list(M = sum(M), H = sum(H), U = sum(U), D = sum(DE + DI + NO)), by = c("seqnames", "start")]) } else { - dt <- unique(dt[, list(M = sum(M), H = sum(H), C = sum(C), D = sum(DE + DI + NO)), + dt <- unique(dt[, list(M = sum(M), H = sum(H), U = sum(U), D = sum(DE + DI + NO)), by = c("seqnames", "start", "strand")]) } } @@ -126,7 +126,7 @@ "NULL", "integer", "integer", "integer", "integer", "NULL", "integer", "integer") drop <- c(3L, 5L, 7L, 8L, 9L, 10L, 11L, 16L) - col.names <- c("seqnames", "start", "mod", "strand", "M", "C", "H", "DE", "DI", "NO") + col.names <- c("seqnames", "start", "mod", "strand", "M", "U", "H", "DE", "DI", "NO") if (verbose) { message("[.readbedMethylAsDT] Parsing '", file, "'") } @@ -155,15 +155,15 @@ message("Reading in 5mC and 5hmC") } - M <- H <- C <- DE <- DI <- NO <- mod <- . <- NULL - if (check && all(c("M", "H", "C") %in% colnames(x))) { + M <- H <- U <- DE <- DI <- NO <- mod <- . <- NULL + if (check && all(c("M", "H", "U") %in% colnames(x))) { if (verbose) { message("[.readbedMethylAsDT] Checking validity of counts in file.") } - valid <- x[, isTRUE(all(M >= 0L & C >= 0L & H >= 0L & DE >= 0L & DI >= 0L & NO >= 0L))] + valid <- x[, isTRUE(all(M >= 0L & U >= 0L & H >= 0L & DE >= 0L & DI >= 0L & NO >= 0L))] if (!valid) { stop("[.readbedMethylAsDT] Invalid counts detected.\n", - "'M', 'H' and 'C' columns should be non-negative integers.") + "'M', 'H' and 'U' columns should be non-negative integers.") } else { if (verbose) { @@ -186,7 +186,7 @@ Cov_sink, sink_lock, nThread, verbose) { subverbose <- max(as.integer(verbose) - 1L, 0L) - M <- H <- C <- DE <- DI <- NO <- mod <- NULL + M <- H <- U <- DE <- DI <- NO <- mod <- NULL file <- files[b] if (verbose) { message("[.constructCountsFrombedMethylFileAndFWGRanges] Extracting ", @@ -197,10 +197,10 @@ if (strandCollapse && !is.null(dt[["strand"]]) && !dt[, all(strand == "*")]) { dt[strand == "-", `:=`(start, start - 1L)][, `:=`(strand, NULL)] - dt <- unique(dt[, list(M = sum(M), H = sum(H), C = sum(C), D = sum(DE + DI + NO)), + dt <- unique(dt[, list(M = sum(M), H = sum(H), U = sum(U), D = sum(DE + DI + NO)), by = c("seqnames", "start")]) } else { - dt <- unique(dt[, list(M = sum(M), H = sum(H), C = sum(C), D = sum(DE + DI + NO)), + dt <- unique(dt[, list(M = sum(M), H = sum(H), U = sum(U), D = sum(DE + DI + NO)), by = c("seqnames", "start", "strand")]) } seqnames <- Rle(dt[["seqnames"]]) @@ -225,22 +225,22 @@ ol <- findOverlaps(loci_from_this_sample, loci, type = "equal") M <- matrix(rep(0L, length(loci)), ncol = 1) H <- matrix(rep(0L, length(loci)), ncol = 1) - C <- matrix(rep(0L, length(loci)), ncol = 1) + U <- matrix(rep(0L, length(loci)), ncol = 1) D <- matrix(rep(0L, length(loci)), ncol = 1) Cov <- matrix(rep(0L, length(loci)), ncol = 1) M[subjectHits(ol)] <- dt[queryHits(ol), ][["M"]] H[subjectHits(ol)] <- dt[queryHits(ol), ][["H"]] - C[subjectHits(ol)] <- dt[queryHits(ol), ][["C"]] + U[subjectHits(ol)] <- dt[queryHits(ol), ][["U"]] D[subjectHits(ol)] <- dt[queryHits(ol), ][["D"]] - Cov[subjectHits(ol)] <- dt[queryHits(ol), list(Cov = (M + H + C))][["Cov"]] + Cov[subjectHits(ol)] <- dt[queryHits(ol), list(Cov = (M + H + U))][["Cov"]] if (is.null(M_sink)) { - return(list(M = M, H = H, C = C, D = D, Cov = Cov)) + return(list(M = M, H = H, U = U, D = D, Cov = Cov)) } viewport <- grid[[b]] ipclock(sink_lock) write_block(M_sink, viewport = viewport, block = M) write_block(H_sink, viewport = viewport, block = H) - write_block(C_sink, viewport = viewport, block = C) + write_block(C_sink, viewport = viewport, block = U) write_block(D_sink, viewport = viewport, block = D) write_block(Cov_sink, viewport = viewport, block = Cov) ipcunlock(sink_lock) @@ -289,7 +289,7 @@ dimnames = NULL, type = "integer", filepath = h5_path, - name = "C", + name = "U", chunkdim = chunkdim, level = level) on.exit(close(C_sink), add = TRUE) @@ -364,8 +364,8 @@ attr(M, "dim") <- ans_dim H <- do.call(c, lapply(counts, "[[", "H")) attr(H, "dim") <- ans_dim - C <- do.call(c, lapply(counts, "[[", "C")) - attr(C, "dim") <- ans_dim + U <- do.call(c, lapply(counts, "[[", "U")) + attr(U, "dim") <- ans_dim D <- do.call(c, lapply(counts, "[[", "D")) attr(D, "dim") <- ans_dim Cov <- do.call(c, lapply(counts, "[[", "Cov")) @@ -373,11 +373,11 @@ } else { M <- as(M_sink, "DelayedArray") H <- as(H_sink, "DelayedArray") - C <- as(C_sink, "DelayedArray") + U <- as(C_sink, "DelayedArray") D <- as(D_sink, "DelayedArray") Cov <- as(Cov_sink, "DelayedArray") } - return(list(M = M, H = H, C = C, D = D, Cov = Cov)) + return(list(M = M, H = H, U = U, D = D, Cov = Cov)) } ## @@ -510,7 +510,7 @@ read.bedMethyl <- function (files, ptime1 <- proc.time() if (verbose) { message("[read.bedMethyl] Parsing files and constructing", - "'M', 'H', 'C', 'D' and 'Cov' ", "matrices ...") + "'M', 'H', 'U', 'D' and 'Cov' ", "matrices ...") } counts <- .constructbedMethylCounts( files = files, From 3a0c50260d9f01814e44c6a226eb7eacc8aedb6d Mon Sep 17 00:00:00 2001 From: SrenBlikdal Date: Tue, 4 Nov 2025 14:08:17 +0100 Subject: [PATCH 3/5] add alpha version of read.bedMethyl which can output MethylCount object. Default is still a BSseq object and is not modified in this commit --- R/read.bedMethyl.R | 12 +++++++++++- man/read.bedMethyl.Rd | 8 +++++--- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/R/read.bedMethyl.R b/R/read.bedMethyl.R index 5eb1cba..7a88af5 100644 --- a/R/read.bedMethyl.R +++ b/R/read.bedMethyl.R @@ -393,7 +393,9 @@ read.bedMethyl <- function (files, chunkdim = NULL, level = NULL, nThread = 1L, + output = c("BSseq", "MethylCount"), verbose = getOption("verbose")) { + output <- match.arg(output) if (anyDuplicated(files)) { stop("'files' cannot have duplicate entries.") } @@ -529,8 +531,15 @@ read.bedMethyl <- function (files, message("Done in ", round(stime, 1), " secs") } if (verbose) { - message("[read.bedMethyl] Constructing BSseq object ... ") + message("[read.bedMethyl] Constructing ", if(output=="MethylCount") "MethylCount" else "BSseq", " object ... ") } + if (output == "MethylCount") { + assays_mc <- list(M = counts$M, H = counts$H, U = counts$U, D = counts$D) + se <- SummarizedExperiment(assays = assays_mc, rowRanges = as(loci, "GRanges"), colData = colData) + mc <- new2("MethylCounts", se, check = FALSE) + return(mc) + } + else { se <- SummarizedExperiment(assays = counts, rowRanges = as(loci, "GRanges"), colData = colData) bsseq <- new2("BSseq", se, check = FALSE) @@ -540,4 +549,5 @@ read.bedMethyl <- function (files, saveRDS(x, file = file.path(dir, "se.rds")) } bsseq + } } diff --git a/man/read.bedMethyl.Rd b/man/read.bedMethyl.Rd index 3fdeff5..d313cf9 100644 --- a/man/read.bedMethyl.Rd +++ b/man/read.bedMethyl.Rd @@ -19,6 +19,7 @@ read.bedMethyl(files, chunkdim = NULL, level = NULL, nThread = 1L, + output = c("BSseq", "MethylCount"), verbose = getOption("verbose")) } \arguments{ @@ -34,13 +35,14 @@ read.bedMethyl(files, \item{chunkdim}{\strong{Only applicable if \code{BACKEND == "HDF5Array"}.} The dimensions of the chunks to use for writing the data to disk.} \item{level}{The compression level to use for writing the data to disk.} \item{nThread}{The number of threads used by \code{\link[data.table]{fread}} when reading the \code{files}.} + \item{output}{The type of output returned when reading the \code{files}. Allowed values are "BSseq" and "MethylCount")} \item{verbose}{A \code{logical(1)} indicating whether progress messages should be printed (default \code{TRUE}).} } \section{File formats}{ The format of each file should be similar to the examples in [link to preprint]. Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will be automatically decompressed to \code{\link{tempdir}()}. \subsection{Supported file formats}{ - Modkit bedMethyl files from modkit pileup. For downstream likelihood functions we recommend running modkit pileup on output from bam files modification/basecalled using a CG context model and not using a reference genome for pileup. + Modkit bedMethyl files from modkit pileup. For downstream likelihood functions we recommend running modkit pileup on output from bam files modification/basecalled using a CG context model and not using a reference genome for pileup. } \subsection{Unsupported file formats}{ Other types of output. @@ -60,7 +62,7 @@ infiles <- c(system.file("extdata/HG002_nanopore_test.bedMethyl.gz", # Run the function to import data bsseq <- read.bedMethyl(files = infiles, - colData = DataFrame(row.names = c("test_nanopore", + colData = DataFrame(row.names = c("test_nanopore", "test_pacbio")), rmZeroCov = FALSE, strandCollapse = TRUE, @@ -71,5 +73,5 @@ bsseq } \author{ - Søren Blikdal Hansen (soren.blikdal.hansen@sund.ku.dk) + Søren Blikdal Hansen (soren.blikdal.hansen@sund.ku.dk) } From 3d20b1ebc01fc3ae2636e8f95a71edb70bf6e055 Mon Sep 17 00:00:00 2001 From: SrenBlikdal Date: Tue, 4 Nov 2025 14:17:04 +0100 Subject: [PATCH 4/5] version bump --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index fcb86df..2047ecf 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: bsseq -Version: 1.47.2 +Version: 1.47.3 Encoding: UTF-8 Title: Analyze, manage and store whole-genome methylation data Description: A collection of tools for analyzing and visualizing whole-genome From c734a5cc949ad72d3c3047f1cbbef1d950001657 Mon Sep 17 00:00:00 2001 From: SrenBlikdal Date: Tue, 4 Nov 2025 15:13:21 +0100 Subject: [PATCH 5/5] Fix MethylCount/MethylCounts typo --- R/read.bedMethyl.R | 6 +++--- man/read.bedMethyl.Rd | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/R/read.bedMethyl.R b/R/read.bedMethyl.R index 7a88af5..25d8dfb 100644 --- a/R/read.bedMethyl.R +++ b/R/read.bedMethyl.R @@ -393,7 +393,7 @@ read.bedMethyl <- function (files, chunkdim = NULL, level = NULL, nThread = 1L, - output = c("BSseq", "MethylCount"), + output = c("BSseq", "MethylCounts"), verbose = getOption("verbose")) { output <- match.arg(output) if (anyDuplicated(files)) { @@ -531,9 +531,9 @@ read.bedMethyl <- function (files, message("Done in ", round(stime, 1), " secs") } if (verbose) { - message("[read.bedMethyl] Constructing ", if(output=="MethylCount") "MethylCount" else "BSseq", " object ... ") + message("[read.bedMethyl] Constructing ", if(output=="MethylCounts") "MethylCounts" else "BSseq", " object ... ") } - if (output == "MethylCount") { + if (output == "MethylCounts") { assays_mc <- list(M = counts$M, H = counts$H, U = counts$U, D = counts$D) se <- SummarizedExperiment(assays = assays_mc, rowRanges = as(loci, "GRanges"), colData = colData) mc <- new2("MethylCounts", se, check = FALSE) diff --git a/man/read.bedMethyl.Rd b/man/read.bedMethyl.Rd index d313cf9..1d7fa8e 100644 --- a/man/read.bedMethyl.Rd +++ b/man/read.bedMethyl.Rd @@ -19,7 +19,7 @@ read.bedMethyl(files, chunkdim = NULL, level = NULL, nThread = 1L, - output = c("BSseq", "MethylCount"), + output = c("BSseq", "MethylCounts"), verbose = getOption("verbose")) } \arguments{ @@ -35,7 +35,7 @@ read.bedMethyl(files, \item{chunkdim}{\strong{Only applicable if \code{BACKEND == "HDF5Array"}.} The dimensions of the chunks to use for writing the data to disk.} \item{level}{The compression level to use for writing the data to disk.} \item{nThread}{The number of threads used by \code{\link[data.table]{fread}} when reading the \code{files}.} - \item{output}{The type of output returned when reading the \code{files}. Allowed values are "BSseq" and "MethylCount")} + \item{output}{The type of output returned when reading the \code{files}. Allowed values are "BSseq" and "MethylCounts")} \item{verbose}{A \code{logical(1)} indicating whether progress messages should be printed (default \code{TRUE}).} }