diff --git a/DESCRIPTION b/DESCRIPTION index d8c372c..d5ddd31 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -12,7 +12,7 @@ Authors@R: c(person(c("Kasper", "Daniel"), "Hansen", role = c("cre", "aut"), person("Jean-Philippe", "Fortin", role = "ctb"), person("Tim", "Triche", role = "ctb"), person(c("Shan", "V."), "Andrews", role = "ctb")) -Depends: methods, +Depends: methods, BiocGenerics (>= 0.15.3), GenomicRanges, SummarizedExperiment (>= 1.1.6), @@ -28,7 +28,7 @@ Suggests: IlluminaHumanMethylation450kmanifest (>= 0.2.0), BiocStyle, knitr, rmarkdown -Imports: S4Vectors, +Imports: S4Vectors, GenomeInfoDb, Biobase (>= 2.33.2), IRanges, @@ -53,7 +53,7 @@ Imports: S4Vectors, grDevices, graphics, utils -Collate: generics.R +Collate: generics.R mset.R rset.R rgset.R @@ -74,6 +74,7 @@ Collate: generics.R preprocessQuantile.R preprocessNoob.R preprocessFunnorm.R + tcgaPipeline.R qc.R read.450k.R read.meth.R @@ -91,3 +92,4 @@ BugReports: https://github.com/kasperdanielhansen/minfi/issues LazyData: yes biocViews: DNAMethylation, DifferentialMethylation, Epigenetics, Microarray, MethylationArray, MultiChannel, TwoChannel, DataImport, Normalization, Preprocessing, QualityControl +RoxygenNote: 6.0.1 diff --git a/NAMESPACE b/NAMESPACE index 4979537..9467074 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -48,7 +48,7 @@ export(getManifest, getProbeInfo, getManifestInfo, IlluminaMethylationManifest, IlluminaMethylationAnnotation, getControlAddress, getRed, getGreen, getNBeads, getMeth, getUnmeth, - getBeta, getM, getCN, getMethSignal, + getBeta, getM, getCN, getSnpBeta, getMethSignal, getOOB, getSnpBeta, dropMethylationLoci, getLocations, getAnnotation, getAnnotationObject, @@ -57,7 +57,7 @@ export(getManifest, getProbeInfo, getManifestInfo, normalize.illumina.control, bgcorrect.illumina, preprocessRaw, preprocessIllumina, preprocessSWAN, preprocessQuantile, - preprocessFunnorm, preprocessNoob, + preprocessFunnorm, preprocessNoob, tcgaPipeline, RGChannelSet, RGChannelSetExtended, MethylSet, RatioSet, GenomicMethylSet, GenomicRatioSet, @@ -80,8 +80,9 @@ exportClasses(RGChannelSet, RGChannelSetExtended, GenomicMethylSet, GenomicRatioSet, IlluminaMethylationManifest, IlluminaMethylationAnnotation) -exportMethods(show, initialize, getBeta, getM, getCN, getMeth, getUnmeth, - getManifest, annotation, "annotation<-", preprocessMethod, +exportMethods(show, initialize, + getBeta, getM, getCN, getSnpBeta, getMeth, getUnmeth, getManifest, + annotation, "annotation<-", preprocessMethod, combine, mapToGenome, ratioConvert, bumphunter, pData, "pData<-", sampleNames, "sampleNames<-", featureNames, "featureNames<-", coerce) diff --git a/R/generics.R b/R/generics.R index db4aa47..611457e 100644 --- a/R/generics.R +++ b/R/generics.R @@ -8,5 +8,6 @@ setGeneric("getManifest", function(object) standardGeneric("getManifest")) ## setGeneric("getLocations", function(object, ...) standardGeneric("getLocations")) setGeneric("preprocessMethod", function(object) standardGeneric("preprocessMethod")) setGeneric("getCN", function(object, ...) standardGeneric("getCN")) +setGeneric("getSnpBeta", function(object) standardGeneric("getSnpBeta")) setGeneric("convertArray", function(object, ...) standardGeneric("convertArray")) setGeneric("combineArrays", function(object1, object2, ...) standardGeneric("combineArrays")) diff --git a/R/grset.R b/R/grset.R index 66797a8..c297654 100644 --- a/R/grset.R +++ b/R/grset.R @@ -70,6 +70,28 @@ setMethod("getCN", signature(object = "GenomicRatioSet"), return(NULL) }) +# FIXME: enforce something a bit less lackadaisical? +setMethod("getSnpBeta", signature(object = "GenomicRatioSet"), + function (object) { + if (!all(colnames(object) %in% colnames(metadata(object)$SNPs))) { + missed <- setdiff(colnames(object), + colnames(metadata(object)$SNPs)) + stop("Error: columns for ", paste(missed, sep=", "), + " are missing from metadata(object)$SNPs.") + } + if ("SNPs" %in% names(metadata(object))) { + if (!identical(colnames(object), + colnames(metadata(object)$SNPs))) { + message("colnames(metadata(object)$SNPs) != colnames(object)") + message("You should probably fix this if you use these SNPs.") + } + return(metadata(object)$SNPs[, colnames(object)]) + } else { + message("No SNPs found in your GenomicRatioSet metadata().") + return(NULL) + } + }) + setMethod("mapToGenome", signature(object = "GenomicRatioSet"), function(object, ...) { object diff --git a/R/rgset.R b/R/rgset.R index 3a5792d..6be1e65 100644 --- a/R/rgset.R +++ b/R/rgset.R @@ -112,27 +112,32 @@ getNBeads <- function(object) { assay(object, "NBeads") } +setMethod("getSnpBeta", signature(object = "RGChannelSet"), + function (object) { + + .isRGOrStop(object) + + snpInfoII <- getProbeInfo(object, type="SnpII") + snpProbesII <- snpInfoII$Name + M.II <- getGreen(object)[snpInfoII$AddressA, , drop=FALSE] + U.II <- getRed(object)[snpInfoII$AddressA, , drop=FALSE] + + snpInfoI <- getProbeInfo(object, type="SnpI") + snpProbesI.Green <- snpInfoI[snpInfoI$Color=="Grn", , drop=FALSE] + M.I.Green <- getGreen(object)[snpProbesI.Green$AddressB, , drop=F] + U.I.Green <- getGreen(object)[snpProbesI.Green$AddressA, , drop=F] + snpProbesI.Red <- snpInfoI[snpInfoI$Color=="Red", , drop=FALSE] + M.I.Red <- getRed(object)[snpProbesI.Red$AddressB, , drop=FALSE] + U.I.Red <- getRed(object)[snpProbesI.Red$AddressA, , drop=FALSE] + + M <- rbind(M.II, M.I.Red, M.I.Green) + U <- rbind(U.II, U.I.Red, U.I.Green) + rsID <- c(snpProbesII, snpProbesI.Red$Name, snpProbesI.Green$Name) + rownames(M) <- rownames(U) <- rsID + beta <- M/(U+M+100) # I hate this. + return(beta) -getSnpBeta <- function(object){ - .isRGOrStop(object) - - snpProbesII <- getProbeInfo(object, type = "SnpII")$Name - M.II <- getGreen(object)[getProbeInfo(object, type = "SnpII")$AddressA,,drop=FALSE] - U.II <- getRed(object)[getProbeInfo(object, type = "SnpII")$AddressA,,drop=FALSE] - - snpProbesI.Green <- getProbeInfo(object, type = "SnpI")[getProbeInfo(object, type = "SnpI")$Color=="Grn",,drop=FALSE] - snpProbesI.Red <- getProbeInfo(object, type = "SnpI")[getProbeInfo(object, type = "SnpI")$Color=="Red",,drop=FALSE] - M.I.Red <- getRed(object)[snpProbesI.Red$AddressB,,drop=FALSE] - U.I.Red <- getRed(object)[snpProbesI.Red$AddressA,,drop=FALSE] - M.I.Green <- getGreen(object)[snpProbesI.Green$AddressB,,drop=FALSE] - U.I.Green <- getGreen(object)[snpProbesI.Green$AddressA,,drop=FALSE] - - M <- rbind(M.II, M.I.Red, M.I.Green) - U <- rbind(U.II, U.I.Red, U.I.Green) - rownames(M) <- rownames(U) <- c(snpProbesII, snpProbesI.Red$Name, snpProbesI.Green$Name) - beta <- M/(U+M+100) - return(beta) -} + }) setMethod("getManifest", signature(object = "RGChannelSet"), function(object) { diff --git a/R/tcgaPipeline.R b/R/tcgaPipeline.R new file mode 100644 index 0000000..788926e --- /dev/null +++ b/R/tcgaPipeline.R @@ -0,0 +1,12 @@ +# a "best practices" baseline for most studies +tcgaPipeline <- function(rgSet, pCutoff=0.05) { + pval <- detectionP(rgSet) + message("Running TCGA-style (noob) pipeline on ", ncol(rgSet), " samples...") + grSet <- ratioConvert(mapToGenome(preprocessNoob(rgSet))) + message("Masking probes with detection p-value > ", pCutoff, "...") + is.na(assays(grSet)$Beta) <- (pval[rownames(grSet),] >= pCutoff) + message("Placing SNP probe betas in metadata(grSet)$SNPs...") + metadata(grSet)$SNPs <- getSnpBeta(rgSet) + message("Done.") + return(grSet) +} diff --git a/inst/extData/GM12878.csv b/inst/extData/GM12878.csv new file mode 100644 index 0000000..08ea5d8 --- /dev/null +++ b/inst/extData/GM12878.csv @@ -0,0 +1,59 @@ +"","seqnames","start","end","width","strand","score","genotype","ref","alt","U","M" +"rs3936238","chr1",4031586,4031586,1,"*",3,"A:A","A","G","G","A" +"rs877309","chr1",11412265,11412265,1,"*",2,"A:G","A","G","A","G" +"rs213028","chr1",21524764,21524764,1,"*",2,"C:T","C","T","C","T" +"rs11249206","chr1",25150569,25150569,1,"*",1,"T:T","C","T","T","C" +"rs654498","chr1",81945636,81945636,1,"*",1,"C:C","C","T","C","T" +"rs715359","chr1",175697791,175697791,1,"*",1,"C:C","T","C","C","T" +"rs2804694","chr1",179598456,179598456,1,"*",1,"A:A","G","A","A","G" +"rs6426327","chr1",244156434,244156434,1,"*",1,"A:A","G","A","A","G" +"rs966367","chr2",12065671,12065671,1,"*",3,"C:C","C","T","T","C" +"rs4331560","chr2",48938177,48938177,1,"*",3,"G:G","A","G","A","G" +"rs1510480","chr2",60678945,60678945,1,"*",2,"A:G","G","A","A","G" +"rs6546473","chr2",69113861,69113861,1,"*",1,"A:A","A","G","A","G" +"rs264581","chr2",159679609,159679609,1,"*",1,"A:A","A","G","A","G" +"rs939290","chr3",14633870,14633870,1,"*",3,"C:C","T","C","T","C" +"rs9839873","chr3",86744845,86744845,1,"*",3,"C:C","T","C","T","C" +"rs1520670","chr3",100455616,100455616,1,"*",3,"A:A","A","G","G","A" +"rs10936224","chr3",162385225,162385225,1,"*",2,"A:G","G","A",NA,NA +"rs10033147","chr4",13966228,13966228,1,"*",1,NA,"A","G",NA,NA +"rs2125573","chr4",128827282,128827282,1,"*",2,"T:C","T","C",NA,NA +"rs7660805","chr4",131263500,131263500,1,"*",1,"A:A","G","A","A","G" +"rs9292570","chr5",35036069,35036069,1,"*",3,"C:C","T","C","T","C" +"rs348937","chr5",112853576,112853576,1,"*",1,"T:T","C","T","T","C" +"rs1019916","chr5",146628552,146628552,1,"*",2,"G:A","G","A",NA,NA +"rs7746156","chr6",47238778,47238778,1,"*",2,"C:T","T","C",NA,NA +"rs9363764","chr6",68288763,68288763,1,"*",2,"G:A","G","A",NA,NA +"rs10457834","chr6",149084493,149084493,1,"*",3,"C:C","T","C","T","C" +"rs6982811","chr8",36666471,36666471,1,"*",1,NA,"T","C",NA,NA +"rs1484127","chr8",51888207,51888207,1,"*",1,"A:A","G","A","A","G" +"rs6471533","chr8",96539476,96539476,1,"*",2,"G:A","G","A",NA,NA +"rs472920","chr8",96917920,96917920,1,"*",1,NA,"A","G",NA,NA +"rs6991394","chr8",121851583,121851583,1,"*",3,NA,"C","T",NA,NA +"rs2385226","chr8",126751178,126751178,1,"*",2,"T:C","C","T",NA,NA +"rs4742386","chr9",7701758,7701758,1,"*",1,NA,"A","G",NA,NA +"rs1040870","chr9",84605357,84605357,1,"*",2,"T:C","C","T",NA,NA +"rs10796216","chr10",14731895,14731895,1,"*",3,"A:A","A","G","G","A" +"rs10882854","chr10",98629087,98629087,1,"*",1,"C:C","T","C","C","T" +"rs11034952","chr11",38745191,38745191,1,"*",2,"C:T","T","C","C","T" +"rs1945975","chr11",104804934,104804934,1,"*",2,"T:C","C","A","A","C" +"rs2468330","chr12",41484989,41484989,1,"*",2,"A:G","G","A","A","G" +"rs1495031","chr12",61784403,61784403,1,"*",2,"T:C","C","T","T","C" +"rs951295","chr15",43787115,43787115,1,"*",1,"G:G","A","G","G","A" +"rs2959823","chr15",74200584,74200584,1,"*",2,"A:G","G","A","A","G" +"rs1510189","chr16",54658025,54658025,1,"*",1,"T:T","T","C","T","C" +"rs1941955","chr18",33416836,33416836,1,"*",1,"T:T","T","C","T","C" +"rs2235751","chr20",1917934,1917934,1,"*",2,"A:G","A","G","A","G" +"rs845016","chr21",32920155,32920155,1,"*",3,"C:C","C","T","T","C" +"rs2032088","chr21",37399200,37399200,1,"*",1,"A:A","G","A","A","G" +"rs1467387","chr22",24261372,24261372,1,"*",2,"T:C","T","C","T","C" +"rs133860","chr22",24474760,24474760,1,"*",2,"T:C","C","T","T","C" +"rs739259","chr22",25686579,25686579,1,"*",2,"A:G","A","G","A","G" +"rs2857639","chr22",28385674,28385674,1,"*",2,"A:G","A","G","A","G" +"rs2208123","chr22",46593476,46593476,1,"*",3,"A:A","A","G","G","A" +"rs2521373","chrX",9436990,9436990,1,"*",2,"A:G","A","G",NA,NA +"rs798149","chrX",15795352,15795352,1,"*",3,"G:G","A","G","A","G" +"rs5926356","chrX",28157252,28157252,1,"*",2,"A:G","A","G",NA,NA +"rs5987737","chrX",114616977,114616977,1,"*",1,NA,"T","C",NA,NA +"rs1416770","chrX",145026857,145026857,1,"*",2,"T:C","T","C",NA,NA +"rs6626309","chrX",147012065,147012065,1,"*",3,NA,"C","T",NA,NA diff --git a/inst/unitTests/test_tcgaPipeline.R b/inst/unitTests/test_tcgaPipeline.R new file mode 100644 index 0000000..d789d11 --- /dev/null +++ b/inst/unitTests/test_tcgaPipeline.R @@ -0,0 +1,7 @@ +test_tcgaPipeline <- function() { + stopifnot(require(minfiData)) + grSetEx <- tcgaPipeline(updateObject(RGsetEx)) + checkIdentical(getSnpBeta(grSetEx), getSnpBeta(updateObject(RGsetEx))) + checkIdentical(preprocessMethod(grSetEx), + c(mu.norm="Noob, dyeCorr=TRUE, dyeMethod=single")) +} diff --git a/man/preprocessNoob.Rd b/man/preprocessNoob.Rd index 79c5855..efdab02 100644 --- a/man/preprocessNoob.Rd +++ b/man/preprocessNoob.Rd @@ -19,7 +19,7 @@ preprocessNoob(rgSet, offset = 15, dyeCorr = TRUE, verbose = TRUE, \item{dyeMethod}{How should dye bias correction be done: use a single sample approach (ssNoob), or a reference array?} } \value{ - An object of class \code{MethylSet}. + For preprocessNoob, an object of class \code{MethylSet}. } \references{ TJ Triche, DJ Weisenberger, D Van Den Berg, PW Laird and KD Siegmund @@ -36,12 +36,14 @@ preprocessNoob(rgSet, offset = 15, dyeCorr = TRUE, verbose = TRUE, as well as \code{\linkS4class{IlluminaMethylationManifest}} for the basic classes involved in these functions. \code{\link{preprocessRaw}} and \code{\link{preprocessQuantile}} are other preprocessing functions. + \code{\link{tcgaPipeline}} for an end-to-end processing pipeline with results directly comparable to the Cancer Genome Atlas (TCGA) DNA methylation data. } \examples{ if (require(minfiData)) { ## RGsetEx.sub is a small subset of RGsetEx; ## only used for computational speed. MsetEx.sub.noob <- preprocessNoob(RGsetEx.sub) + grSet.sub.noob <- tcgaPipeline(RGsetEx.sub) } \dontrun{ if (require(minfiData)) { diff --git a/man/tcgaPipeline.Rd b/man/tcgaPipeline.Rd new file mode 100644 index 0000000..64a6bcb --- /dev/null +++ b/man/tcgaPipeline.Rd @@ -0,0 +1,53 @@ +\name{tcgaPipeline} +\alias{tcgaPipeline} +\alias{tcgaPipeline} +\title{ + A TCGA-style preprocessing pipeline for Infinium methylation microarrays. +} +\description{ + The standard DNAme processing pipeline for the Cancer Genome Atlas (TCGA) was + the methylumi implementation of preprocessNoob (which yields identical beta/M + values to the single-sample ssNoob implementation now standard in minfi, in + the common case where a balanced reference sample was identified in each TCGA + batch). Probes with detection p-values > 0.05 were masked as NA, and SNP (rs) + probes included on each array by Illumina for label swap detection tabulated. + After QC and certain masking steps (related to common SNPs and repetitive + regions) had been applied, the results were uploaded as Level 2 data for TCGA. + This function wraps the individual steps to turn an \code{RGChannelSet} into + a \code{GenomicRatioSet} directly comparable to canonical TCGA datasets, save + for the SNP/repeat masking steps (which, in hindsight, seem overzealous). The + resulting GenomicRatioSet is thus directly comparable to processed TCGA data, + and as of this writing (May 2017), no single preprocessing method (let alone + any single-sample preprocessing method) has emerged as superior for the + general case of mixed tumor and normal samples from varying tissues. Thus, + this approach may also serve as a useful benchmark for methods development. +} +\usage{ +tcgaPipeline(rgSet, pCutoff=0.05) +} +\arguments{ + \item{rgSet}{An object of class \code{RGChannelSet}.} + \item{pCutoff}{Above what detection pvalue should probes be marked NA? (0.05)} +} +\value{ + An object of class \code{GenomicRatioSet} with additional SNP metadata. +} +\references{ + TJ Triche, DJ Weisenberger, D Van Den Berg, PW Laird and KD Siegmund + \emph{Low-level processing of Illumina Infinium DNA Methylation + BeadArrays}. + Nucleic Acids Res (2013) 41, e90. + doi:\href{http://www.dx.doi.org/10.1093/nar/gkt090}{10.1093/nar/gkt090}. +} +\author{ + Tim Triche, Jr. +} +\seealso{ + \code{\linkS4class{RGChannelSet}} +} +\examples{ +if (require(minfiData)) { + grSetEx <- tcgaPipeline(RGsetEx) + stopifnot(identical(getSnpBeta(grSetEx), getSnpBeta(RGsetEx))) +} +}