From 8f261e637b273b45516c3a14a9b9b34c4767d901 Mon Sep 17 00:00:00 2001 From: kaanokay Date: Thu, 7 Dec 2023 16:12:06 -0500 Subject: [PATCH 1/4] PCA plot from BSseq object --- R/PCA.plot.R | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 R/PCA.plot.R diff --git a/R/PCA.plot.R b/R/PCA.plot.R new file mode 100644 index 0000000..5e58c00 --- /dev/null +++ b/R/PCA.plot.R @@ -0,0 +1,51 @@ +PCA <- function( + BSseq.obj, + CpG, + win_size, + nudge_x, + nudge_y, + size +) { + + # Filtering number of CpGs with 0 coverage in all samples + coverage <- getCoverage(BSseq.obj) + keep <- which(rowSums(coverage) !=0) + BSseq.obj.filtered <- BSseq.obj[keep,] + + # Tile the genome + BSseq.obj.tiled <- tile_by_windows(bs = BSseq.obj.filtered, win_size = win_size) + + # Filter out CpG tiles based on cutoff of number of CpGs + + BSseq.obj.tiled.filtered <- BSseq.obj.tiled[rowSums(as.matrix(countOverlaps(BSseq.obj.tiled, BSseq.obj.filtered))) > CpG, ] + + # Get raw methylation values + + BSseq.obj.methylation <- getMeth(BSseq.obj.tiled.filtered, type = "raw") + + # Filter tiles with NA methylation values + BSseq.obj.methylation.filtered <- BSseq.obj.methylation[rowSums(is.na(BSseq.obj.methylation)) == 0,] + + # Perform PCA + pca_data <- prcomp(t(BSseq.obj.methylation.filtered)) + pca_data_perc <- round(100 * pca_data$sdev^2 / sum(pca_data$sdev^2), 1) + df_pca_data <- data.frame( + PC1 = pca_data$x[, 1], + PC2 = pca_data$x[, 2]) + + pdf("PCA.pdf") + + # Plot PCA + p <- ggplot(df_pca_data, aes(PC1, PC2, label = row.names(df_pca_data))) + + geom_point() + + labs( + x = paste0("PC1 (", pca_data_perc[1], ")"), + y = paste0("PC2 (", pca_data_perc[2], ")") + ) + + geom_text(nudge_x = nudge_x, nudge_y = nudge_y, size = size) + + theme_classic() + + print(p) + + dev.off() +} From 68b753ab7cfeb4b91fb4614d8e5da4a5f636c256 Mon Sep 17 00:00:00 2001 From: Kaan Okay <42235353+kaanokay@users.noreply.github.com> Date: Tue, 19 Dec 2023 19:54:35 +0000 Subject: [PATCH 2/4] Update PCA.plot.R --- R/PCA.plot.R | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/R/PCA.plot.R b/R/PCA.plot.R index 5e58c00..3276a3c 100644 --- a/R/PCA.plot.R +++ b/R/PCA.plot.R @@ -1,30 +1,41 @@ PCA <- function( BSseq.obj, + genome, + tilewidth, + cut.last.tile.in.chrom, CpG, - win_size, nudge_x, nudge_y, size -) { +) + +{ + + # Tile the UCSC mm10 mouse genome + + mm10.mouse.genome.1kb.tiles <- GenomicRanges::tileGenome(seqinfo(genome), tilewidth = tilewidth, cut.last.tile.in.chrom = cut.last.tile.in.chrom) + + # Filtering number of CpGs with 0 coverage across all samples - # Filtering number of CpGs with 0 coverage in all samples coverage <- getCoverage(BSseq.obj) keep <- which(rowSums(coverage) !=0) BSseq.obj.filtered <- BSseq.obj[keep,] + + # Find which 1kb tile has more than a number of CpGs (default is 3) - # Tile the genome - BSseq.obj.tiled <- tile_by_windows(bs = BSseq.obj.filtered, win_size = win_size) - - # Filter out CpG tiles based on cutoff of number of CpGs + one.kb.tiled.genome.more.number.of.CpGs <- mm10.mouse.genome.1kb.tiles[which(GenomicRanges::countOverlaps(mm10.mouse.genome.1kb.tiles, BSseq.obj.filtered) >= CpG), ] + + # Find which CpG in BSseq.object overlap with 1kb tiled coordinates with more than number of CpGs and then keep those CpGs in the BSseq.obj - BSseq.obj.tiled.filtered <- BSseq.obj.tiled[rowSums(as.matrix(countOverlaps(BSseq.obj.tiled, BSseq.obj.filtered))) > CpG, ] + BSseq.obj.CpGs.within.1kb.tilled.genome <- BSseq.obj.filtered[as.data.frame(GenomicRanges::findOverlaps(BSseq.obj.filtered, one.kb.tiled.genome.more.number.of.CpGs))$queryHits, ] # Get raw methylation values - BSseq.obj.methylation <- getMeth(BSseq.obj.tiled.filtered, type = "raw") - + BSseq.obj.methylation <- getMeth(BSseq.obj.CpGs.within.1kb.tilled.genome, type = "raw") + # Filter tiles with NA methylation values - BSseq.obj.methylation.filtered <- BSseq.obj.methylation[rowSums(is.na(BSseq.obj.methylation)) == 0,] + + BSseq.obj.methylation.filtered <- BSseq.obj.methylation[rowSums(is.na(BSseq.obj.methylation)) == 0, ] # Perform PCA pca_data <- prcomp(t(BSseq.obj.methylation.filtered)) @@ -33,7 +44,7 @@ PCA <- function( PC1 = pca_data$x[, 1], PC2 = pca_data$x[, 2]) - pdf("PCA.pdf") + pdf("PCA.function.2.pdf") # Plot PCA p <- ggplot(df_pca_data, aes(PC1, PC2, label = row.names(df_pca_data))) + @@ -48,4 +59,5 @@ PCA <- function( print(p) dev.off() + } From d7b86d26f76f6af6456cf06412fb0e4d4ca15b92 Mon Sep 17 00:00:00 2001 From: Kaan Okay <42235353+kaanokay@users.noreply.github.com> Date: Tue, 19 Dec 2023 19:58:11 +0000 Subject: [PATCH 3/4] Rename PCA.plot.R to PCA.plot.v2.R --- R/{PCA.plot.R => PCA.plot.v2.R} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename R/{PCA.plot.R => PCA.plot.v2.R} (100%) diff --git a/R/PCA.plot.R b/R/PCA.plot.v2.R similarity index 100% rename from R/PCA.plot.R rename to R/PCA.plot.v2.R From 2c6467f4c073c762b72e3a21d04db18fd2ddab6f Mon Sep 17 00:00:00 2001 From: Kaan Okay <42235353+kaanokay@users.noreply.github.com> Date: Tue, 19 Dec 2023 19:59:08 +0000 Subject: [PATCH 4/4] Delete R/PCA.plot.v2.R --- R/PCA.plot.v2.R | 63 ------------------------------------------------- 1 file changed, 63 deletions(-) delete mode 100644 R/PCA.plot.v2.R diff --git a/R/PCA.plot.v2.R b/R/PCA.plot.v2.R deleted file mode 100644 index 3276a3c..0000000 --- a/R/PCA.plot.v2.R +++ /dev/null @@ -1,63 +0,0 @@ -PCA <- function( - BSseq.obj, - genome, - tilewidth, - cut.last.tile.in.chrom, - CpG, - nudge_x, - nudge_y, - size -) - -{ - - # Tile the UCSC mm10 mouse genome - - mm10.mouse.genome.1kb.tiles <- GenomicRanges::tileGenome(seqinfo(genome), tilewidth = tilewidth, cut.last.tile.in.chrom = cut.last.tile.in.chrom) - - # Filtering number of CpGs with 0 coverage across all samples - - coverage <- getCoverage(BSseq.obj) - keep <- which(rowSums(coverage) !=0) - BSseq.obj.filtered <- BSseq.obj[keep,] - - # Find which 1kb tile has more than a number of CpGs (default is 3) - - one.kb.tiled.genome.more.number.of.CpGs <- mm10.mouse.genome.1kb.tiles[which(GenomicRanges::countOverlaps(mm10.mouse.genome.1kb.tiles, BSseq.obj.filtered) >= CpG), ] - - # Find which CpG in BSseq.object overlap with 1kb tiled coordinates with more than number of CpGs and then keep those CpGs in the BSseq.obj - - BSseq.obj.CpGs.within.1kb.tilled.genome <- BSseq.obj.filtered[as.data.frame(GenomicRanges::findOverlaps(BSseq.obj.filtered, one.kb.tiled.genome.more.number.of.CpGs))$queryHits, ] - - # Get raw methylation values - - BSseq.obj.methylation <- getMeth(BSseq.obj.CpGs.within.1kb.tilled.genome, type = "raw") - - # Filter tiles with NA methylation values - - BSseq.obj.methylation.filtered <- BSseq.obj.methylation[rowSums(is.na(BSseq.obj.methylation)) == 0, ] - - # Perform PCA - pca_data <- prcomp(t(BSseq.obj.methylation.filtered)) - pca_data_perc <- round(100 * pca_data$sdev^2 / sum(pca_data$sdev^2), 1) - df_pca_data <- data.frame( - PC1 = pca_data$x[, 1], - PC2 = pca_data$x[, 2]) - - pdf("PCA.function.2.pdf") - - # Plot PCA - p <- ggplot(df_pca_data, aes(PC1, PC2, label = row.names(df_pca_data))) + - geom_point() + - labs( - x = paste0("PC1 (", pca_data_perc[1], ")"), - y = paste0("PC2 (", pca_data_perc[2], ")") - ) + - geom_text(nudge_x = nudge_x, nudge_y = nudge_y, size = size) + - theme_classic() - - print(p) - - dev.off() - -}