From d19fa8c4ecdc294e1c75c7852ad8c413a487c206 Mon Sep 17 00:00:00 2001 From: Alexander Favorov Date: Wed, 16 Nov 2022 15:49:22 -0500 Subject: [PATCH 1/6] updated links to HMCan and GCCount in readme --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 66218ca..e7b5f78 100644 --- a/README.md +++ b/README.md @@ -17,8 +17,8 @@ Here is step-by-step instuctions how to detect super-enhancers in cancer data wi 1. Remove low quality reads (Q\<20 with `samtools view -q`) and remove duplicates from the BAM files ("samtools rmdup") `samtools view -u -q 20 $DATAIN/XXX.bam | samtools rmdup -s - $DATAOUT/$CL.K27ac.Q20.noDup.bam` `samtools view -u -q 20 $DATAIN/YYY.bam | samtools rmdup -s - $DATAOUT/$CL.Input.Q20.noDup.bam` -2. Call peaks with HMCan: http://www.cbrc.kaust.edu.sa/hmcan/ -By default, the size of genomic regions to perform copy number normalization is 100Kb. In case you have 30M reads in the input, you can decrease this size to 10Kb if you wish. It is mostly important if you expect to have some amplifications in your tumor genomes. In this case, you will need to generate a GC content profile for such windows (use [this tool](http://www.cbrc.kaust.edu.sa/hmcan/GCCount.tar.gz)) or kindly ask Haitham Ashoor to do it for you. +2. Call peaks with HMCan: https://github.com/BoevaLab/HMCan +By default, the size of genomic regions to perform copy number normalization is 100Kb. In case you have 30M reads in the input, you can decrease this size to 10Kb if you wish. It is mostly important if you expect to have some amplifications in your tumor genomes. In this case, you will need to generate a GC content profile for such windows (use [this tool](https://github.com/BoevaLab/HMCan/tree/master/Utils/GCCount)) or kindly ask Haitham Ashoor to do it for you. 3. [Optional: bigWig files can be created automatically later by `runLILY.R`] Create bigWig files (.bw) from HMCan density (.wig) files: `file=$CL.K27ac.wig` From 8a2e82f0401565abdbc2bdc33533271ef5c356f5 Mon Sep 17 00:00:00 2001 From: Alexander Favorov Date: Fri, 7 Apr 2023 00:19:55 -0400 Subject: [PATCH 2/6] added a test to run from a gui --- scripts/runLILY.R | 610 ++++++++++++++++++++++++---------------------- 1 file changed, 313 insertions(+), 297 deletions(-) diff --git a/scripts/runLILY.R b/scripts/runLILY.R index 3a666d0..1ead9e6 100644 --- a/scripts/runLILY.R +++ b/scripts/runLILY.R @@ -1,297 +1,313 @@ -#Copywrite Valentina Boeva, 2017 - -# >>> SOURCE LICENSE >>> -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation (www.fsf.org); either version 2 of the -# License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# A copy of the GNU General Public License is available at -# http://www.fsf.org/licensing/licenses -# -# >>> END OF LICENSE >>> - -################################################################################# -# to run: -# cat runLILY.R | R --slave --args SAMPLE_NAME OUTPUT_DIR Distance_toStitch distFromTSS transcriptome_GFF_file faiFileToCreateBigWig -################################################################################# - -args <- commandArgs() - -sampleName=args[4] -OUTPUT_DIR=args[5] -maxDistanceToStitch=as.numeric(args[6]) -distFromTSS=as.numeric(args[7]) -transcriptomeFile=args[8] -faiFileToCreateBigWig=args[9] - -cat (paste("..Working with sample",sampleName,"\n")) -cat (paste("..Will stitch enhancers at maximal distance of",maxDistanceToStitch,"\n")) -cat (paste("..Will consider promoter regions as regions around +-",distFromTSS,"from gene TSS\n")) - - -#################check that all files and directories exist #################### - -dir.create(OUTPUT_DIR, showWarnings = FALSE) -setwd(OUTPUT_DIR) -cat (paste("..Will write the output into ",OUTPUT_DIR,"\n")) - -if(!file.exists(transcriptomeFile)) { - cat ("Error:Please prodive a valid path to a file with transcriptome information\nFor example from here: https://github.com/linlabbcm/rose2/tree/master/rose2/annotation\nEXIT!\n") - quit() -} - -if(!file.exists(paste0(sampleName,"_peaks.narrowPeak"))) { - cat ("Error:Please prodive a valid path to output files of HMCan\n") - quit() -} - - -##################################################### - -suppressMessages(library(rtracklayer)) - -################# FUNCTIONS ######################### - - -import.bed3<- function(filename){ - peaks=read.table(filename) - return(GRanges(peaks[,1],IRanges(peaks[,2],peaks[,3]))) -} - -import.bed6<- function(filename){ - peaks=read.table(filename) - return(GRanges(peaks[,1],IRanges(peaks[,2],peaks[,3]),score=peaks[,5])) -} - -import.ucsc<- function(filename){ - peaks=read.table(filename,header = T,comment.char = "!") - peaks=GRanges(peaks$chrom,IRanges(peaks$txStart,peaks$txEnd),strand = peaks$strand) - return(peaks) -} - - -getThresholdOnPeakScore <- function (filename) { - dataTable <-read.table(filename, header=F); - lengths = dataTable$V3-dataTable$V2 - lengths = (lengths - 1)/50 + 1 - scores=dataTable$V5 - - tranch=200 - ascendingScores = rev(scores) - lengthForAscendingScores = rev(lengths) - - x = NULL - y = NULL - z=NULL - for (i in c(0:(floor(length(lengths)/tranch)-1))) { - tt=c((tranch*i+1):(tranch*i+tranch)) - m=mean(ascendingScores[tt]/lengthForAscendingScores[tt]) - s=sd(ascendingScores[tt]/lengthForAscendingScores[tt]) - x=c(x,max(ascendingScores[tt])) - y=c(y,m) - z=c(z,s) - } - - thresholdToReturn=x[min(which(y>=min(y[which(x>5)])))] - if (thresholdToReturn<2) {thresholdToReturn=2} #warning!!! - return(thresholdToReturn) -} - -output_peaks_density <- function(enhancersStitched,enhancers,promoters,bwFile,outputFile){ - - bedRegions = enhancersStitched - strand(bedRegions)="*"; - bedRegions$score=0; - densities = import.bw(bwFile) - gc() - overlaps=findOverlaps(bedRegions,densities) - ensids <- densities$score[subjectHits(overlaps)] - x <- tapply(ensids, queryHits(overlaps), sum) - bedRegions$score[unique(queryHits(overlaps))]=x - rm(overlaps) - rm(densities) - gc() - strand(bedRegions)="+"; - bedRegions=bedRegions[order(bedRegions$score,decreasing=T)] - cutoff=calculate_cutoff(bedRegions$score,F) - bedRegions$name="enhancer" - bedRegions$name[which(bedRegions$score>cutoff)]="SE" - - #unstitch enhancers: - strand(enhancers)="+";enhancers$score=0;enhancers$name="enhancer"; - tt1=which(countOverlaps(enhancers,bedRegions[which(bedRegions$name=="enhancer")])>0) - tt2=which(countOverlaps(bedRegions,enhancers)>0 & bedRegions$name=="enhancer") - - bedRegions=bedRegions[-tt2] - bedRegions=c(bedRegions,enhancers[tt1]) - - #find promoters intersecting with enhancers (not SEs) and call these enhancers: promoters - promoters$score=0 - promoters$name="promoter" - strand(promoters)="+" - - enh_promoters=c(promoters,bedRegions[which(bedRegions$name=="enhancer")]) - enh_promoters=resize(enh_promoters,20+width(enh_promoters)) - enh_promoters=reduce(enh_promoters) - enh_promoters=resize(enh_promoters,-20+width(enh_promoters)) - - tt1=which(countOverlaps(enh_promoters,promoters)>0) - tt2=which(countOverlaps(enh_promoters,promoters)==0) - largePromoters=enh_promoters[tt1] - largeEnh=enh_promoters[tt2] - - largePromoters$score=0;largePromoters$name="promoter" - largeEnh$score=0;largeEnh$name="enhancer" - tt1=which(countOverlaps(largeEnh,bedRegions[which(bedRegions$name=="enhancer")])>0) - tt2=which(countOverlaps(bedRegions,largeEnh)>0 & bedRegions$name=="enhancer") - bedRegions=bedRegions[-tt2] - bedRegions=c(bedRegions,largeEnh[tt1]) - - finalSet=c(bedRegions,largePromoters) - strand(finalSet)="*"; - finalSet$score=0; - densities = import.bw(bwFile) - gc() - overlaps=findOverlaps(finalSet,densities) - ensids <- densities$score[subjectHits(overlaps)] - x <- tapply(ensids, queryHits(overlaps), sum) - finalSet$score[unique(queryHits(overlaps))]=x - rm(overlaps) - rm(densities) - gc() - strand(finalSet)="+"; - - export.bed(finalSet,outputFile) -} - -#============================================================================ -#==============SUPER-ENHANCER CALLING AND PLOTTING FUNCTIONS================= -#====== from the original ROSE package developed by Charles Lin (c) ====== -#====== ROSE licence: http://younglab.wi.mit.edu/ROSE/LICENSE.txt ====== -#============================================================================ - -#this is an accessory function, that determines the number of points below a diagnoal passing through [x,yPt] -numPts_below_line <- function(myVector,slope,x){ - yPt <- myVector[x] - b <- yPt-(slope*x) - xPts <- 1:length(myVector) - return(sum(myVector<=(xPts*slope+b))) -} - -#This function calculates the cutoff by sliding a diagonal line and finding where it is tangential (or as close as possible) -calculate_cutoff <- function(inputVector, drawPlot=FALSE,...){ - print("this version will try to get more than 600 SEs") - t1=600 - - inputVector <- sort(inputVector) - inputVector[inputVector<0]<-0 #set those regions with more control than ranking equal to zero - - numberOfSE=0 - while (numberOfSEy_cutoff)) - inputVector=inputVector[-length(inputVector)] - } - - if(drawPlot){ #if TRUE, draw the plot - plot(1:length(inputVector), inputVector,type="l",...) - b <- y_cutoff-(slope* xPt) - abline(v= xPt,h= y_cutoff,lty=2,col=8) - points(xPt,y_cutoff,pch=16,cex=0.9,col=2) - abline(coef=c(b,slope),col=2) - title(paste("x=",xPt,"\ny=",signif(y_cutoff,3),"\nFold over Median=",signif(y_cutoff/median(inputVector),3),"x\nFold over Mean=",signif(y_cutoff/mean(inputVector),3),"x",sep="")) - axis(1,sum(inputVector==0),sum(inputVector==0),col.axis="pink",col="pink") #Number of regions with zero signal - } - return(y_cutoff) -} - -######## ######## ######## ######## ######## ######## ######## ######## ######## -######## select threshold based on Narrow peaks: - -threshold_score = getThresholdOnPeakScore(paste0(sampleName,"_peaks.narrowPeak")) -cat (paste("..Minimal peak score set to ",threshold_score,"\n")) - -####### read regions and select regions with high score: - -regionsToStitch=import.bed6(paste0(sampleName,"_regions.bed")) -tt=which(regionsToStitch$score>=threshold_score) -cat (paste("will select ",length(tt), " regions out of ",length(regionsToStitch)," initial peaks\n")) - -regionsToStitch=regionsToStitch[tt] - -######## read gene info and extract TSS: -if (substr(transcriptomeFile,start = nchar(transcriptomeFile)-2,nchar(transcriptomeFile))=="gff") { - transcripts=import(transcriptomeFile) -} -if (substr(transcriptomeFile,start = nchar(transcriptomeFile)-3,nchar(transcriptomeFile))=="ucsc"){ - transcripts=import.ucsc(transcriptomeFile) -} - -cat(paste("..Read file",transcriptomeFile,"\n")) - -TSSRegions=GRanges(chrom(transcripts),IRanges(start(transcripts)-distFromTSS,start(transcripts)+distFromTSS-1)) -tt=which(strand(transcripts)=='-') -TSSRegions[tt]=GRanges(chrom(transcripts[tt]),IRanges(end(transcripts[tt])-distFromTSS,end(transcripts[tt])+distFromTSS-1)) - -cat(paste("..Created",length(TSSRegions),"TSS regions\n")) - -########## remove TSS peaks from enhancers: -enhancers=setdiff(regionsToStitch,TSSRegions,ignore.strand=T) -promoters=intersect(regionsToStitch,TSSRegions,ignore.strand=T) - -########## stitch enhancers: -enhancersStitched=resize(enhancers,maxDistanceToStitch+width(enhancers)) -enhancersStitched=reduce(enhancersStitched) -enhancersStitched=resize(enhancersStitched,-maxDistanceToStitch+width(enhancersStitched)) - -cat(paste("..Created",length(enhancersStitched),"stitched regions\n")) -#export(enhancersStitched,paste0(sampleName,".stitched_regions.bed")) - -######### create big wig file out of wig if .bw does not exists ###### -hasBW=F -bwFile=paste0(sampleName,".wig.bw") -if(file.exists(bwFile)) { - hasBW = TRUE -} - -if (!hasBW) { - bwFile=paste0(sampleName,".bw") - if(file.exists(bwFile)) { - hasBW = TRUE - } -} - -if (!hasBW) { - cat ("Will create bw file using 'wigToBigWig'\nCheck what wigToBigWig is added to your PATH!\nEx: PATH=$PATH:/usr/local/bin/ucsc_tools/wigToBigWig\n") - outSys=system(paste("wigToBigWig -clip", paste0(sampleName,".wig"), faiFileToCreateBigWig, paste0(sampleName,".bw"))) - bwFile=paste0(sampleName,".bw") - if (outSys==0) { - cat (paste(bwFile,"was created\n")) - } - if(file.exists(bwFile)) { - hasBW = TRUE - }else { - cat(paste0("Could not create bigwig file out of ",paste0(sampleName,".wig"),"\nPlease create this file yourself and rerun\n")) - cat (paste0("Ex: wigToBigWig -clip ", paste0(sampleName,".wig"), " YOUR_PATH_TO/Human/hg19/hg19.fa.fai ", paste0(sampleName,".bw\n"))) - cat ("wigToBigWig can be downloaded here: http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/\n") - quit() - } -} - -outputFile=paste0(sampleName,".scores.bed") -outputFile=basename(outputFile) -cat (paste("..Printing SEs, enhancers and promoters with their scores into",outputFile,"\n")) -output_peaks_density(enhancersStitched,enhancers,promoters,bwFile,outputFile) - - - +#Copywrite Valentina Boeva, 2017 + +# >>> SOURCE LICENSE >>> +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation (www.fsf.org); either version 2 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# A copy of the GNU General Public License is available at +# http://www.fsf.org/licensing/licenses +# +# >>> END OF LICENSE >>> + +################################################################################# +# to run: +# cat runLILY.R | R --slave --args SAMPLE_NAME OUTPUT_DIR Distance_toStitch distFromTSS transcriptome_GFF_file faiFileToCreateBigWig +################################################################################# + +cl_args <- commandArgs() + +if( + sum(grep("RStudio",cl_args,ignore.case = TRUE))>0 || #rstudio + sum(grep("RGui",cl_args,ignore.case = TRUE))>0 #rgui +){ + + if (sum(c("sampleName", + "OUTPUT_DIR", + "maxDistanceToStitch", + "distFromTSS", + "transcriptomeFile", + "faiFileToCreateBigWig") %in% ls())<6){ + stop("We have started from a GUI and we cannot use args.\n", + "Please, define sampleName, OUTPUT_DIR, maxDistanceToStitch, distFromTSS,\n", + "transcriptomeFile, and faiFileToCreateBigWig before we start.\n") + } +} else { #Rscript cli run. Let's parse + sampleName=cl_args[4] + OUTPUT_DIR=cl_args[5] + maxDistanceToStitch=as.numeric(cl_args[6]) + distFromTSS=as.numeric(cl_args[7]) + transcriptomeFile=cl_args[8] + faiFileToCreateBigWig=cl_args[9] +} +cat (paste("..Working with sample",sampleName,"\n")) +cat (paste("..Will stitch enhancers at maximal distance of",maxDistanceToStitch,"\n")) +cat (paste("..Will consider promoter regions as regions around +-",distFromTSS,"from gene TSS\n")) + + +#################check that all files and directories exist #################### + +dir.create(OUTPUT_DIR, showWarnings = FALSE) +setwd(OUTPUT_DIR) +cat (paste("..Will write the output into ",OUTPUT_DIR,"\n")) + +if(!file.exists(transcriptomeFile)) { + cat ("Error:Please prodive a valid path to a file with transcriptome information\nFor example from here: https://github.com/linlabbcm/rose2/tree/master/rose2/annotation\nEXIT!\n") + quit() +} + +if(!file.exists(paste0(sampleName,"_peaks.narrowPeak"))) { + cat ("Error:Please prodive a valid path to output files of HMCan\n") + quit() +} + + +##################################################### + +suppressMessages(library(rtracklayer)) + +################# FUNCTIONS ######################### + + +import.bed3<- function(filename){ + peaks=read.table(filename) + return(GRanges(peaks[,1],IRanges(peaks[,2],peaks[,3]))) +} + +import.bed6<- function(filename){ + peaks=read.table(filename) + return(GRanges(peaks[,1],IRanges(peaks[,2],peaks[,3]),score=peaks[,5])) +} + +import.ucsc<- function(filename){ + peaks=read.table(filename,header = T,comment.char = "!") + peaks=GRanges(peaks$chrom,IRanges(peaks$txStart,peaks$txEnd),strand = peaks$strand) + return(peaks) +} + + +getThresholdOnPeakScore <- function (filename) { + dataTable <-read.table(filename, header=F); + lengths = dataTable$V3-dataTable$V2 + lengths = (lengths - 1)/50 + 1 + scores=dataTable$V5 + + tranch=200 + ascendingScores = rev(scores) + lengthForAscendingScores = rev(lengths) + + x = NULL + y = NULL + z=NULL + for (i in c(0:(floor(length(lengths)/tranch)-1))) { + tt=c((tranch*i+1):(tranch*i+tranch)) + m=mean(ascendingScores[tt]/lengthForAscendingScores[tt]) + s=sd(ascendingScores[tt]/lengthForAscendingScores[tt]) + x=c(x,max(ascendingScores[tt])) + y=c(y,m) + z=c(z,s) + } + + thresholdToReturn=x[min(which(y>=min(y[which(x>5)])))] + if (thresholdToReturn<2) {thresholdToReturn=2} #warning!!! + return(thresholdToReturn) +} + +output_peaks_density <- function(enhancersStitched,enhancers,promoters,bwFile,outputFile){ + + bedRegions = enhancersStitched + strand(bedRegions)="*"; + bedRegions$score=0; + densities = import.bw(bwFile) + gc() + overlaps=findOverlaps(bedRegions,densities) + ensids <- densities$score[subjectHits(overlaps)] + x <- tapply(ensids, queryHits(overlaps), sum) + bedRegions$score[unique(queryHits(overlaps))]=x + rm(overlaps) + rm(densities) + gc() + strand(bedRegions)="+"; + bedRegions=bedRegions[order(bedRegions$score,decreasing=T)] + cutoff=calculate_cutoff(bedRegions$score,F) + bedRegions$name="enhancer" + bedRegions$name[which(bedRegions$score>cutoff)]="SE" + + #unstitch enhancers: + strand(enhancers)="+";enhancers$score=0;enhancers$name="enhancer"; + tt1=which(countOverlaps(enhancers,bedRegions[which(bedRegions$name=="enhancer")])>0) + tt2=which(countOverlaps(bedRegions,enhancers)>0 & bedRegions$name=="enhancer") + + bedRegions=bedRegions[-tt2] + bedRegions=c(bedRegions,enhancers[tt1]) + + #find promoters intersecting with enhancers (not SEs) and call these enhancers: promoters + promoters$score=0 + promoters$name="promoter" + strand(promoters)="+" + + enh_promoters=c(promoters,bedRegions[which(bedRegions$name=="enhancer")]) + enh_promoters=resize(enh_promoters,20+width(enh_promoters)) + enh_promoters=reduce(enh_promoters) + enh_promoters=resize(enh_promoters,-20+width(enh_promoters)) + + tt1=which(countOverlaps(enh_promoters,promoters)>0) + tt2=which(countOverlaps(enh_promoters,promoters)==0) + largePromoters=enh_promoters[tt1] + largeEnh=enh_promoters[tt2] + + largePromoters$score=0;largePromoters$name="promoter" + largeEnh$score=0;largeEnh$name="enhancer" + tt1=which(countOverlaps(largeEnh,bedRegions[which(bedRegions$name=="enhancer")])>0) + tt2=which(countOverlaps(bedRegions,largeEnh)>0 & bedRegions$name=="enhancer") + bedRegions=bedRegions[-tt2] + bedRegions=c(bedRegions,largeEnh[tt1]) + + finalSet=c(bedRegions,largePromoters) + strand(finalSet)="*"; + finalSet$score=0; + densities = import.bw(bwFile) + gc() + overlaps=findOverlaps(finalSet,densities) + ensids <- densities$score[subjectHits(overlaps)] + x <- tapply(ensids, queryHits(overlaps), sum) + finalSet$score[unique(queryHits(overlaps))]=x + rm(overlaps) + rm(densities) + gc() + strand(finalSet)="+"; + + export.bed(finalSet,outputFile) +} + +#============================================================================ +#==============SUPER-ENHANCER CALLING AND PLOTTING FUNCTIONS================= +#====== from the original ROSE package developed by Charles Lin (c) ====== +#====== ROSE licence: http://younglab.wi.mit.edu/ROSE/LICENSE.txt ====== +#============================================================================ + +#this is an accessory function, that determines the number of points below a diagnoal passing through [x,yPt] +numPts_below_line <- function(myVector,slope,x){ + yPt <- myVector[x] + b <- yPt-(slope*x) + xPts <- 1:length(myVector) + return(sum(myVector<=(xPts*slope+b))) +} + +#This function calculates the cutoff by sliding a diagonal line and finding where it is tangential (or as close as possible) +calculate_cutoff <- function(inputVector, drawPlot=FALSE,...){ + print("this version will try to get more than 600 SEs") + t1=600 + + inputVector <- sort(inputVector) + inputVector[inputVector<0]<-0 #set those regions with more control than ranking equal to zero + + numberOfSE=0 + while (numberOfSEy_cutoff)) + inputVector=inputVector[-length(inputVector)] + } + + if(drawPlot){ #if TRUE, draw the plot + plot(1:length(inputVector), inputVector,type="l",...) + b <- y_cutoff-(slope* xPt) + abline(v= xPt,h= y_cutoff,lty=2,col=8) + points(xPt,y_cutoff,pch=16,cex=0.9,col=2) + abline(coef=c(b,slope),col=2) + title(paste("x=",xPt,"\ny=",signif(y_cutoff,3),"\nFold over Median=",signif(y_cutoff/median(inputVector),3),"x\nFold over Mean=",signif(y_cutoff/mean(inputVector),3),"x",sep="")) + axis(1,sum(inputVector==0),sum(inputVector==0),col.axis="pink",col="pink") #Number of regions with zero signal + } + return(y_cutoff) +} + +######## ######## ######## ######## ######## ######## ######## ######## ######## +######## select threshold based on Narrow peaks: + +threshold_score = getThresholdOnPeakScore(paste0(sampleName,"_peaks.narrowPeak")) +cat (paste("..Minimal peak score set to ",threshold_score,"\n")) + +####### read regions and select regions with high score: + +regionsToStitch=import.bed6(paste0(sampleName,"_regions.bed")) +tt=which(regionsToStitch$score>=threshold_score) +cat (paste("will select ",length(tt), " regions out of ",length(regionsToStitch)," initial peaks\n")) + +regionsToStitch=regionsToStitch[tt] + +######## read gene info and extract TSS: +if (substr(transcriptomeFile,start = nchar(transcriptomeFile)-2,nchar(transcriptomeFile))=="gff") { + transcripts=import(transcriptomeFile) +} +if (substr(transcriptomeFile,start = nchar(transcriptomeFile)-3,nchar(transcriptomeFile))=="ucsc"){ + transcripts=import.ucsc(transcriptomeFile) +} + +cat(paste("..Read file",transcriptomeFile,"\n")) + +TSSRegions=GRanges(chrom(transcripts),IRanges(start(transcripts)-distFromTSS,start(transcripts)+distFromTSS-1)) +tt=which(strand(transcripts)=='-') +TSSRegions[tt]=GRanges(chrom(transcripts[tt]),IRanges(end(transcripts[tt])-distFromTSS,end(transcripts[tt])+distFromTSS-1)) + +cat(paste("..Created",length(TSSRegions),"TSS regions\n")) + +########## remove TSS peaks from enhancers: +enhancers=setdiff(regionsToStitch,TSSRegions,ignore.strand=T) +promoters=intersect(regionsToStitch,TSSRegions,ignore.strand=T) + +########## stitch enhancers: +enhancersStitched=resize(enhancers,maxDistanceToStitch+width(enhancers)) +enhancersStitched=reduce(enhancersStitched) +enhancersStitched=resize(enhancersStitched,-maxDistanceToStitch+width(enhancersStitched)) + +cat(paste("..Created",length(enhancersStitched),"stitched regions\n")) +#export(enhancersStitched,paste0(sampleName,".stitched_regions.bed")) + +######### create big wig file out of wig if .bw does not exists ###### +hasBW=F +bwFile=paste0(sampleName,".wig.bw") +if(file.exists(bwFile)) { + hasBW = TRUE +} + +if (!hasBW) { + bwFile=paste0(sampleName,".bw") + if(file.exists(bwFile)) { + hasBW = TRUE + } +} + +if (!hasBW) { + cat ("Will create bw file using 'wigToBigWig'\nCheck what wigToBigWig is added to your PATH!\nEx: PATH=$PATH:/usr/local/bin/ucsc_tools/wigToBigWig\n") + outSys=system(paste("wigToBigWig -clip", paste0(sampleName,".wig"), faiFileToCreateBigWig, paste0(sampleName,".bw"))) + bwFile=paste0(sampleName,".bw") + if (outSys==0) { + cat (paste(bwFile,"was created\n")) + } + if(file.exists(bwFile)) { + hasBW = TRUE + }else { + cat(paste0("Could not create bigwig file out of ",paste0(sampleName,".wig"),"\nPlease create this file yourself and rerun\n")) + cat (paste0("Ex: wigToBigWig -clip ", paste0(sampleName,".wig"), " YOUR_PATH_TO/Human/hg19/hg19.fa.fai ", paste0(sampleName,".bw\n"))) + cat ("wigToBigWig can be downloaded here: http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/\n") + quit() + } +} + +outputFile=paste0(sampleName,".scores.bed") +outputFile=basename(outputFile) +cat (paste("..Printing SEs, enhancers and promoters with their scores into",outputFile,"\n")) +output_peaks_density(enhancersStitched,enhancers,promoters,bwFile,outputFile) + + + From 2a087083461e5011b8ca6f9f7fdf6a8a7628d763 Mon Sep 17 00:00:00 2001 From: Alexander Favorov Date: Fri, 7 Apr 2023 00:52:08 -0400 Subject: [PATCH 3/6] Fixed the interaction btw setwd and relative file names --- scripts/runLILY.R | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/scripts/runLILY.R b/scripts/runLILY.R index 1ead9e6..c751313 100644 --- a/scripts/runLILY.R +++ b/scripts/runLILY.R @@ -53,21 +53,29 @@ cat (paste("..Will consider promoter regions as regions around +-",distFromTSS," #################check that all files and directories exist #################### +if(!file.exists(transcriptomeFile)) { + stop("Error: I cannot find the file ",transcriptomeFile,"\n", + "Please prodive a valid path to a file with transcriptome information\n", + "For example from here: https://github.com/linlabbcm/rose2/tree/master/rose2/annotation\nEXIT!\n") +} else {transcriptomeFile<-normalizePath(transcriptomeFile)} +#we need to normalise the path not to loose it after setwd() + +npkname<-paste0(sampleName,"_peaks.narrowPeak") +if(!file.exists(npkname)) { + stop("Error: I cannot find the file ",npkname,".\n", + "Please prodive a valid path to output files of HMCan\n") +} else {sampleName<-suppressWarnings(normalizePath(sampleName))} +#we need to normalise the path not to loose it after setwd() +#we supress warnings, the sample names is not a existing file name, +#it is a common prefix + +faiFileToCreateBigWig<-normalizePath(faiFileToCreateBigWig) +#we need to normalise the path not to loose it after setwd() + dir.create(OUTPUT_DIR, showWarnings = FALSE) setwd(OUTPUT_DIR) cat (paste("..Will write the output into ",OUTPUT_DIR,"\n")) -if(!file.exists(transcriptomeFile)) { - cat ("Error:Please prodive a valid path to a file with transcriptome information\nFor example from here: https://github.com/linlabbcm/rose2/tree/master/rose2/annotation\nEXIT!\n") - quit() -} - -if(!file.exists(paste0(sampleName,"_peaks.narrowPeak"))) { - cat ("Error:Please prodive a valid path to output files of HMCan\n") - quit() -} - - ##################################################### suppressMessages(library(rtracklayer)) From abd0351ba9d9d06c30237f5205fa4ec37eabbd93 Mon Sep 17 00:00:00 2001 From: Sasha Favorov Date: Sat, 8 Apr 2023 19:43:42 -0400 Subject: [PATCH 4/6] to normalizePath on mac, we need an existing file name --- scripts/runLILY.R | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/runLILY.R b/scripts/runLILY.R index c751313..c2ad90d 100644 --- a/scripts/runLILY.R +++ b/scripts/runLILY.R @@ -64,7 +64,10 @@ npkname<-paste0(sampleName,"_peaks.narrowPeak") if(!file.exists(npkname)) { stop("Error: I cannot find the file ",npkname,".\n", "Please prodive a valid path to output files of HMCan\n") -} else {sampleName<-suppressWarnings(normalizePath(sampleName))} +} else { + npkname<-normalizePath(npkname) #to normalize, we use the real file name + sampleName<-substr(npkname,1,nchar(npkname)-17) #and then, we cut it back +} #we need to normalise the path not to loose it after setwd() #we supress warnings, the sample names is not a existing file name, #it is a common prefix From 83b3c31efdf21c905203afd34f11a7b4cfb9d2d2 Mon Sep 17 00:00:00 2001 From: Alexander Favorov Date: Mon, 10 Apr 2023 02:08:12 -0400 Subject: [PATCH 5/6] fixed the bug when spript died if there were not enough intervals to get 600 SEs --- scripts/runLILY.R | 38 ++++++++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/scripts/runLILY.R b/scripts/runLILY.R index c2ad90d..eaaee27 100644 --- a/scripts/runLILY.R +++ b/scripts/runLILY.R @@ -64,10 +64,7 @@ npkname<-paste0(sampleName,"_peaks.narrowPeak") if(!file.exists(npkname)) { stop("Error: I cannot find the file ",npkname,".\n", "Please prodive a valid path to output files of HMCan\n") -} else { - npkname<-normalizePath(npkname) #to normalize, we use the real file name - sampleName<-substr(npkname,1,nchar(npkname)-17) #and then, we cut it back -} +} else {sampleName<-suppressWarnings(normalizePath(sampleName))} #we need to normalise the path not to loose it after setwd() #we supress warnings, the sample names is not a existing file name, #it is a common prefix @@ -145,7 +142,7 @@ output_peaks_density <- function(enhancersStitched,enhancers,promoters,bwFile,ou rm(densities) gc() strand(bedRegions)="+"; - bedRegions=bedRegions[order(bedRegions$score,decreasing=T)] + bedRegions<<-bedRegions[order(bedRegions$score,decreasing=T)] cutoff=calculate_cutoff(bedRegions$score,F) bedRegions$name="enhancer" bedRegions$name[which(bedRegions$score>cutoff)]="SE" @@ -215,11 +212,22 @@ numPts_below_line <- function(myVector,slope,x){ calculate_cutoff <- function(inputVector, drawPlot=FALSE,...){ print("this version will try to get more than 600 SEs") t1=600 + #hadrcoded 600. inputVector <- sort(inputVector) inputVector[inputVector<0]<-0 #set those regions with more control than ranking equal to zero - numberOfSE=0 + numberOfSE<-0 + max_numberOfSE<-0 + max_y_cutoff<-0 + max_slope<-0 + max_xPt<-0 + max_iv_len<-0 + #in this cycle, if we get t1 SE-s we are happy + #if not, we cycle until the best SE number is more than the length of the input, which length decreases every step + #if it is not more, we did out bast and exit + + iv<-inputVector #backup while (numberOfSEy_cutoff)) + + if(max_numberOfSE Date: Mon, 10 Apr 2023 02:11:49 -0400 Subject: [PATCH 6/6] returns to original work dir, it is useful if we run from r/rgui/rstudio --- scripts/runLILY.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/runLILY.R b/scripts/runLILY.R index eaaee27..ef24568 100644 --- a/scripts/runLILY.R +++ b/scripts/runLILY.R @@ -73,6 +73,7 @@ faiFileToCreateBigWig<-normalizePath(faiFileToCreateBigWig) #we need to normalise the path not to loose it after setwd() dir.create(OUTPUT_DIR, showWarnings = FALSE) +popd<-getwd() setwd(OUTPUT_DIR) cat (paste("..Will write the output into ",OUTPUT_DIR,"\n")) @@ -346,5 +347,5 @@ outputFile=basename(outputFile) cat (paste("..Printing SEs, enhancers and promoters with their scores into",outputFile,"\n")) output_peaks_density(enhancersStitched,enhancers,promoters,bwFile,outputFile) - +setwd(popd)