diff --git a/Python_Scripts/peak_compare.py b/Python_Scripts/peak_compare.py index 5b471a3..c7639fd 100644 --- a/Python_Scripts/peak_compare.py +++ b/Python_Scripts/peak_compare.py @@ -5,51 +5,75 @@ from extract_region import extract_bedbase_region from label_peak_type import label_peak_type, convert_narrow_peak_to_bedbase from IO import BedGraph, Bed +from multiprocessing import Pool +import pandas as pd +import sys -def main(args: argparse.Namespace) -> None: - chromosome = args.chromosome - start = args.start - end = args.end +def string_list(arg): + return [x for x in arg.split(',')] + +def integer_list(arg): + try: + return [int(x) for x in arg.split(',')] + except ValueError: + raise argparse.ArgumentTypeError( + "Argument must be a comma-separated list of integers") + + +def run(reference_merged_peaks: Bed, + reference_unmerged_peaks: Bed, + reference_bias_track: BedGraph, + reference_coverage_track: BedGraph, + comparison_bias_track: BedGraph, + comparison_coverage_track: BedGraph, + comparison_pvalue_track: BedGraph, + significance: float, + window_size: int, + cutoff: float, + unmerged: bool, + chromosome: str, + start: int, + end: int) -> float: reference_merged_peaks = convert_narrow_peak_to_bedbase( - Bed.read_from_file(args.reference_merged_peaks_file), + reference_merged_peaks, chromosome, start, end ) reference_unmerged_peaks = convert_narrow_peak_to_bedbase( - Bed.read_from_file(args.reference_unmerged_peaks_file), + reference_unmerged_peaks, chromosome, start, end ) reference_bias_track = extract_bedbase_region( - BedGraph.read_from_file(args.reference_bias_track_file), + reference_bias_track, chromosome, start, end ) reference_coverage_track = extract_bedbase_region( - BedGraph.read_from_file(args.reference_coverage_track_file), + reference_coverage_track, chromosome, start, end ) comparison_bias_track = extract_bedbase_region( - BedGraph.read_from_file(args.comparison_bias_track_file), + comparison_bias_track, chromosome, start, end ) comparison_coverage_track = extract_bedbase_region( - BedGraph.read_from_file(args.comparison_coverage_track_file), + comparison_coverage_track, chromosome, start, end ) comparison_pvalue_track = extract_bedbase_region( - BedGraph.read_from_file(args.comparison_pvalue_file), + comparison_pvalue_track, chromosome, start, end @@ -61,14 +85,14 @@ def main(args: argparse.Namespace) -> None: reference_pvalue_ci = generate_pvalue_ci( reference_bias_track, reference_coverage_track, - args.significance, - args.window_size + significance, + window_size ) comparison_pvalue_ci = generate_pvalue_ci( comparison_bias_track, comparison_coverage_track, - args.significance, - args.window_size + significance, + window_size ) compared_pvalues = compare_pvalue_ci( reference_pvalue_ci, @@ -78,18 +102,47 @@ def main(args: argparse.Namespace) -> None: comparison_pvalue_track, compared_pvalues, reference_labelled_peaks, - args.cutoff + cutoff ) metric = calculate_metric( reference_labelled_peaks, pseudopeaks, - include_merged_peaks=(not args.unmerged) + include_merged_peaks=(not unmerged) ) - if args.parsable: - print(metric) - else: - print("The metric for these datasets over the range ", - f"{start} to {end} for chromosome {chromosome} is: {metric}.") + return chromosome, start, end, metric + + +def main(args: argparse.Namespace, regions: list) -> None: + reference_merged_peaks = Bed.read_from_file( + args.reference_merged_peaks_file) + reference_unmerged_peaks = Bed.read_from_file( + args.reference_unmerged_peaks_file) + reference_bias_track = BedGraph.read_from_file( + args.reference_bias_track_file) + reference_coverage_track = BedGraph.read_from_file( + args.reference_coverage_track_file) + comparison_bias_track = BedGraph.read_from_file( + args.comparison_bias_track_file) + comparison_coverage_track = BedGraph.read_from_file( + args.comparison_coverage_track_file) + comparison_pvalue_track = BedGraph.read_from_file( + args.comparison_pvalue_file) + run_arguments = [ + ( + reference_merged_peaks, reference_unmerged_peaks, + reference_bias_track, reference_coverage_track, + comparison_bias_track, comparison_coverage_track, + comparison_pvalue_track, args.significance, args.window_size, + args.cutoff, args.unmerged, chromosome, start, end + ) + for chromosome, start, end in zip( + args.chromosome, args.start, args.end + ) + ] + with Pool() as pool: + results = pd.DataFrame(pool.starmap(run, run_arguments)) + results.to_csv(sys.stdout, header=False, index=False, + float_format='%.4f', sep="\t") if __name__ == "__main__": @@ -97,13 +150,6 @@ def main(args: argparse.Namespace) -> None: prog="PeakCompare", description="Determine how likely a peak is to be in two datasets." ) - parser.add_argument( - "-p", - "--parsable", - action="store_true", - help=("Set this if you want the output of the script to be computer ", - "parsable (and not human readable).") - ) parser.add_argument( "--unmerged", action="store_true", @@ -128,19 +174,21 @@ def main(args: argparse.Namespace) -> None: ) parser.add_argument( "chromosome", - help="The chromosome of the region you wish to inspect." + type=string_list, + help=("The chromosomes of the regions you wish to inspect. Comma " + "separated list.") ) parser.add_argument( "start", - type=int, - help=("The base pair position at the start of the region you wish to " - "inspect") + type=integer_list, + help=("The base pair positions at the start of the regions you wish " + "to inspect. Comma separated list.") ) parser.add_argument( "end", - type=int, - help=("The base pair position at the end of the region you wish to " - "inspect") + type=integer_list, + help=("The base pair positions at the end of the regions you wish to " + "inspect. Comma separated list") ) parser.add_argument( "reference_merged_peaks_file", diff --git a/Setup/example_config_compare.txt b/Setup/example_config_compare.txt index e6c396c..c365861 100644 --- a/Setup/example_config_compare.txt +++ b/Setup/example_config_compare.txt @@ -2,12 +2,12 @@ # PARAMETERS # # ---------- # -# The region that you wish to inspect. It is recommended that you pick a region -# that is mainly characterised by peaks in your reference dataset. A larger -# region will result in slowdown. -CHROMOSOME=chr1 -START=500 -END=1000 +# The regions that you wish to inspect. It is recommended that you pick regions +# that are characterised by peaks in your reference dataset, though this is not +# a requirement. For multiple regions, supply a comma separated list. +CHROMOSOME=chr1,chr1,chr1 +START=500,1234,9999 +END=1000,1280,12345 # The cutoff that was used when peak calling with the reference dataset # Recall that this isn't the actual value, but the -log() value of the cutoff diff --git a/docs/peak_comparing.md b/docs/peak_comparing.md index e238289..a588272 100644 --- a/docs/peak_comparing.md +++ b/docs/peak_comparing.md @@ -47,6 +47,20 @@ chromosome of interest before passing them to `peak_compare.py`. If you plan on running the python script interactively (or you are writing your own wrapper script), consider implementing this as well. +## Parallelism + +The python script here allows for inspecting multiple regions at once. Give +a comma separated list of chromosomes, start bases and end bases for each +region you want to inspect. The order of the results may not be in the same +order as your inputs. Because of this, the output of the script is 4 tab +separated columns. + +|Chromosome|Start|End|Metric| +|----------|-----|---|------| + +This is written directly to the standard output stream, so you can use basic +linux commands to work with it (awk, grep, `>`, *etc.*). + ## How the metric is calculated The metric is a simple ratio of the number of bases in psuedopeaks in the