From 56d9d48b4e6a69c3ee8c7fb02da1784ed666b4e6 Mon Sep 17 00:00:00 2001 From: sof202 Date: Thu, 30 Jan 2025 11:52:55 +0000 Subject: [PATCH 1/9] feat: add logic to parse a list of regions --- Python_Scripts/peak_compare.py | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/Python_Scripts/peak_compare.py b/Python_Scripts/peak_compare.py index 5b471a3..f69bc20 100644 --- a/Python_Scripts/peak_compare.py +++ b/Python_Scripts/peak_compare.py @@ -7,6 +7,20 @@ from IO import BedGraph, Bed +def list_type(arg): + try: + return [int(x) for x in arg.split(',')] + except ValueError: + raise argparse.ArgumentTypeError( + "Argument must be a comma-separated list of integers") + + +def process_regions(args: argparse.Namespace) -> list: + regions = [(chromosome, start, end) for chromosome, start, + end in zip(args.chromosome, args.start, args.end)] + return regions + + def main(args: argparse.Namespace) -> None: chromosome = args.chromosome start = args.start @@ -128,19 +142,21 @@ def main(args: argparse.Namespace) -> None: ) parser.add_argument( "chromosome", - help="The chromosome of the region you wish to inspect." + type=list_type, + help=("The chromosomes of the regions you wish to inspect. Comma " + "separated list.") ) parser.add_argument( "start", - type=int, - help=("The base pair position at the start of the region you wish to " - "inspect") + type=list_type, + help=("The base pair positions at the start of the regions you wish " + "to inspect. Comma separated list.") ) parser.add_argument( "end", - type=int, - help=("The base pair position at the end of the region you wish to " - "inspect") + type=list_type, + help=("The base pair positions at the end of the regions you wish to " + "inspect. Comma separated list") ) parser.add_argument( "reference_merged_peaks_file", @@ -178,4 +194,5 @@ def main(args: argparse.Namespace) -> None: help="The cutoff used to call peaks in the reference dataset." ) args = parser.parse_args() + regions = process_regions(args) main(args) From 962b4206fa00013b649fb6c678dfb4d5b2c99405 Mon Sep 17 00:00:00 2001 From: sof202 Date: Thu, 30 Jan 2025 12:22:06 +0000 Subject: [PATCH 2/9] feat: add multiprocessing to running I also got rid of the non parsable version as it is kinda useless and doesn't lend to this new code unless I add additional logic to handle the different regions considered. --- Python_Scripts/peak_compare.py | 91 +++++++++++++++++++++++----------- 1 file changed, 61 insertions(+), 30 deletions(-) diff --git a/Python_Scripts/peak_compare.py b/Python_Scripts/peak_compare.py index f69bc20..b0375df 100644 --- a/Python_Scripts/peak_compare.py +++ b/Python_Scripts/peak_compare.py @@ -5,6 +5,7 @@ from extract_region import extract_bedbase_region from label_peak_type import label_peak_type, convert_narrow_peak_to_bedbase from IO import BedGraph, Bed +from multiprocessing import Pool def list_type(arg): @@ -21,49 +22,58 @@ def process_regions(args: argparse.Namespace) -> list: return regions -def main(args: argparse.Namespace) -> None: - chromosome = args.chromosome - start = args.start - end = args.end - +def run(reference_merged_peaks, + reference_unmerged_peaks, + reference_bias_track, + reference_coverage_track, + comparison_bias_track, + comparison_coverage_track, + comparison_pvalue_track, + significance, + window_size, + cutoff, + unmerged, + chromosome, + start, + end) -> float: reference_merged_peaks = convert_narrow_peak_to_bedbase( - Bed.read_from_file(args.reference_merged_peaks_file), + reference_merged_peaks, chromosome, start, end ) reference_unmerged_peaks = convert_narrow_peak_to_bedbase( - Bed.read_from_file(args.reference_unmerged_peaks_file), + reference_unmerged_peaks, chromosome, start, end ) reference_bias_track = extract_bedbase_region( - BedGraph.read_from_file(args.reference_bias_track_file), + reference_bias_track, chromosome, start, end ) reference_coverage_track = extract_bedbase_region( - BedGraph.read_from_file(args.reference_coverage_track_file), + reference_coverage_track, chromosome, start, end ) comparison_bias_track = extract_bedbase_region( - BedGraph.read_from_file(args.comparison_bias_track_file), + comparison_bias_track, chromosome, start, end ) comparison_coverage_track = extract_bedbase_region( - BedGraph.read_from_file(args.comparison_coverage_track_file), + comparison_coverage_track, chromosome, start, end ) comparison_pvalue_track = extract_bedbase_region( - BedGraph.read_from_file(args.comparison_pvalue_file), + comparison_pvalue_track, chromosome, start, end @@ -75,14 +85,14 @@ def main(args: argparse.Namespace) -> None: reference_pvalue_ci = generate_pvalue_ci( reference_bias_track, reference_coverage_track, - args.significance, - args.window_size + significance, + window_size ) comparison_pvalue_ci = generate_pvalue_ci( comparison_bias_track, comparison_coverage_track, - args.significance, - args.window_size + significance, + window_size ) compared_pvalues = compare_pvalue_ci( reference_pvalue_ci, @@ -92,18 +102,46 @@ def main(args: argparse.Namespace) -> None: comparison_pvalue_track, compared_pvalues, reference_labelled_peaks, - args.cutoff + cutoff ) metric = calculate_metric( reference_labelled_peaks, pseudopeaks, - include_merged_peaks=(not args.unmerged) + include_merged_peaks=(not unmerged) ) - if args.parsable: - print(metric) - else: - print("The metric for these datasets over the range ", - f"{start} to {end} for chromosome {chromosome} is: {metric}.") + return metric + + +def main(args: argparse.Namespace, regions: list) -> None: + reference_merged_peaks = Bed.read_from_file( + args.reference_merged_peaks_file) + reference_unmerged_peaks = Bed.read_from_file( + args.reference_unmerged_peaks_file) + reference_bias_track = BedGraph.read_from_file( + args.reference_bias_track_file) + reference_coverage_track = BedGraph.read_from_file( + args.reference_coverage_track_file) + comparison_bias_track = BedGraph.read_from_file( + args.comparison_bias_track_file) + comparison_coverage_track = BedGraph.read_from_file( + args.comparison_coverage_track_file) + comparison_pvalue_track = BedGraph.read_from_file( + args.comparison_pvalue_file) + run_arguments = [ + ( + reference_merged_peaks, reference_unmerged_peaks, + reference_bias_track, reference_coverage_track, + comparison_bias_track, comparison_coverage_track, + comparison_pvalue_track, args.significance, args.window_size, + args.cutoff, args.unmerged, chromosome, start, end + ) + for chromosome, start, end in zip( + args.chromosome, args.start, args.end + ) + ] + with Pool() as pool: + metrics = pool.starmap(run, run_arguments) + print(metrics) if __name__ == "__main__": @@ -111,13 +149,6 @@ def main(args: argparse.Namespace) -> None: prog="PeakCompare", description="Determine how likely a peak is to be in two datasets." ) - parser.add_argument( - "-p", - "--parsable", - action="store_true", - help=("Set this if you want the output of the script to be computer ", - "parsable (and not human readable).") - ) parser.add_argument( "--unmerged", action="store_true", From 0b60a5196b5fcde3cd89388a56a68c53cb78f9da Mon Sep 17 00:00:00 2001 From: sof202 Date: Thu, 30 Jan 2025 12:24:32 +0000 Subject: [PATCH 3/9] revert: remove processing of regions (handled by main) --- Python_Scripts/peak_compare.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/Python_Scripts/peak_compare.py b/Python_Scripts/peak_compare.py index b0375df..08323e1 100644 --- a/Python_Scripts/peak_compare.py +++ b/Python_Scripts/peak_compare.py @@ -16,12 +16,6 @@ def list_type(arg): "Argument must be a comma-separated list of integers") -def process_regions(args: argparse.Namespace) -> list: - regions = [(chromosome, start, end) for chromosome, start, - end in zip(args.chromosome, args.start, args.end)] - return regions - - def run(reference_merged_peaks, reference_unmerged_peaks, reference_bias_track, @@ -225,5 +219,4 @@ def main(args: argparse.Namespace, regions: list) -> None: help="The cutoff used to call peaks in the reference dataset." ) args = parser.parse_args() - regions = process_regions(args) main(args) From a88ec91094e4737a8b21a15139569862f6f0a00b Mon Sep 17 00:00:00 2001 From: sof202 Date: Thu, 30 Jan 2025 12:28:39 +0000 Subject: [PATCH 4/9] docs: update config file to showcase multi region processing --- Setup/example_config_compare.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Setup/example_config_compare.txt b/Setup/example_config_compare.txt index e6c396c..c365861 100644 --- a/Setup/example_config_compare.txt +++ b/Setup/example_config_compare.txt @@ -2,12 +2,12 @@ # PARAMETERS # # ---------- # -# The region that you wish to inspect. It is recommended that you pick a region -# that is mainly characterised by peaks in your reference dataset. A larger -# region will result in slowdown. -CHROMOSOME=chr1 -START=500 -END=1000 +# The regions that you wish to inspect. It is recommended that you pick regions +# that are characterised by peaks in your reference dataset, though this is not +# a requirement. For multiple regions, supply a comma separated list. +CHROMOSOME=chr1,chr1,chr1 +START=500,1234,9999 +END=1000,1280,12345 # The cutoff that was used when peak calling with the reference dataset # Recall that this isn't the actual value, but the -log() value of the cutoff From 01ac2aae2e8be2a2b02e375ad9a85d986f425bd8 Mon Sep 17 00:00:00 2001 From: sof202 Date: Thu, 30 Jan 2025 12:31:31 +0000 Subject: [PATCH 5/9] docs: add types to args --- Python_Scripts/peak_compare.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/Python_Scripts/peak_compare.py b/Python_Scripts/peak_compare.py index 08323e1..a60d4dc 100644 --- a/Python_Scripts/peak_compare.py +++ b/Python_Scripts/peak_compare.py @@ -16,20 +16,20 @@ def list_type(arg): "Argument must be a comma-separated list of integers") -def run(reference_merged_peaks, - reference_unmerged_peaks, - reference_bias_track, - reference_coverage_track, - comparison_bias_track, - comparison_coverage_track, - comparison_pvalue_track, - significance, - window_size, - cutoff, - unmerged, - chromosome, - start, - end) -> float: +def run(reference_merged_peaks: Bed, + reference_unmerged_peaks: Bed, + reference_bias_track: BedGraph, + reference_coverage_track: BedGraph, + comparison_bias_track: BedGraph, + comparison_coverage_track: BedGraph, + comparison_pvalue_track: BedGraph, + significance: float, + window_size: int, + cutoff: float, + unmerged: bool, + chromosome: str, + start: int, + end: int) -> float: reference_merged_peaks = convert_narrow_peak_to_bedbase( reference_merged_peaks, chromosome, From 79a5d74619c6ab47080e34a9f37ac9ca95375f79 Mon Sep 17 00:00:00 2001 From: sof202 Date: Thu, 30 Jan 2025 12:34:12 +0000 Subject: [PATCH 6/9] fix: chromosome type conversion Previously chromosome was being converted to int, which doesn't work when the chromosome starts with "chr" (standard) --- Python_Scripts/peak_compare.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/Python_Scripts/peak_compare.py b/Python_Scripts/peak_compare.py index a60d4dc..3e6a4fa 100644 --- a/Python_Scripts/peak_compare.py +++ b/Python_Scripts/peak_compare.py @@ -8,7 +8,11 @@ from multiprocessing import Pool -def list_type(arg): +def string_list(arg): + return [x for x in arg.split(',')] + + +def integer_list(arg): try: return [int(x) for x in arg.split(',')] except ValueError: @@ -167,19 +171,19 @@ def main(args: argparse.Namespace, regions: list) -> None: ) parser.add_argument( "chromosome", - type=list_type, + type=string_list, help=("The chromosomes of the regions you wish to inspect. Comma " "separated list.") ) parser.add_argument( "start", - type=list_type, + type=integer_list, help=("The base pair positions at the start of the regions you wish " "to inspect. Comma separated list.") ) parser.add_argument( "end", - type=list_type, + type=integer_list, help=("The base pair positions at the end of the regions you wish to " "inspect. Comma separated list") ) From 877ae67504408a9b52068ee0461e597ddf1944d3 Mon Sep 17 00:00:00 2001 From: sof202 Date: Thu, 30 Jan 2025 12:48:34 +0000 Subject: [PATCH 7/9] fix: give context to order of results Due to parallelism, the order of inputs isn't necessarily going to be the order of results. So a bunch of numbers doesn't really give you any information. Instead, the region is printed alongside the metric. --- Python_Scripts/peak_compare.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Python_Scripts/peak_compare.py b/Python_Scripts/peak_compare.py index 3e6a4fa..f80a7be 100644 --- a/Python_Scripts/peak_compare.py +++ b/Python_Scripts/peak_compare.py @@ -107,7 +107,7 @@ def run(reference_merged_peaks: Bed, pseudopeaks, include_merged_peaks=(not unmerged) ) - return metric + return chromosome, start, end, metric def main(args: argparse.Namespace, regions: list) -> None: @@ -138,8 +138,8 @@ def main(args: argparse.Namespace, regions: list) -> None: ) ] with Pool() as pool: - metrics = pool.starmap(run, run_arguments) - print(metrics) + results = pool.starmap(run, run_arguments) + print(results) if __name__ == "__main__": From 546c714e12514dd62bff2523ba954d3509802bf1 Mon Sep 17 00:00:00 2001 From: sof202 Date: Thu, 30 Jan 2025 13:07:28 +0000 Subject: [PATCH 8/9] feat: nicer printing of results (tab separated table format) --- Python_Scripts/peak_compare.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Python_Scripts/peak_compare.py b/Python_Scripts/peak_compare.py index f80a7be..c7639fd 100644 --- a/Python_Scripts/peak_compare.py +++ b/Python_Scripts/peak_compare.py @@ -6,6 +6,8 @@ from label_peak_type import label_peak_type, convert_narrow_peak_to_bedbase from IO import BedGraph, Bed from multiprocessing import Pool +import pandas as pd +import sys def string_list(arg): @@ -138,8 +140,9 @@ def main(args: argparse.Namespace, regions: list) -> None: ) ] with Pool() as pool: - results = pool.starmap(run, run_arguments) - print(results) + results = pd.DataFrame(pool.starmap(run, run_arguments)) + results.to_csv(sys.stdout, header=False, index=False, + float_format='%.4f', sep="\t") if __name__ == "__main__": From df18f385e3ed176ab9de801dc4e08499b0d2153a Mon Sep 17 00:00:00 2001 From: sof202 Date: Thu, 30 Jan 2025 13:25:49 +0000 Subject: [PATCH 9/9] docs: add explaination for parallelism and script outputs --- docs/peak_comparing.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/docs/peak_comparing.md b/docs/peak_comparing.md index e238289..a588272 100644 --- a/docs/peak_comparing.md +++ b/docs/peak_comparing.md @@ -47,6 +47,20 @@ chromosome of interest before passing them to `peak_compare.py`. If you plan on running the python script interactively (or you are writing your own wrapper script), consider implementing this as well. +## Parallelism + +The python script here allows for inspecting multiple regions at once. Give +a comma separated list of chromosomes, start bases and end bases for each +region you want to inspect. The order of the results may not be in the same +order as your inputs. Because of this, the output of the script is 4 tab +separated columns. + +|Chromosome|Start|End|Metric| +|----------|-----|---|------| + +This is written directly to the standard output stream, so you can use basic +linux commands to work with it (awk, grep, `>`, *etc.*). + ## How the metric is calculated The metric is a simple ratio of the number of bases in psuedopeaks in the