From 56d9d48b4e6a69c3ee8c7fb02da1784ed666b4e6 Mon Sep 17 00:00:00 2001
From: sof202 <s.o.fletcher@exeter.ac.uk>
Date: Thu, 30 Jan 2025 11:52:55 +0000
Subject: [PATCH 1/9] feat: add logic to parse a list of regions

---
 Python_Scripts/peak_compare.py | 31 ++++++++++++++++++++++++-------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/Python_Scripts/peak_compare.py b/Python_Scripts/peak_compare.py
index 5b471a3..f69bc20 100644
--- a/Python_Scripts/peak_compare.py
+++ b/Python_Scripts/peak_compare.py
@@ -7,6 +7,20 @@
 from IO import BedGraph, Bed
 
 
+def list_type(arg):
+    try:
+        return [int(x) for x in arg.split(',')]
+    except ValueError:
+        raise argparse.ArgumentTypeError(
+            "Argument must be a comma-separated list of integers")
+
+
+def process_regions(args: argparse.Namespace) -> list:
+    regions = [(chromosome, start, end) for chromosome, start,
+               end in zip(args.chromosome, args.start, args.end)]
+    return regions
+
+
 def main(args: argparse.Namespace) -> None:
     chromosome = args.chromosome
     start = args.start
@@ -128,19 +142,21 @@ def main(args: argparse.Namespace) -> None:
     )
     parser.add_argument(
         "chromosome",
-        help="The chromosome of the region you wish to inspect."
+        type=list_type,
+        help=("The chromosomes of the regions you wish to inspect. Comma "
+              "separated list.")
     )
     parser.add_argument(
         "start",
-        type=int,
-        help=("The base pair position at the start of the region you wish to "
-              "inspect")
+        type=list_type,
+        help=("The base pair positions at the start of the regions you wish "
+              "to inspect. Comma separated list.")
     )
     parser.add_argument(
         "end",
-        type=int,
-        help=("The base pair position at the end of the region you wish to "
-              "inspect")
+        type=list_type,
+        help=("The base pair positions at the end of the regions you wish to "
+              "inspect. Comma separated list")
     )
     parser.add_argument(
         "reference_merged_peaks_file",
@@ -178,4 +194,5 @@ def main(args: argparse.Namespace) -> None:
         help="The cutoff used to call peaks in the reference dataset."
     )
     args = parser.parse_args()
+    regions = process_regions(args)
     main(args)

From 962b4206fa00013b649fb6c678dfb4d5b2c99405 Mon Sep 17 00:00:00 2001
From: sof202 <s.o.fletcher@exeter.ac.uk>
Date: Thu, 30 Jan 2025 12:22:06 +0000
Subject: [PATCH 2/9] feat: add multiprocessing to running

I also got rid of the non parsable version as it is kinda useless and
doesn't lend to this new code unless I add additional logic to handle
the different regions considered.
---
 Python_Scripts/peak_compare.py | 91 +++++++++++++++++++++++-----------
 1 file changed, 61 insertions(+), 30 deletions(-)

diff --git a/Python_Scripts/peak_compare.py b/Python_Scripts/peak_compare.py
index f69bc20..b0375df 100644
--- a/Python_Scripts/peak_compare.py
+++ b/Python_Scripts/peak_compare.py
@@ -5,6 +5,7 @@
 from extract_region import extract_bedbase_region
 from label_peak_type import label_peak_type, convert_narrow_peak_to_bedbase
 from IO import BedGraph, Bed
+from multiprocessing import Pool
 
 
 def list_type(arg):
@@ -21,49 +22,58 @@ def process_regions(args: argparse.Namespace) -> list:
     return regions
 
 
-def main(args: argparse.Namespace) -> None:
-    chromosome = args.chromosome
-    start = args.start
-    end = args.end
-
+def run(reference_merged_peaks,
+        reference_unmerged_peaks,
+        reference_bias_track,
+        reference_coverage_track,
+        comparison_bias_track,
+        comparison_coverage_track,
+        comparison_pvalue_track,
+        significance,
+        window_size,
+        cutoff,
+        unmerged,
+        chromosome,
+        start,
+        end) -> float:
     reference_merged_peaks = convert_narrow_peak_to_bedbase(
-        Bed.read_from_file(args.reference_merged_peaks_file),
+        reference_merged_peaks,
         chromosome,
         start,
         end
     )
     reference_unmerged_peaks = convert_narrow_peak_to_bedbase(
-        Bed.read_from_file(args.reference_unmerged_peaks_file),
+        reference_unmerged_peaks,
         chromosome,
         start,
         end
     )
     reference_bias_track = extract_bedbase_region(
-        BedGraph.read_from_file(args.reference_bias_track_file),
+        reference_bias_track,
         chromosome,
         start,
         end
     )
     reference_coverage_track = extract_bedbase_region(
-        BedGraph.read_from_file(args.reference_coverage_track_file),
+        reference_coverage_track,
         chromosome,
         start,
         end
     )
     comparison_bias_track = extract_bedbase_region(
-        BedGraph.read_from_file(args.comparison_bias_track_file),
+        comparison_bias_track,
         chromosome,
         start,
         end
     )
     comparison_coverage_track = extract_bedbase_region(
-        BedGraph.read_from_file(args.comparison_coverage_track_file),
+        comparison_coverage_track,
         chromosome,
         start,
         end
     )
     comparison_pvalue_track = extract_bedbase_region(
-        BedGraph.read_from_file(args.comparison_pvalue_file),
+        comparison_pvalue_track,
         chromosome,
         start,
         end
@@ -75,14 +85,14 @@ def main(args: argparse.Namespace) -> None:
     reference_pvalue_ci = generate_pvalue_ci(
         reference_bias_track,
         reference_coverage_track,
-        args.significance,
-        args.window_size
+        significance,
+        window_size
     )
     comparison_pvalue_ci = generate_pvalue_ci(
         comparison_bias_track,
         comparison_coverage_track,
-        args.significance,
-        args.window_size
+        significance,
+        window_size
     )
     compared_pvalues = compare_pvalue_ci(
         reference_pvalue_ci,
@@ -92,18 +102,46 @@ def main(args: argparse.Namespace) -> None:
         comparison_pvalue_track,
         compared_pvalues,
         reference_labelled_peaks,
-        args.cutoff
+        cutoff
     )
     metric = calculate_metric(
         reference_labelled_peaks,
         pseudopeaks,
-        include_merged_peaks=(not args.unmerged)
+        include_merged_peaks=(not unmerged)
     )
-    if args.parsable:
-        print(metric)
-    else:
-        print("The metric for these datasets over the range ",
-              f"{start} to {end} for chromosome {chromosome} is: {metric}.")
+    return metric
+
+
+def main(args: argparse.Namespace, regions: list) -> None:
+    reference_merged_peaks = Bed.read_from_file(
+        args.reference_merged_peaks_file)
+    reference_unmerged_peaks = Bed.read_from_file(
+        args.reference_unmerged_peaks_file)
+    reference_bias_track = BedGraph.read_from_file(
+        args.reference_bias_track_file)
+    reference_coverage_track = BedGraph.read_from_file(
+        args.reference_coverage_track_file)
+    comparison_bias_track = BedGraph.read_from_file(
+        args.comparison_bias_track_file)
+    comparison_coverage_track = BedGraph.read_from_file(
+        args.comparison_coverage_track_file)
+    comparison_pvalue_track = BedGraph.read_from_file(
+        args.comparison_pvalue_file)
+    run_arguments = [
+        (
+            reference_merged_peaks, reference_unmerged_peaks,
+            reference_bias_track, reference_coverage_track,
+            comparison_bias_track, comparison_coverage_track,
+            comparison_pvalue_track, args.significance, args.window_size,
+            args.cutoff, args.unmerged, chromosome, start, end
+        )
+        for chromosome, start, end in zip(
+            args.chromosome, args.start, args.end
+        )
+    ]
+    with Pool() as pool:
+        metrics = pool.starmap(run, run_arguments)
+    print(metrics)
 
 
 if __name__ == "__main__":
@@ -111,13 +149,6 @@ def main(args: argparse.Namespace) -> None:
         prog="PeakCompare",
         description="Determine how likely a peak is to be in two datasets."
     )
-    parser.add_argument(
-        "-p",
-        "--parsable",
-        action="store_true",
-        help=("Set this if you want the output of the script to be computer ",
-              "parsable (and not human readable).")
-    )
     parser.add_argument(
         "--unmerged",
         action="store_true",

From 0b60a5196b5fcde3cd89388a56a68c53cb78f9da Mon Sep 17 00:00:00 2001
From: sof202 <s.o.fletcher@exeter.ac.uk>
Date: Thu, 30 Jan 2025 12:24:32 +0000
Subject: [PATCH 3/9] revert: remove processing of regions (handled by main)

---
 Python_Scripts/peak_compare.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/Python_Scripts/peak_compare.py b/Python_Scripts/peak_compare.py
index b0375df..08323e1 100644
--- a/Python_Scripts/peak_compare.py
+++ b/Python_Scripts/peak_compare.py
@@ -16,12 +16,6 @@ def list_type(arg):
             "Argument must be a comma-separated list of integers")
 
 
-def process_regions(args: argparse.Namespace) -> list:
-    regions = [(chromosome, start, end) for chromosome, start,
-               end in zip(args.chromosome, args.start, args.end)]
-    return regions
-
-
 def run(reference_merged_peaks,
         reference_unmerged_peaks,
         reference_bias_track,
@@ -225,5 +219,4 @@ def main(args: argparse.Namespace, regions: list) -> None:
         help="The cutoff used to call peaks in the reference dataset."
     )
     args = parser.parse_args()
-    regions = process_regions(args)
     main(args)

From a88ec91094e4737a8b21a15139569862f6f0a00b Mon Sep 17 00:00:00 2001
From: sof202 <s.o.fletcher@exeter.ac.uk>
Date: Thu, 30 Jan 2025 12:28:39 +0000
Subject: [PATCH 4/9] docs: update config file to showcase multi region
 processing

---
 Setup/example_config_compare.txt | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/Setup/example_config_compare.txt b/Setup/example_config_compare.txt
index e6c396c..c365861 100644
--- a/Setup/example_config_compare.txt
+++ b/Setup/example_config_compare.txt
@@ -2,12 +2,12 @@
 # PARAMETERS #
 # ---------- #
 
-# The region that you wish to inspect. It is recommended that you pick a region
-# that is mainly characterised by peaks in your reference dataset. A larger
-# region will result in slowdown.
-CHROMOSOME=chr1
-START=500
-END=1000
+# The regions that you wish to inspect. It is recommended that you pick regions
+# that are characterised by peaks in your reference dataset, though this is not
+# a requirement. For multiple regions, supply a comma separated list.
+CHROMOSOME=chr1,chr1,chr1
+START=500,1234,9999
+END=1000,1280,12345
 
 # The cutoff that was used when peak calling with the reference dataset
 # Recall that this isn't the actual value, but the -log() value of the cutoff

From 01ac2aae2e8be2a2b02e375ad9a85d986f425bd8 Mon Sep 17 00:00:00 2001
From: sof202 <s.o.fletcher@exeter.ac.uk>
Date: Thu, 30 Jan 2025 12:31:31 +0000
Subject: [PATCH 5/9] docs: add types to args

---
 Python_Scripts/peak_compare.py | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/Python_Scripts/peak_compare.py b/Python_Scripts/peak_compare.py
index 08323e1..a60d4dc 100644
--- a/Python_Scripts/peak_compare.py
+++ b/Python_Scripts/peak_compare.py
@@ -16,20 +16,20 @@ def list_type(arg):
             "Argument must be a comma-separated list of integers")
 
 
-def run(reference_merged_peaks,
-        reference_unmerged_peaks,
-        reference_bias_track,
-        reference_coverage_track,
-        comparison_bias_track,
-        comparison_coverage_track,
-        comparison_pvalue_track,
-        significance,
-        window_size,
-        cutoff,
-        unmerged,
-        chromosome,
-        start,
-        end) -> float:
+def run(reference_merged_peaks: Bed,
+        reference_unmerged_peaks: Bed,
+        reference_bias_track: BedGraph,
+        reference_coverage_track: BedGraph,
+        comparison_bias_track: BedGraph,
+        comparison_coverage_track: BedGraph,
+        comparison_pvalue_track: BedGraph,
+        significance: float,
+        window_size: int,
+        cutoff: float,
+        unmerged: bool,
+        chromosome: str,
+        start: int,
+        end: int) -> float:
     reference_merged_peaks = convert_narrow_peak_to_bedbase(
         reference_merged_peaks,
         chromosome,

From 79a5d74619c6ab47080e34a9f37ac9ca95375f79 Mon Sep 17 00:00:00 2001
From: sof202 <s.o.fletcher@exeter.ac.uk>
Date: Thu, 30 Jan 2025 12:34:12 +0000
Subject: [PATCH 6/9] fix: chromosome type conversion

Previously chromosome was being converted to int, which doesn't work
when the chromosome starts with "chr" (standard)
---
 Python_Scripts/peak_compare.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/Python_Scripts/peak_compare.py b/Python_Scripts/peak_compare.py
index a60d4dc..3e6a4fa 100644
--- a/Python_Scripts/peak_compare.py
+++ b/Python_Scripts/peak_compare.py
@@ -8,7 +8,11 @@
 from multiprocessing import Pool
 
 
-def list_type(arg):
+def string_list(arg):
+    return [x for x in arg.split(',')]
+
+
+def integer_list(arg):
     try:
         return [int(x) for x in arg.split(',')]
     except ValueError:
@@ -167,19 +171,19 @@ def main(args: argparse.Namespace, regions: list) -> None:
     )
     parser.add_argument(
         "chromosome",
-        type=list_type,
+        type=string_list,
         help=("The chromosomes of the regions you wish to inspect. Comma "
               "separated list.")
     )
     parser.add_argument(
         "start",
-        type=list_type,
+        type=integer_list,
         help=("The base pair positions at the start of the regions you wish "
               "to inspect. Comma separated list.")
     )
     parser.add_argument(
         "end",
-        type=list_type,
+        type=integer_list,
         help=("The base pair positions at the end of the regions you wish to "
               "inspect. Comma separated list")
     )

From 877ae67504408a9b52068ee0461e597ddf1944d3 Mon Sep 17 00:00:00 2001
From: sof202 <s.o.fletcher@exeter.ac.uk>
Date: Thu, 30 Jan 2025 12:48:34 +0000
Subject: [PATCH 7/9] fix: give context to order of results

Due to parallelism, the order of inputs isn't necessarily going to be
the order of results. So a bunch of numbers doesn't really give you any
information. Instead, the region is printed alongside the metric.
---
 Python_Scripts/peak_compare.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Python_Scripts/peak_compare.py b/Python_Scripts/peak_compare.py
index 3e6a4fa..f80a7be 100644
--- a/Python_Scripts/peak_compare.py
+++ b/Python_Scripts/peak_compare.py
@@ -107,7 +107,7 @@ def run(reference_merged_peaks: Bed,
         pseudopeaks,
         include_merged_peaks=(not unmerged)
     )
-    return metric
+    return chromosome, start, end, metric
 
 
 def main(args: argparse.Namespace, regions: list) -> None:
@@ -138,8 +138,8 @@ def main(args: argparse.Namespace, regions: list) -> None:
         )
     ]
     with Pool() as pool:
-        metrics = pool.starmap(run, run_arguments)
-    print(metrics)
+        results = pool.starmap(run, run_arguments)
+    print(results)
 
 
 if __name__ == "__main__":

From 546c714e12514dd62bff2523ba954d3509802bf1 Mon Sep 17 00:00:00 2001
From: sof202 <s.o.fletcher@exeter.ac.uk>
Date: Thu, 30 Jan 2025 13:07:28 +0000
Subject: [PATCH 8/9] feat: nicer printing of results (tab separated table
 format)

---
 Python_Scripts/peak_compare.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/Python_Scripts/peak_compare.py b/Python_Scripts/peak_compare.py
index f80a7be..c7639fd 100644
--- a/Python_Scripts/peak_compare.py
+++ b/Python_Scripts/peak_compare.py
@@ -6,6 +6,8 @@
 from label_peak_type import label_peak_type, convert_narrow_peak_to_bedbase
 from IO import BedGraph, Bed
 from multiprocessing import Pool
+import pandas as pd
+import sys
 
 
 def string_list(arg):
@@ -138,8 +140,9 @@ def main(args: argparse.Namespace, regions: list) -> None:
         )
     ]
     with Pool() as pool:
-        results = pool.starmap(run, run_arguments)
-    print(results)
+        results = pd.DataFrame(pool.starmap(run, run_arguments))
+    results.to_csv(sys.stdout, header=False, index=False,
+                   float_format='%.4f', sep="\t")
 
 
 if __name__ == "__main__":

From df18f385e3ed176ab9de801dc4e08499b0d2153a Mon Sep 17 00:00:00 2001
From: sof202 <s.o.fletcher@exeter.ac.uk>
Date: Thu, 30 Jan 2025 13:25:49 +0000
Subject: [PATCH 9/9] docs: add explaination for parallelism and script outputs

---
 docs/peak_comparing.md | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/docs/peak_comparing.md b/docs/peak_comparing.md
index e238289..a588272 100644
--- a/docs/peak_comparing.md
+++ b/docs/peak_comparing.md
@@ -47,6 +47,20 @@ chromosome of interest before passing them to `peak_compare.py`. If you plan on
 running the python script interactively (or you are writing your own wrapper
 script), consider implementing this as well.
 
+## Parallelism
+
+The python script here allows for inspecting multiple regions at once. Give
+a comma separated list of chromosomes, start bases and end bases for each 
+region you want to inspect. The order of the results may not be in the same
+order as your inputs. Because of this, the output of the script is 4 tab
+separated columns.
+
+|Chromosome|Start|End|Metric|
+|----------|-----|---|------|
+
+This is written directly to the standard output stream, so you can use basic
+linux commands to work with it (awk, grep, `>`, *etc.*).
+
 ## How the metric is calculated
 
 The metric is a simple ratio of the number of bases in psuedopeaks in the