Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 84 additions & 36 deletions Python_Scripts/peak_compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,51 +5,75 @@
from extract_region import extract_bedbase_region
from label_peak_type import label_peak_type, convert_narrow_peak_to_bedbase
from IO import BedGraph, Bed
from multiprocessing import Pool
import pandas as pd
import sys


def main(args: argparse.Namespace) -> None:
chromosome = args.chromosome
start = args.start
end = args.end
def string_list(arg):
return [x for x in arg.split(',')]


def integer_list(arg):
try:
return [int(x) for x in arg.split(',')]
except ValueError:
raise argparse.ArgumentTypeError(
"Argument must be a comma-separated list of integers")


def run(reference_merged_peaks: Bed,
reference_unmerged_peaks: Bed,
reference_bias_track: BedGraph,
reference_coverage_track: BedGraph,
comparison_bias_track: BedGraph,
comparison_coverage_track: BedGraph,
comparison_pvalue_track: BedGraph,
significance: float,
window_size: int,
cutoff: float,
unmerged: bool,
chromosome: str,
start: int,
end: int) -> float:
reference_merged_peaks = convert_narrow_peak_to_bedbase(
Bed.read_from_file(args.reference_merged_peaks_file),
reference_merged_peaks,
chromosome,
start,
end
)
reference_unmerged_peaks = convert_narrow_peak_to_bedbase(
Bed.read_from_file(args.reference_unmerged_peaks_file),
reference_unmerged_peaks,
chromosome,
start,
end
)
reference_bias_track = extract_bedbase_region(
BedGraph.read_from_file(args.reference_bias_track_file),
reference_bias_track,
chromosome,
start,
end
)
reference_coverage_track = extract_bedbase_region(
BedGraph.read_from_file(args.reference_coverage_track_file),
reference_coverage_track,
chromosome,
start,
end
)
comparison_bias_track = extract_bedbase_region(
BedGraph.read_from_file(args.comparison_bias_track_file),
comparison_bias_track,
chromosome,
start,
end
)
comparison_coverage_track = extract_bedbase_region(
BedGraph.read_from_file(args.comparison_coverage_track_file),
comparison_coverage_track,
chromosome,
start,
end
)
comparison_pvalue_track = extract_bedbase_region(
BedGraph.read_from_file(args.comparison_pvalue_file),
comparison_pvalue_track,
chromosome,
start,
end
Expand All @@ -61,14 +85,14 @@ def main(args: argparse.Namespace) -> None:
reference_pvalue_ci = generate_pvalue_ci(
reference_bias_track,
reference_coverage_track,
args.significance,
args.window_size
significance,
window_size
)
comparison_pvalue_ci = generate_pvalue_ci(
comparison_bias_track,
comparison_coverage_track,
args.significance,
args.window_size
significance,
window_size
)
compared_pvalues = compare_pvalue_ci(
reference_pvalue_ci,
Expand All @@ -78,32 +102,54 @@ def main(args: argparse.Namespace) -> None:
comparison_pvalue_track,
compared_pvalues,
reference_labelled_peaks,
args.cutoff
cutoff
)
metric = calculate_metric(
reference_labelled_peaks,
pseudopeaks,
include_merged_peaks=(not args.unmerged)
include_merged_peaks=(not unmerged)
)
if args.parsable:
print(metric)
else:
print("The metric for these datasets over the range ",
f"{start} to {end} for chromosome {chromosome} is: {metric}.")
return chromosome, start, end, metric


def main(args: argparse.Namespace, regions: list) -> None:
reference_merged_peaks = Bed.read_from_file(
args.reference_merged_peaks_file)
reference_unmerged_peaks = Bed.read_from_file(
args.reference_unmerged_peaks_file)
reference_bias_track = BedGraph.read_from_file(
args.reference_bias_track_file)
reference_coverage_track = BedGraph.read_from_file(
args.reference_coverage_track_file)
comparison_bias_track = BedGraph.read_from_file(
args.comparison_bias_track_file)
comparison_coverage_track = BedGraph.read_from_file(
args.comparison_coverage_track_file)
comparison_pvalue_track = BedGraph.read_from_file(
args.comparison_pvalue_file)
run_arguments = [
(
reference_merged_peaks, reference_unmerged_peaks,
reference_bias_track, reference_coverage_track,
comparison_bias_track, comparison_coverage_track,
comparison_pvalue_track, args.significance, args.window_size,
args.cutoff, args.unmerged, chromosome, start, end
)
for chromosome, start, end in zip(
args.chromosome, args.start, args.end
)
]
with Pool() as pool:
results = pd.DataFrame(pool.starmap(run, run_arguments))
results.to_csv(sys.stdout, header=False, index=False,
float_format='%.4f', sep="\t")


if __name__ == "__main__":
parser = argparse.ArgumentParser(
prog="PeakCompare",
description="Determine how likely a peak is to be in two datasets."
)
parser.add_argument(
"-p",
"--parsable",
action="store_true",
help=("Set this if you want the output of the script to be computer ",
"parsable (and not human readable).")
)
parser.add_argument(
"--unmerged",
action="store_true",
Expand All @@ -128,19 +174,21 @@ def main(args: argparse.Namespace) -> None:
)
parser.add_argument(
"chromosome",
help="The chromosome of the region you wish to inspect."
type=string_list,
help=("The chromosomes of the regions you wish to inspect. Comma "
"separated list.")
)
parser.add_argument(
"start",
type=int,
help=("The base pair position at the start of the region you wish to "
"inspect")
type=integer_list,
help=("The base pair positions at the start of the regions you wish "
"to inspect. Comma separated list.")
)
parser.add_argument(
"end",
type=int,
help=("The base pair position at the end of the region you wish to "
"inspect")
type=integer_list,
help=("The base pair positions at the end of the regions you wish to "
"inspect. Comma separated list")
)
parser.add_argument(
"reference_merged_peaks_file",
Expand Down
12 changes: 6 additions & 6 deletions Setup/example_config_compare.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@
# PARAMETERS #
# ---------- #

# The region that you wish to inspect. It is recommended that you pick a region
# that is mainly characterised by peaks in your reference dataset. A larger
# region will result in slowdown.
CHROMOSOME=chr1
START=500
END=1000
# The regions that you wish to inspect. It is recommended that you pick regions
# that are characterised by peaks in your reference dataset, though this is not
# a requirement. For multiple regions, supply a comma separated list.
CHROMOSOME=chr1,chr1,chr1
START=500,1234,9999
END=1000,1280,12345

# The cutoff that was used when peak calling with the reference dataset
# Recall that this isn't the actual value, but the -log() value of the cutoff
Expand Down
14 changes: 14 additions & 0 deletions docs/peak_comparing.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,20 @@ chromosome of interest before passing them to `peak_compare.py`. If you plan on
running the python script interactively (or you are writing your own wrapper
script), consider implementing this as well.

## Parallelism

The python script here allows for inspecting multiple regions at once. Give
a comma separated list of chromosomes, start bases and end bases for each
region you want to inspect. The order of the results may not be in the same
order as your inputs. Because of this, the output of the script is 4 tab
separated columns.

|Chromosome|Start|End|Metric|
|----------|-----|---|------|

This is written directly to the standard output stream, so you can use basic
linux commands to work with it (awk, grep, `>`, *etc.*).

## How the metric is calculated

The metric is a simple ratio of the number of bases in psuedopeaks in the
Expand Down