Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions assets/test-datasets/b_fragilis_checkv_contamination.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
contig_id contig_length total_genes viral_genes host_genes provirus proviral_length host_length region_types region_lengths region_coords_bp region_coords_genes region_viral_genes region_host_genes
k141_126 3500 6 4 2 Yes 2000 1500 viral,host,viral 1000,1500,1000 1-1000,1001-2500,2501-3500 1-2,3-4,5-6 2,0,2 0,2,0
k141_248 1000 3 1 2 Yes 500 500 host,viral,host 250,500,250 1-250,251-750,751-1000 1,2,3 0,1,0 1,0,1
k141_50 1000 6 0 0 No NA NA NA NA NA NA NA NA
k141_203 843 2 1 1 Yes 500 343 viral,host 500,343 1-500,501-843 1,2 1,0 0,1
3 changes: 3 additions & 0 deletions assets/test-datasets/b_fragilis_clusters.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
k141_126 k141_126,k141_248
k141_50 k141_50
k141_203 k141_203
Binary file added assets/test-datasets/b_fragilis_contigs.fasta.gz
Binary file not shown.
5 changes: 5 additions & 0 deletions assets/test-datasets/b_fragilis_genomad_virus_summary.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
seq_name length topology coordinates n_genes genetic_code virus_score fdr n_hallmarks marker_enrichment taxonomy
k141_126 3970 Provirus 1-3500 6 11 0.8408 0.0833 0 1.5749 Unclassified
k141_248 3153 Provirus 1000-2000 3 11 0.7912 0.1337 0 1.7183 Viruses;Duplodnaviria;Heunggongvirae;Uroviricota;Caudoviricetes
k141_50 3219 Provirus 1-1000 6 11 0.6643 0.1798 2 2.7775 Viruses;Duplodnaviria;Heunggongvirae;Uroviricota;Caudoviricetes
k141_203 843 No terminal repeats NA 2 11 0.9925 0.0075 1 3.0763 Viruses;Duplodnaviria;Heunggongvirae;Uroviricota;Caudoviricetes
6 changes: 6 additions & 0 deletions assets/test-datasets/b_fragilis_propagate_contigs.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
>k141_126
CATAAACTACGCTCGGCACAGCCTCTGCTCGCCGTGCTGAACCGCCTGGAACAGAGAAAACCGGTGGGGCTGCGCTACGATCCGCAAGCACAATCGCTGGTGTGCCTGCCCACGCAAACCCGGACGGGCTGGAATCTCAATGGCTTTGAGGTGGGGTTCAGGCCATGCGTCAGGCTGATGATTTACGGACGTTCTCTTGAGGCGCAGGCAACCGCGAGTCTTGCAGCAGCCACAGGCTATGACAGCCATATCTTCGATCTTTTTCCGGCCTCAGCCAGCGCTCAGATCGATACCGATACGGCGGTCATTTTGCTGTGCCATGATCTCAACCGGGAGCTGCCAGTGTTGCAGGCCGCGCGAGAAGCAAAACCCTTTTATCTCGGCGCATTGGGCAGCTATCGAACCCACACGTTACGTCTGCAAAAGCTCCACGAGCTGGGATGGTCCAGGGAGGAAACAACGCAAATCCGGGCACCCGTCGGGATATTCCCCAAAGCCCGGGATGCGCATACTCTGGCACTCTCCGTGCTGGCAGAAGTCGCCTCTGTACGTCTCCATCAGGAGGAGGATTCATGCCTGCCCCCGTCGTCCTGATCCTTGCGGCCGGGCGTGGAGAGTGCTTTCTCGCCTCCGGGGGAAATACCCATAAGTGTATCGGCTGGCGTCAGTCCCCGGAGGTTGCGCCTTATCGCTGGCCATTTGAAGAAAACGGGAGAACTTTCGACCTTGCGATTGAACCGCAGATTACGACTAATGATCTGCGTCTGATGTTGAGGCTGGCTCTTGCCGGCGGAGGAATAACAATTGCCACTCAGGAAACTTTCAGGCCATATATTGAAAGCGGTAAGCTTGTATCGCTGCTTGATGACTTTCTTCCACAATTTCCGGGCTTCTATCTGTATTTCCCACAGCGTCGCAATATTGCACCAAAGCTCCGCGCCCTGATTGACTACGTCAAAGAATGGCGGCAGCAATTGGCTTAAATGTCTGCACCTGCACTGCCTGATGTCAGAACAGTATTTTGATGAATTGCCAAGGTTACAATGGCACAAATACGGCACAGGAGGAAACGTGGTGTATTTAAATATGGGGTAACTCTTTGATTTTAATGGTGCCGATAATAGGAGTCGAACCTACGACCTTCGCATTACGAATGCGATGTTAATGCATTTTTAACTACTTGAGCAATCAATCAAAACAACATACTTTATATTATAAAACACCAAGTTATAGCAATAAACCCGTTGATTGAAATTAACAAAAGTTGATGTTTATTCCCTCCATTTACTTACACCAGCAGTTACATTATTGTAAGACAGCAATTCTCAACTGGAGCCGAAAAATGGCGCTTTCAAGACAAAAATTTACCTTCGAAAGACTTCGCAGATTCACCTTACCGGAAGGGAAAAAACAAACTTTTCTTTGGGATGCAGATGTAACAACCCTGGCATGCCGAGCAACTAGCGGAGCAAAAGCCTTTGTATTCCAAAGCGTATATGCGGGGAAAACCCTTCGCATGACTATTGGCAACATTAACGACTGGAAGATTGATGATGCGAGAGCCGAGGCAAGACGGTTACAAACATTGATCGATACAGGGATAGATCCACGAATTGCTAAGGCTGTAAAAATCGCAGAAGCAGAATCCCTGCAGGCAGAATCACGTAAAACAAAAGTGACTTTCTCCGTCGCCTGGGAAGACTATCTTCAAGAATTGAGAACCGGTATCAGTGCAAAAACTAAACGCCCATATTCTACTCGATACATTGCCGATCACATTAACTTGTCCAGTCGTGGAGGCGAAAGTAAAAAAAGAGGCCAAGGCCCGACTTCGGCTGGACCATTGGCTAGTTTGCTCAACCTGCCGTTATCGGAGCTAACCCCAGATTACATAGCAGCGTGGCTGAGTACAGAAAGGCAAAATAGACCTACCGTCACTGCTCACGCTTATCGCCTACTACGTGCTTTCATCAAATGGAGTAATTATCAGAAAAAATATCAAGGGATCATTCCTGGCGATCTGGCACAAGATTACAACGTAAGAAAAATGGTTCCCGTGTCAGCGAGTAAAGCTGATGATTGCCTGCAAAAGGAACAACTAAAAAGCTGGTTTAGTGCCGTGCGTAGCCTCAATAATCCTATTGCATCGGCCTATCTCCAAGTACTTTTGCTCACTGGTGCTCGGCGTGAAGAAATTGCGTCGCTTCGCTGGTCAGACGTAGATTTCAAATGGTCAAGCATGCGAATTAAAGACAAGATCGAAGGTGAACGTATCATCCCTCTCACTCCTTATGTTTCTGAATTGTTAAATGTGCTAGCGCAATCCCCAAATTCTGACGTAAATAAGGAGGGTTGGGTTTTCAGAAGTAACAGTAAAAGTGGCAAAATTATTGAGCCGCGTTCAGCGCACAACAGAGCATTAGTGCTGGCTGAGTTACCACATATCAGCCTTCACGGTTTACGTCGTAGTTTTGGTACTTTGGCCGAGTGGGTTGAAGTTCCCACTGGTATTGTTGCTCAAATTATGGGACACAAACCCAGCGCTCTTGCCGAAAAACACTATCGCCGTCGTCCGTTAGATCTGTTACGAAAATGGCACGAGAAAATTGAGACATGGATCTTAAATGAAGCAGGTATTACCATAAAAAACAACGTTGATATGCGTTGATTCCATTAAAAATCAACATATTACAAAATATCATCAACTATTGATCAAGATAGATTTTCATGTATCGTAATACACAGTTTAGTCAATGATACAGCAACTACACAGGAGATAAGCCAATGGCAACCCCAGCAACTGTATCCATAGAACCCACTCTGGCAGCTATCAGAGCTCGCTGGTGTATTAATTCAAGTAAAACAACTCAATCCTTTAACGATCCTGCGTCCATGGAAGAGGTTGTCGAGTATCTCAAAGGAACATACTCAGCTCTTCGCAAGTCTGTCGCATGCGCCAAACTGAAAATTTTACATCTTAAACAAAGAATGCAAAATGCTACTAACTTTCTCGCGCGTCTGATGTCATGTAAAAATCAGGCATCCAGATCGCATCACAGTACGGCTAAATCAGCTAAAAGTGCCTTATCATCAGATTCAGGTGATGGTAGTGACCCCGACCCCGAGCCCGAAACGTTTCCTTCTGCCTTCATTACTACCCCTACTAATTCAATAATGCTTAAAGCTTTCTTTGCCAATATCTCAATCACTGAGGTGGCAAAATGAGCGCATTCAAACTCCCGGATACATCTCAATCACAGCTCATTTCAACAGCTGAGTTAGCTAAAATCATTAGCTACAAATCTCAAACCATTCGTAAATGGCTTTGTCAGGACAAATTGCCTGAGGGGCTACCTCGCCCAAAACAAATCAATGGCCGCCATTACTGGTTACGTAAAGATGTCCTCGATTTTATAGATACATTTTCTGTACGAGAAAGTCTGTAATAAATTACAGATTTAATTTTATTGATTTATAGCGATGTTGCCCCGAGAAAAATGGGGCAACACTGAGAAATTTCAGATAGTAGTTTTATATTGAGATAACAAAGAGGTTTCCTTAAAAATGTCTAATAGTGTTACTAATTTTGAGATGAGCAGCGTTCTACCAGGAAAAAAACCTTGTCAAGGCAAAAACAATGAGTCACAGGTAGTACAGACTACTCCCATAAAAAAACACTCAGTCACGTTCAAAAATCAATCTTCATTAGGAGTAATTGATCATTATGCCAGACTAACAAATAAATCTCACTCTTCCGTAATAGCGGAAGTTGTGGATTTGGCTATCCCTATATTAGAAAAATGCAATCGTCATAACTGGTCAATAAATGAAATAAAAAATGACCTGTTAAAGTTCTCTATAAAAGAAAGCATCAATCGAAGCCGAGGTAAAACAGAAGTAACTCTGGAAGAGTACTGTTCGTTAATCTGGAAAACGAACATCATGAGTCCATTAAAAATCCCCATTGCAG
>k141_50
CGCTGGCCCTGCTTATTACAGGATGTGCTCAACAGACATTTACTGTTCAAAACAAACAGACAGCAGTAGCACCAAAGGAAACCATCACCCATCATTTCTTCGTTTCTGGAATTGGGCAGAAGAAAACTGTCGATGCAGCTAAAATTTGTGGCGGCGCAGAAAATGTTGTTAAAACAGAAACCCAGCAAACATTCGTAAATGGATTGCTCGGTTTTATTACTTTAGGCATTTATACTCCGCTGGAAGCGCGTGTGTATTGCTCAAAATAATTGCATGAGTTGCCCATCGATATGGTCAGCTCTATCTGCACTGCTCATTAATATACTTCTGGGTTCCTTCCAGTTGTTTTTGCATAGTGATCAGCCTCTCTCTGAGGGTGAAATAATCCCGTTCAGCGGTGTCTGCCAGTCGGGGGGAGGCTGCATTATCCACGCCGGAGGCCGTGGTGGCTTCACGCACTGACTGACAGACTGCTTTGATGTGCAACCGACGACGACCAGCGGCAACATCATCACGCAGAGCATCATTTTCAGCTTTCGCATCAGCTAACTCCTTCGTGTATTTTGCATCGAGCGCAGCAACATCACGCTGACGCATCTGCATGTCAGTAATTGCCGCGTTCGCTAGCTTCAGTTCTCTGGCATTTTTGTCGCGCTGGACTTTGTAGGCGATTGCGTTATCACGGTAATGATTGACCGCCCATGACAGGCTGACGATGATGCAGATAATCAGAGCGGATATAATCGCGGTTACTCTGCTCACTGTTGCCCCCACAAACAGACTTCACGCTCAATCTCACGACGAGTCATCAGGCCTTTCCATTGCTTACCGCCAGCGTATGTCCAGCGACGCAGCTGATCACATGCGCCTTTGATATCGCCCTGGTTTATTTTGCGAAGAAGCGTCGATGTTCTAAAATTGCCAGCACCCACGTTGTAAACGAATGAGTAAAGAGCGCCGCGCGTTGTTTCCGGTATATCGACTTTGATATACGGGTTAATTTGTCTGGCGACAGTGGCAAGGTCTTTATTCAAGAGTGCTTTGCATTCTGCTTTGGTATACGTTTTACCGAGCATGATGTCTTTTCCGGTGTGTCCGTGACATACAGTCCATACACCAACAATATCTTTGTATGGTATGTAGCTGACACCTTCCAGACCATCGTTACCACTTGGGCCAGTGATTAACACTGATGCTATAGCAATTGCTCCGCCACCAATAGCAGCAGCAACGGCTTTTCGTAATGATGGAGGCATTATTCACCTCTCGCAGCCTTGCGCTTATCTTCTTTAATCTTGAAATAAAGGTTTGTCAGGTACGTCAGCAGGCCAAATACCAGGCTACCCAGCACACCTATTGCTGCCCACTGTGAGGGAGTGACTTTATCTAGCAGCTGTAAAAACCAGTACCCGGCACTACCTGCTGAGGTGCCATAGGCGACACCCGTTGTTAACTTATCCATGGATTTCATAACCCCACCTCGCAGACAAAGCGGGTGTAAATTGAGGGAATACTACGAAACGTAACAGACTCGGAGTCAGTGAATAACTCAGGTATTGGGTTATCAGCTAATATCGAGACTCAAAAAATGGAAAAACCCGCTCGACGGCGGGTTTAAGCTGTGTGACGAAGTAACCACTCTTAACAGCATAACCAATTTTTTACGTACGTAAACCACTAAATGATATTTGAGAGAATGCTACCGAGTATTGAAAACACCACTACAAATACATAAGCAAATCTCAACAAATAACCAAAAAATAATTTCCAGTGTTATTTTTAGCCGGTTTAAATTGAACCTTCAAATTATAGAGCACTTATAAATAACAGCCGTTAATATAAATTGGCTAATAGATTTATTTTTATTCAGCCAAGAGCCATGAATAGGATTCGATAGAAAAAAGTTCAGATAAAAATAGAGATCTACTTCACAAATCAAACGAGAAACCAAAACTTACATCTTGAATAATCACATTGATTAGATGAATATTTATCGCGCAGTGACATCATTTTTTAATAATAGTTCAAAAAAAGGGCTCACGATGAAAAAATTAACAGTGGCAATTTCTGCTGTAGCTGCATCAGTACTAATGGCGATGTCTGCTCAGGCAGCTGAAATTTATAATAAAGACAGTAACAAGCTGGATCTGTACGGGAAAGTTAATGCTAAGCACTACTTCTCCTCTAATGATGCAGATGATGGTGATACTACTTATGCCCGTCTTGGCTTCAAAGGTGAAACCCAAATCAACGATCAACTGACTGGTTTCGGTCAGTGGGAATATGAATTCAAAGGCAACCGCGCTGAATCTCAAGGTTCCTCCAAAGATAAAACCCGTCTTGCCTTCGCTGGCCTGAAATTCGGTGACTACGGCTCCATCGATTACGGCCGTAACTACGGTGTAGCATACGACATCGGTGCGTGGACTGACGTCCTGCCAGAATTCGGTGGTGACACTTGGACTCAAACCGACGTGTTCATGACTCAACGTGCAACTGGTGTTGCAACCTATCGTAACAACGACTTCTTTGGTCTGGTTGATGGTCTGAACTTTGCTGCTCAGTACCAAGGCAAAAACGATCGTAGCGATTTCGATAACTACACTGAAGGTAACGGTGATGGCTTCGGTTTCTCTGCTACCTATGAATACGAAGGATTCGGTATCGGTGCAACTTATGCGAAATCTGATCGTACCGACACTCAAGTTAATGCAGGGAAAGTTCTTCCTGAAGTATTTGCTTCCGGTAAAAATGCAGAAGTTTGGGCCGCAGGTCTGAAATATGACGCTAACAACATTTACCTGGCCACTACCTATTCTGAAACCCAGAATATGACTGTATTTGCTGATCACTTCGTTGCTAATAAAGCCCAAAACTTCGAAGCTGTTGCACAATATCAGTTCGATTTCGGTCTGCGTCCGTCCGTTGCTTACCTGCAATCTAAAGGTAAGGATCTTGGAGTATGGGGCGATCAGGACTTAGTCAAATATGTTGATGTAGGTGCAACCTATTACTTCAACAAAAATATGTCTACTTTCGTTGATTACAAAATCAACCTGCTTGACAAAAATGACTTCACTAAGGAAGGTGCGAACAAGTCCCTGATATGAGATCATGTTTGTCATCTGGAGCCATAGAACAGGGTTCATCATGAGTCATCAACTTACCTTCGCCGACAGTGAATTCAGCAGTAAGCGCCGTCAGACCAGAAAAGAGATT
>k141_203
GGGCTTCACTGCGAAATTCAGGCGAATGCTGTTTACGGGGTTTTTTACTGGTTGATACTGTTTTTGTCATGTGAGTCACCTCTGACTGAGAGTTTACTCACTTAGCCGCGTGTCCACTATTGCTGGGTAAGATCAGATTACGGTTGCGCCTGTTACCGCGGCAACGTCCTGTGCACAGAAGCTCTTATGCGTCCCCAGGTAATGAATAATTGCCTCTTTGCCCGTCATACACTTGCTCCTTTCAGTCCGAACTTAGCTTTAATTTCTGCGATCTTCGCCAGAGCCTGTGCACGATTTAGAGGTCTACCGCCCATAACAGGAAGTTGTTTTACTGGTTCAGGTATCGTCTCACCACGGTTAATTCGCGCTGTCATACAGGTCAGTTCATCGGCAGCCTTGCGCCGTAATTCCGCGTCAGCCAGCGCATTGGCCCGCATGTTCTGGTACAAGTTGGTAACCAACCAGTAATGCGCGTTCGATTTCCACGGATAAGACTCTGCATCCGGATACAGGCCACGCTTCCGGCAATACTCGTACCTCCCGGGATTTCATGAAATTCCGGCTCGGTGGTTTCGAGGCAATAAAATCGGCTTACATGGCCCAGGTGCAGTACAGCATGTGGGTGACGCGAAAAGATGCCTGGTACTTTGCCAACTATGACCCGCGCATGAAGCGTGAAGGCCTGCATTATGTCGTGATTGAGCGGAATGAAAAGTACATGGCGAGTTTTGACGAGATGGTGCCGGAGTTCATCGAAAAAATGGACGAGGCACTGGCTGAAATTGGTTTTGTATTTGGGGAGCAATGGCGATGACGCATCCTCACGATAATATCCGGGTACCT
5 changes: 5 additions & 0 deletions assets/test-datasets/b_fragilis_provirus_coords.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
scaffold fragment start stop
k141_126 k141_126|provirus_1_1000 1 1000
k141_126 k141_126|provirus_2501_3500 2501 3500
k141_50 k141_50|provirus_1_1000 1 1000
k141_203 k141_203|provirus_1_500 1 500
50 changes: 50 additions & 0 deletions bin/derep_coordinates.py
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This feels like a big overhead for what it's doing. Is it just checking the FASTA header is in the TSV? We should be able to streamline this...

Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#!/usr/bin/env python

from Bio import SeqIO
import argparse
import sys
import gzip
import pandas as pd
import math


def parse_args(args=None):
Description = "Dereplicate provirus coordinates to match dereplicated provirus assemblies"
Epilog = "Example usage: python derep_coordinates.py --coordinates <COORDS_TSV> --clusters <CLUSTERS_TSV> --output <DEREP_COORDS_TSV>"

parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
parser.add_argument(
"-r",
"--coordinates",
help="Path to TSV file containing provirus coordinates for each assembly.",
)
parser.add_argument(
"-c",
"--clusters",
help="Path to the TSV file containing cluster information from dereplicating assemblies.",
)
parser.add_argument(
"-o",
"--output",
help="Output TSV file containing provirus coordinates for dereplicated assembly.",
)
return parser.parse_args(args)

def derep_coordinates(coords_tsv, clusters_tsv, output):
# open coordinates file
coords = pd.read_csv(coords_tsv, sep='\t')
# open cluster results
clusters = pd.read_csv(clusters_tsv, sep='\t', header=None)
# identify coords contained in derep clusters
derep_coords = coords[coords['scaffold'].isin(set(clusters[0]))]

# save coords file
derep_coords.to_csv(output, sep='\t', index=False)

def main(args=None):
args = parse_args(args)
derep_coordinates(args.coordinates, args.clusters, args.output )


if __name__ == "__main__":
sys.exit(main())
130 changes: 130 additions & 0 deletions bin/extract_proviruses.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
#!/usr/bin/env python

from Bio import SeqIO
import argparse
import sys
import gzip
import pandas as pd
import math


def parse_args(args=None):
Description = "Extract proviruses present in assemblies based on geNomad and CheckV outputs."
Epilog = "Example usage: python extractproviruses.py --fasta <FASTA> --genomad <virus_summary> --checkv <quality_summary> --output <proviruses.fasta.gz> --tsv <provirus_coordinates.tsv>"

parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
parser.add_argument(
"-f",
"--fasta",
help="Path to FASTA file (gzipped) that contains assemblies.",
)
parser.add_argument(
"-g",
"--genomad",
help="Path to the TSV file containing geNomad's virus summary.",
)
parser.add_argument(
"-c",
"--checkv",
help="Path to the TSV file containing CheckV's contamination summary.",
)
parser.add_argument(
"-o",
"--output",
help="Output FASTA file with assemblies containing proviruses.",
)
parser.add_argument(
"-t",
"--tsv",
help="Output TSV file containing provirus coordinates.",
)
return parser.parse_args(args)

def extract_proviruses(fasta, genomad, checkv, out_fasta, out_tsv):
# open genomad results
genomad = pd.read_csv(genomad, sep='\t')
# identify genomad proviruses
genomad_proviruses = genomad[genomad['topology'] == 'Provirus']

# open checkv results
checkv = pd.read_csv(checkv, sep='\t')
# filter to checkv proviruses
checkv_proviruses = checkv[checkv['provirus'] == 'Yes']

# identify checkv provirus coordinates
checkv_coords = pd.DataFrame()
for index, row in checkv_proviruses.iterrows():
contig_id = row['contig_id']
region_types = row['region_types'].split(',')
provirus_count = 0
# parse though regions for each contig
for i in range(len(region_types)):
if region_types[i] == 'viral':
# if a region is viral, extract contig name, assign a provirus id, and add checkv start/end coords
provirus_info = pd.DataFrame()
provirus_count += 1
provirus_info['seq_name'] = [contig_id]
provirus_info['provirus_id'] = [contig_id + '|checkv_provirus_' + str(provirus_count)]
provirus_info[['provirus_start', 'provirus_stop']] = [row['region_coords_bp'].split(',')[i].split('-')]
checkv_coords = pd.concat([checkv_coords, provirus_info], axis=0)
Comment on lines +54 to +69
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a fairly inefficient way of creating a dataframe. Better to create a list of DataFrames then concat once:

pd.concat([checkProvirusCoords(inputs) for inputs in checkv_proviruses])

Not a blocker because I doubt this is slow - but worth noting.


# merge genomad and checkv
genomad_checkv = genomad_proviruses.merge(checkv_coords, on='seq_name', how='outer')

# if checkv provirus exists, add it to df and skip associated genomad provirus. If not, use geNomad provirus information
provirus_combined_coords = pd.DataFrame()
already_added = {}
for index, row in genomad_checkv.iterrows():
if str(row['seq_name']) + '_' + str(row['provirus_id']) in already_added:
continue
else:
provirus_coords = pd.DataFrame()
# remove prefix added above or by geNomad
provirus_coords['scaffold'] = [row['seq_name'].split('|provirus')[0]]
# if checkv provirus only; use checkv coordinates
if float(row['provirus_start']) > 0 and '-' not in str(row['coordinates']):
provirus_coords['start'] = [row['provirus_start']]
provirus_coords['stop'] = [row['provirus_stop'] ]
# if checkv and genomad provirus; add genomad start and checkv start (-1 to set checkv start to 0)
# find stop by adding length (checkv_stop - checkv_start) to start coord found above
elif float(row['provirus_start']) > 0 and '-' in row['coordinates']:
provirus_coords['start'] = [int(row['coordinates'].split('-')[0]) + (int(row['provirus_start']) - 1)]
provirus_coords['stop'] = [provirus_coords['start'][0] + int(row['provirus_stop']) - int(row['provirus_start'])]
# if genomad provirus only; use genomad coordinates
elif math.isnan(row['provirus_start']) and '-' in row['coordinates']:
provirus_coords['start'] = [int(row['coordinates'].split('-')[0])]
provirus_coords['stop'] = [int(row['coordinates'].split('-')[1])]
# rename provirus fragment based on final start/end
if '|provirus_' in row['seq_name']:
provirus_coords['fragment'] = row['seq_name'].split('|provirus_')[0] + '|provirus_' + str(provirus_coords['start'][0]) + '_' + str(provirus_coords['stop'][0])
else:
provirus_coords['fragment'] = row['seq_name'] + '|provirus_' + str(provirus_coords['start'][0]) + '_' + str(provirus_coords['stop'][0])
# concatenate all provirus coordinates
provirus_combined_coords = pd.concat([provirus_combined_coords, provirus_coords], axis=0)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

More loops


# reorder columns to match propagate input
provirus_combined_coords_reorg = provirus_combined_coords[['scaffold', 'fragment', 'start', 'stop']]
# save coords file
provirus_combined_coords_reorg.to_csv(out_tsv, sep='\t', index=False)
# identify scaffolds to extract
provirus_scaffolds = set(provirus_combined_coords_reorg['scaffold'])

# extract provirus scaffolds (full length) from fasta
exracted_scaffolds = []
fasta_gunzipped = gzip.open(fasta, "rt")
for record in SeqIO.parse(fasta_gunzipped, "fasta"):
if record.id in provirus_scaffolds:
record.description = ''
record.name = ''
exracted_scaffolds.append(record)
# save all extracted provirus scaffolds to specified file
SeqIO.write(exracted_scaffolds, out_fasta, "fasta")


def main(args=None):
args = parse_args(args)
extract_proviruses(args.fasta, args.genomad, args.checkv, args.output, args.tsv)


if __name__ == "__main__":
sys.exit(main())
Loading