From 9fd64c5f0924396f6e5f7450171f85a328fa37b5 Mon Sep 17 00:00:00 2001 From: Stefan Kranz Date: Tue, 13 May 2025 11:46:25 +0200 Subject: [PATCH 1/3] take care of syntax warnings for comparing literals with `is` --- BLAST/configure.py | 2 +- BLAST/runblastn.py | 2 +- Objects/alignment.py | 33 +++-- Objects/blast.py | 252 +++++++++++++++++++++++-------------- Objects/snp.py | 219 ++++++++++++++++++-------------- Objects/wrappers.py | 69 ++++++----- Utilities/arguments.py | 274 ++++++++++++++++++++--------------------- Utilities/filters.py | 165 +++++++++++++------------ Utilities/utilities.py | 118 +++++++++--------- snp_utils.py | 209 +++++++++++++++++++------------ 10 files changed, 750 insertions(+), 593 deletions(-) diff --git a/BLAST/configure.py b/BLAST/configure.py index ca06426..f6deeb4 100644 --- a/BLAST/configure.py +++ b/BLAST/configure.py @@ -3,7 +3,7 @@ """Write and parse configuration files for BLASTn""" import sys -if sys.version_info.major is not 3: +if sys.version_info.major != 3: sys.exit("Please use Python 3 for this module: " + __name__) import configparser diff --git a/BLAST/runblastn.py b/BLAST/runblastn.py index 7e3af86..bfe13e4 100644 --- a/BLAST/runblastn.py +++ b/BLAST/runblastn.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 import sys -if sys.version_info.major is not 3: +if sys.version_info.major != 3: sys.exit("Please use Python 3 for this module: " + __name__) import os diff --git a/Objects/alignment.py b/Objects/alignment.py index 7966c8c..c41b2f9 100644 --- a/Objects/alignment.py +++ b/Objects/alignment.py @@ -1,11 +1,13 @@ #!/usr/bin/env python3 import sys -if sys.version_info.major is not 3: + +if sys.version_info.major != 3: sys.exit("Please use Python 3 for this module: " + __name__) import re + # A class definition for a SAM alignment class Alignment(object): """A class to hold a SAM Alignment @@ -16,27 +18,35 @@ class Alignment(object): 1-based leftmost mapping position Cigar string """ - _CIGAR = re.compile(u'([0-9]+[A-Z]+)') # Regex to break the CIGAR string into component codes using re.findall() + + _CIGAR = re.compile( + "([0-9]+[A-Z]+)" + ) # Regex to break the CIGAR string into component codes using re.findall() + def __init__(self, line): try: assert isinstance(line, str) except AssertionError: raise TypeError # There's only some information that we need for our alignment, everything else is forgotten - split_line = line.strip().split() # Remove leading and trailing whitespace, then split the line by column - self._qname = split_line[0] # First column in a SAM file - self._flag = int(split_line[1]) # Second column - self._rname = split_line[2] # Third column - self._pos = int(split_line[3]) # Fourth column, should be an int - self._cigar = self._CIGAR.findall(split_line[5]) # Sixth column, after breaking up the CIGAR string into a list of component codes + split_line = ( + line.strip().split() + ) # Remove leading and trailing whitespace, then split the line by column + self._qname = split_line[0] # First column in a SAM file + self._flag = int(split_line[1]) # Second column + self._rname = split_line[2] # Third column + self._pos = int(split_line[3]) # Fourth column, should be an int + self._cigar = self._CIGAR.findall( + split_line[5] + ) # Sixth column, after breaking up the CIGAR string into a list of component codes def __repr__(self): - return self._rname + ':' + self._qname + return self._rname + ":" + self._qname def get_rc(self): """Check to see if we're reverse complementing our sequence""" # If the 16th bit is set, it's reverse complement - return self._flag is 16 + return self._flag == 16 def get_name(self): """Get the alignment name""" @@ -56,5 +66,4 @@ def get_cigar(self): def check_flag(self): """Make sure we don't have extraneous alignments""" - return self._flag is 0 or self._flag is 16 - + return self._flag == 0 or self._flag == 16 diff --git a/Objects/blast.py b/Objects/blast.py index 74ffe45..aa25bcf 100644 --- a/Objects/blast.py +++ b/Objects/blast.py @@ -2,7 +2,8 @@ """This module holds two classes and one error""" import sys -if sys.version_info.major is not 3: + +if sys.version_info.major != 3: sys.exit("Please use Python 3 for this module: " + __name__) @@ -18,27 +19,29 @@ # A tuple of values that need to be parsed for each HSP VALS = ( - 'Hsp_bit-score', - 'Hsp_evalue', - 'Hsp_hit-from', - 'Hsp_hit-to', - 'Hsp_hit-frame', - 'Hsp_identity', - 'Hsp_align-len', - 'Hsp_qseq', - 'Hsp_hseq' + "Hsp_bit-score", + "Hsp_evalue", + "Hsp_hit-from", + "Hsp_hit-to", + "Hsp_hit-frame", + "Hsp_identity", + "Hsp_align-len", + "Hsp_qseq", + "Hsp_hseq", ) # The message found in a BLAST iteration if no hits were found -NO_HIT_MESSAGE = 'No hits found' +NO_HIT_MESSAGE = "No hits found" # The message if no definition line, used to decide if we use accessions instead of deflines -NO_DEF_MESSAGE = 'No definition line' +NO_DEF_MESSAGE = "No definition line" + # An error I probably overuse... class NoSNPError(Exception): """A SNP has not been found""" - def __init__(self, message): # Make this error infinitely more useful + + def __init__(self, message): # Make this error infinitely more useful super(self.__class__, self).__init__(message) self.message = str(message) @@ -59,10 +62,23 @@ class Hsp(object): Identity (int) Aligned length (int) SNP Position relative to subject (once calcualted with Hsp.get_snp_position()) - """ - - def __init__(self, chrom, name, evalue, qseq, hseq, hstart, hend, hstrand, bit_score, identity, aligned_length): - try: # Type checking + """ + + def __init__( + self, + chrom, + name, + evalue, + qseq, + hseq, + hstart, + hend, + hstrand, + bit_score, + identity, + aligned_length, + ): + try: # Type checking assert isinstance(chrom, str) assert isinstance(name, str) assert isinstance(evalue, float) @@ -76,23 +92,23 @@ def __init__(self, chrom, name, evalue, qseq, hseq, hstart, hend, hstrand, bit_s assert isinstance(aligned_length, int) except AssertionError: raise TypeError - try: # Value checking + try: # Value checking assert hstrand == 1 or hstrand == -1 assert identity <= aligned_length except AssertionError: raise ValueError # Hsp information - self._chrom = chrom # Hit_def/Hit_accession - self._name = name # Iteration_query-def - self._evalue = evalue # Hsp_evalue - self._query = qseq # Hsp_qseq - self._subject = hseq # Hsp_hseq - self._start = hstart # Hsp_hit-from - self._end = hend # Hsp_hit-to - self._hstrand = hstrand #Hsp_hit-strand - self._bits = bit_score # Hsp_bit-score - self._identity = identity # Hsp_identity - self._alength = aligned_length # Hsp_align-len + self._chrom = chrom # Hit_def/Hit_accession + self._name = name # Iteration_query-def + self._evalue = evalue # Hsp_evalue + self._query = qseq # Hsp_qseq + self._subject = hseq # Hsp_hseq + self._start = hstart # Hsp_hit-from + self._end = hend # Hsp_hit-to + self._hstrand = hstrand # Hsp_hit-strand + self._bits = bit_score # Hsp_bit-score + self._identity = identity # Hsp_identity + self._alength = aligned_length # Hsp_align-len # SNP information self._snp_pos = None self._snp = None @@ -101,11 +117,19 @@ def __repr__(self): return self._name + ":" + str(self._evalue) def __eq__(self, other): - if isinstance(other, Hsp): # Check equality with other Hsp objects - name_bool = self._name == other._name # Ensure our name is equal to the other name - eval_bool = self._evalue == other._evalue # Is our evalue equal to the other evalue - score_bool = self._bits == other._bits # Is our score equal to the other score - return name_bool and eval_bool and score_bool # Return whether all of these are true + if isinstance(other, Hsp): # Check equality with other Hsp objects + name_bool = ( + self._name == other._name + ) # Ensure our name is equal to the other name + eval_bool = ( + self._evalue == other._evalue + ) # Is our evalue equal to the other evalue + score_bool = ( + self._bits == other._bits + ) # Is our score equal to the other score + return ( + name_bool and eval_bool and score_bool + ) # Return whether all of these are true elif isinstance(other, str): return self._name == other elif isinstance(other, float): @@ -158,70 +182,85 @@ def get_rc(self): def get_snp(self): """Return the SNP associated with this Hsp""" if not self._snp: - raise NoSNPError('No SNP for me: ' + str(self)) + raise NoSNPError("No SNP for me: " + str(self)) return self._snp @overload def get_snp_position(self, query_snp, expected): """Calculate the SNP position relative to reference""" - try: # Type checking + try: # Type checking assert isinstance(query_snp, str) assert isinstance(expected, int) except AssertionError: - raise TypeError("'query_snp' must be of type 'str'; 'expected' must be of type 'int'") - try: # Value checking - assert len(query_snp) is 1 + raise TypeError( + "'query_snp' must be of type 'str'; 'expected' must be of type 'int'" + ) + try: # Value checking + assert len(query_snp) == 1 assert query_snp in snp.Lookup.IUPAC_CODES except AssertionError: - raise ValueError("'query_snp' must be a single character and in the set '%s'" % ', '.join(snp.Lookup.IUPAC_CODES)) + raise ValueError( + "'query_snp' must be a single character and in the set '%s'" + % ", ".join(snp.Lookup.IUPAC_CODES) + ) # If we don't find the SNP in this hsp if self._query.find(query_snp) < 0: - raise NoSNPError("No query found in " + str(self)) # Error out + raise NoSNPError("No query found in " + str(self)) # Error out # If the hsp sequence is less than our expected value or the location of the first query SNP in our sequence is less than our expected value and there is only one query SNP found - elif (len(self._query) < expected) or (self._query.find(query_snp) <= expected and self._query.count(query_snp) == 1): + elif (len(self._query) < expected) or ( + self._query.find(query_snp) <= expected + and self._query.count(query_snp) == 1 + ): # Do some guesswork as to where the SNP is self._snp_pos = self._query.find(query_snp) - if self.get_rc(): # If we're on the reverse strand - return self._start - self._snp_pos # Start is actually the end - else: # If we're on the forward strand - return self._start + self._snp_pos # Start is actually the start - else: # Otherwise, we either have our position or need to deal with indels + if self.get_rc(): # If we're on the reverse strand + return self._start - self._snp_pos # Start is actually the end + else: # If we're on the forward strand + return self._start + self._snp_pos # Start is actually the start + else: # Otherwise, we either have our position or need to deal with indels # Start with the expected position self._snp_pos = expected # Get a substring up to and including this position - qsubstr = self._query[:self._snp_pos + 1] + qsubstr = self._query[: self._snp_pos + 1] # While we haven't found our query SNP (query SNP should be at the end of the substring) while qsubstr[-1] != query_snp: - old = self._snp_pos # Old position - self._snp_pos = expected + qsubstr.count('-') # Add any deletions to our position - if old is self._snp_pos: # If we didn't change our position guess - raise NoSNPError("Cannot find the SNP in " + self) # Error out to avoid infinite loops - qsubstr = self._query[:self._snp_pos + 1] # Replace our substring + old = self._snp_pos # Old position + self._snp_pos = expected + qsubstr.count( + "-" + ) # Add any deletions to our position + if old is self._snp_pos: # If we didn't change our position guess + raise NoSNPError( + "Cannot find the SNP in " + self + ) # Error out to avoid infinite loops + qsubstr = self._query[: self._snp_pos + 1] # Replace our substring # Calculate insertions - hsubstr = self._subject[:self._snp_pos + 1] - insertions = hsubstr.count('-') - if self.get_rc(): # If we're on the reverse strand - return self._start - self._snp_pos - insertions # Start is actually the end - else: # If we're on the forward strand - return self._start + self._snp_pos - insertions # Start is actually the start + hsubstr = self._subject[: self._snp_pos + 1] + insertions = hsubstr.count("-") + if self.get_rc(): # If we're on the reverse strand + return ( + self._start - self._snp_pos - insertions + ) # Start is actually the end + else: # If we're on the forward strand + return ( + self._start + self._snp_pos - insertions + ) # Start is actually the start @get_snp_position.add def get_snp_position(self, lookup): """Calculate the SNP position relative to reference""" - try: # Type checking + try: # Type checking assert isinstance(lookup, snp.Lookup) except AssertionError: raise TypeError - try: # Value checking + try: # Value checking assert lookup.get_snpid() == self._name except AssertionError: raise ValueError iupac = lookup.get_sequence(iupac=True).upper() - query = self._query.replace('-', '').upper() - if len(query) >= lookup.get_length() or iupac.find(query) is 0: + query = self._query.replace("-", "").upper() + if len(query) >= lookup.get_length() or iupac.find(query) == 0: return self.get_snp_position( - query_snp=lookup.get_code(), - expected=lookup.get_forward_position() + query_snp=lookup.get_code(), expected=lookup.get_forward_position() ) elif len(query) < lookup.get_length(): expected = iupac.find(lookup.get_code(), lookup.get_forward_position()) @@ -229,11 +268,12 @@ def get_snp_position(self, lookup): if expected > aligned: adj = expected - aligned elif expected < aligned: - adj = lookup.get_forward_position() + (aligned - expected) + query.find(lookup.get_code()) - return self.get_snp_position( - query_snp=lookup.get_code(), - expected=adj - ) + adj = ( + lookup.get_forward_position() + + (aligned - expected) + + query.find(lookup.get_code()) + ) + return self.get_snp_position(query_snp=lookup.get_code(), expected=adj) else: raise ValueError("Something happened...") @@ -242,15 +282,19 @@ def get_subject_allele(self): try: assert self._snp_pos is not None ref = self._subject[self._snp_pos] - assert ref.upper() in 'ACGTN' + assert ref.upper() in "ACGTN" return ref except AssertionError: - raise NoSNPError('No reference allele for ' + str(self) + ', have you run Hsp.get_snp_position() yet?') + raise NoSNPError( + "No reference allele for " + + str(self) + + ", have you run Hsp.get_snp_position() yet?" + ) @overload def add_snp(self, this_snp): """Add a SNP to our Hsp""" - try: # Type checking + try: # Type checking assert isinstance(this_snp, snp.SNP) except AssertionError: raise TypeError @@ -259,19 +303,21 @@ def add_snp(self, this_snp): @add_snp.add def add_snp(self, lookup): """Add a SNP to our HSP""" - try: # Type checking + try: # Type checking assert isinstance(lookup, snp.Lookup) except AssertionError: raise TypeError("'lookup' must be of type 'snp.Lookup'") - try: # Value checking + try: # Value checking assert self._name == lookup.get_snpid() # s = snp.SNP(lookup=lookup, hsp=self) # self.add_snp(this_snp=s) self.add_snp(this_snp=snp.SNP(lookup=lookup, hsp=self)) except AssertionError: - raise ValueError("Lookup " + str(lookup) + " doesn't match with " + str(self)) + raise ValueError( + "Lookup " + str(lookup) + " doesn't match with " + str(self) + ) except: - print('hmm', self, file=sys.stderr) + print("hmm", self, file=sys.stderr) raise @@ -283,9 +329,11 @@ def get_value(tag, value): assert isinstance(value, str) return tag.findChild(value).text except AssertionError: - raise TypeError("'tag' must be of type 'element.tag'; 'value' must be of type 'str'") + raise TypeError( + "'tag' must be of type 'element.tag'; 'value' must be of type 'str'" + ) except AttributeError: - return '' + return "" except: raise @@ -293,39 +341,51 @@ def get_value(tag, value): # A function to parse the HSP section of a BLAST XML file def parse_hsp(hsp): """Parse the HSP section of a BLAST XML result""" - try: # Type checking + try: # Type checking assert isinstance(hsp, element.Tag) except AssertionError: raise TypeError("'hsp' must be of type 'element.Tag'") # Ensure that our HSP has at least one mismatch # if hsp.findChild('Hsp_midline').text.count(' ') < 1: - if get_value(tag=hsp, value='Hsp_midline').count(' ') < 1: - raise NoSNPError('No mismatches found for this Hsp') + if get_value(tag=hsp, value="Hsp_midline").count(" ") < 1: + raise NoSNPError("No mismatches found for this Hsp") return tuple(get_value(tag=hsp, value=val) for val in VALS) # A function to parse the Hit section of a BLAST XML file def parse_hit(snpid, hit): """Parse the Hit section of a BLAST XML result""" - try: # Type checking + try: # Type checking assert isinstance(snpid, str) assert isinstance(hit, element.Tag) except AssertionError: - raise TypeError("'snpid' must be of type 'str'; 'hit' must be of type 'element.Tag'") + raise TypeError( + "'snpid' must be of type 'str'; 'hit' must be of type 'element.Tag'" + ) # Get and check the chromosome information - chrom = get_value(tag=hit, value='Hit_def') + chrom = get_value(tag=hit, value="Hit_def") if chrom == NO_DEF_MESSAGE: - chrom = get_value(tag=hit, value='Hit_accession') - hsps = list() # Create a holding list - for hsp in hit.findAll('Hsp'): + chrom = get_value(tag=hit, value="Hit_accession") + hsps = list() # Create a holding list + for hsp in hit.findAll("Hsp"): try: hsp_vals = parse_hsp(hsp) except NoSNPError as nosnp: - evalue = get_value(tag=hsp, value='Hsp_evalue') - print(nosnp.message, snpid + ':' + evalue, file=sys.stderr) + evalue = get_value(tag=hsp, value="Hsp_evalue") + print(nosnp.message, snpid + ":" + evalue, file=sys.stderr) else: - (bit_score, evalue, hsp_start, hsp_end, strand, identity, align_length, query, reference) = hsp_vals # Unpack our tuple - hsp = Hsp( # Create an Hsp object + ( + bit_score, + evalue, + hsp_start, + hsp_end, + strand, + identity, + align_length, + query, + reference, + ) = hsp_vals # Unpack our tuple + hsp = Hsp( # Create an Hsp object chrom=chrom, name=snpid, evalue=float(evalue), @@ -336,9 +396,9 @@ def parse_hit(snpid, hit): hstrand=int(strand), bit_score=float(bit_score), identity=int(identity), - aligned_length=int(align_length) + aligned_length=int(align_length), ) - hsps.append(hsp) # Add to our list + hsps.append(hsp) # Add to our list if hsps: return hsps else: @@ -362,5 +422,7 @@ def rank_hsps(hsps): else: hsp_dict[h.get_name()] = [h] # Rank-remove our HSPs - hsps_ranked = {name : rank_remove(hsp_list, lowest=True) for name, hsp_list in hsp_dict.items()} + hsps_ranked = { + name: rank_remove(hsp_list, lowest=True) for name, hsp_list in hsp_dict.items() + } return hsps_ranked diff --git a/Objects/snp.py b/Objects/snp.py index e65ea58..c287b5e 100644 --- a/Objects/snp.py +++ b/Objects/snp.py @@ -2,14 +2,15 @@ """A module for SNP stuff""" import sys -if sys.version_info.major is not 3: + +if sys.version_info.major != 3: sys.exit("Please install Python 3 for this module: " + __name__) import re from . import blast -from . blast import NoSNPError -from . alignment import Alignment +from .blast import NoSNPError +from .alignment import Alignment try: from overload import overload @@ -41,21 +42,24 @@ class SNP(object): @staticmethod def reverse_complement(sequence): """Get the reverse complement of a nucleotide sequence""" - forward = 'ACGTNacgtn' - reverse = 'TGCANtgcan' - try: # Type checking + forward = "ACGTNacgtn" + reverse = "TGCANtgcan" + try: # Type checking assert isinstance(sequence, (str, list, tuple)) if isinstance(sequence, (list, tuple)): for base in sequence: assert isinstance(base, str) - assert len(base) is 1 + assert len(base) == 1 for base in sequence: assert base in forward or base in Lookup.IUPAC_CODES except AssertionError: - raise TypeError("'sequence' must be of type 'str' or be a list or tuple of single-character 'str' objects within '%s' or 'RYSWKM'" % forward) + raise TypeError( + "'sequence' must be of type 'str' or be a list or tuple of single-character 'str' objects within '%s' or 'RYSWKM'" + % forward + ) else: rc_table = str.maketrans(forward, reverse) - return ''.join(tuple(base.translate(rc_table) for base in sequence)) + return "".join(tuple(base.translate(rc_table) for base in sequence)) @overload def __init__(self, lookup): @@ -84,10 +88,12 @@ def __init__(self, lookup, alignment, reference): raise NoMatchError # The contig is found in the alignment self._contig = alignment.get_contig() - self._flags.append('S') + self._flags.append("S") # Get the rest of the information - self._calculate_position(lookup, alignment) # True SNP position - self._find_states(lookup, alignment, reference) # Reference and alternate states + self._calculate_position(lookup, alignment) # True SNP position + self._find_states( + lookup, alignment, reference + ) # Reference and alternate states @__init__.add def __init__(self, lookup, hsp): @@ -101,20 +107,28 @@ def __init__(self, lookup, hsp): # The contig and SNP position are found in the hsp self._contig = hsp.get_chrom() self._position = hsp.get_snp_position(lookup=lookup) - self._flags.append('B') + self._flags.append("B") # Get the rest of the information - self._find_states(lookup, hsp) # Reference and alternate states + self._find_states(lookup, hsp) # Reference and alternate states def __repr__(self): return self._snpid def __eq__(self, other): if isinstance(other, SNP): - return self._snpid == other._snpid and self._contig == other._contig and self._position == other._position + return ( + self._snpid == other._snpid + and self._contig == other._contig + and self._position == other._position + ) elif isinstance(other, Map): return self._snpid == other.get_name() and self._contig == other.get_chrom() elif isinstance(other, Lookup): - return self._snpid == other.get_snpid() and self._alternate == other.get_alternate(self._reference) and self._alternate != 'N' + return ( + self._snpid == other.get_snpid() + and self._alternate == other.get_alternate(self._reference) + and self._alternate != "N" + ) elif isinstance(other, str): return self._snpid == other elif isinstance(other, int): @@ -179,49 +193,75 @@ def __rtruediv__(self, other): def _calculate_position(self, lookup, alignment): """Calculate the position of the SNP in the reference sequence""" - index = 0 # Index of our split CIGAR string - if alignment.get_rc(): # If we're reverse complementing - qpos = lookup.get_reverse_position() - 1 # Start with the reverse position of the SNP, must subtract one - else: # Otherwise - qpos = lookup.get_forward_position() # Start with the forward posittion - while True: # Endless loop to do weird things... - try: # While we have a CIGAR string to parse - old = qpos # Store our previously calculated SNP position + index = 0 # Index of our split CIGAR string + if alignment.get_rc(): # If we're reverse complementing + qpos = ( + lookup.get_reverse_position() - 1 + ) # Start with the reverse position of the SNP, must subtract one + else: # Otherwise + qpos = lookup.get_forward_position() # Start with the forward posittion + while True: # Endless loop to do weird things... + try: # While we have a CIGAR string to parse + old = qpos # Store our previously calculated SNP position # Seach the CIGAR string as a list, starting with index 0, for indels - if re.search('M', alignment.get_cigar()[index]): # If we have a perfect match - if qpos < int(''.join(re.findall(r'\d+', alignment.get_cigar()[index]))): # If our SNP is in the perfect match - break # Exit the loop, we have our position - if re.search('D', alignment.get_cigar()[index]): # If we have a deletion relative to reference - qpos += int(''.join(re.findall(r'\d+', alignment.get_cigar()[index]))) # Add the deletion to our SNP position - if re.search('[IS]', alignment.get_cigar()[index]): # If we have an insertion relative to reference - qpos -= int(''.join(re.findall(r'\d+', alignment.get_cigar()[index]))) # Subtract the insertion from our SNP postion - index += 1 # Increase the index - if qpos <= 0 or qpos >= lookup.get_length(): # If we've gone beyond the scope of our lookup: 0 is before the sequence, lookup.get_length() is after - qpos = old # Go back to our previously calculated SNP postion - break # Exit the loop, we have our position - except IndexError: # If we run out of CIGAR string codes - break # Exit the loop, we have our position - self._position = alignment.get_position() + qpos # Our SNP position is at the mapping position plus the SNP position + if re.search( + "M", alignment.get_cigar()[index] + ): # If we have a perfect match + if qpos < int( + "".join(re.findall(r"\d+", alignment.get_cigar()[index])) + ): # If our SNP is in the perfect match + break # Exit the loop, we have our position + if re.search( + "D", alignment.get_cigar()[index] + ): # If we have a deletion relative to reference + qpos += int( + "".join(re.findall(r"\d+", alignment.get_cigar()[index])) + ) # Add the deletion to our SNP position + if re.search( + "[IS]", alignment.get_cigar()[index] + ): # If we have an insertion relative to reference + qpos -= int( + "".join(re.findall(r"\d+", alignment.get_cigar()[index])) + ) # Subtract the insertion from our SNP postion + index += 1 # Increase the index + if ( + qpos <= 0 or qpos >= lookup.get_length() + ): # If we've gone beyond the scope of our lookup: 0 is before the sequence, lookup.get_length() is after + qpos = old # Go back to our previously calculated SNP postion + break # Exit the loop, we have our position + except IndexError: # If we run out of CIGAR string codes + break # Exit the loop, we have our position + self._position = ( + alignment.get_position() + qpos + ) # Our SNP position is at the mapping position plus the SNP position @overload def _find_states(self, lookup, alignment, reference): """Get the reference and alternate alleles""" # Get the reference allele, given our contig and position found above - self._reference = reference[self._contig][self._position - 1] # Subtract one as FASTA is 1-based and Python is 0-based - if alignment.get_rc(): # If we're reverse complement + self._reference = reference[self._contig][ + self._position - 1 + ] # Subtract one as FASTA is 1-based and Python is 0-based + if alignment.get_rc(): # If we're reverse complement alt = lookup.get_alternate(self.reverse_complement(self._reference)) self._alternate = self.reverse_complement(alt) else: - self._alternate = lookup.get_alternate(self._reference) # An 'N' will be returned if the reference allele doesn't match with our IUPAC code + self._alternate = lookup.get_alternate( + self._reference + ) # An 'N' will be returned if the reference allele doesn't match with our IUPAC code @_find_states.add def _find_states(self, lookup, hsp): """Get the reference and alternate alleles""" - try: # Type checking + try: # Type checking assert isinstance(lookup, Lookup) assert isinstance(hsp, blast.Hsp) - self._reference = hsp.get_subject_allele() # Get the reference allele from the Hit - self._alternate = lookup.get_alternate(self._reference) # Get the alternate from the Lookup + self._reference = ( + hsp.get_subject_allele() + ) # Get the reference allele from the Hit + self._alternate = lookup.get_alternate( + self._reference + ) # Get the alternate from the Lookup if hsp.get_rc(): self._reference = self.reverse_complement(self._reference) self._alternate = self.reverse_complement(self._alternate) @@ -246,7 +286,7 @@ def get_position(self): def check_masked(self): """Check to see if our alternate allele is masked""" - return self._alternate == 'N' + return self._alternate == "N" def add_flag(self, flag): """Add a flag to this SNP for VCF output""" @@ -271,7 +311,7 @@ def add_info(self, key, value): def format_vcf(self): """Format the information in VCF style""" # Create a list of information for a VCF file - info = [key + '=' + ','.join(value) for key, value in self._info.items()] + info = [key + "=" + ",".join(value) for key, value in self._info.items()] info.extend(self._flags) vcf_line = [ self._contig, @@ -279,11 +319,11 @@ def format_vcf(self): self._snpid, self._reference, self._alternate, - '.', - '.', - ';'.join(info) - ] - return '\t'.join(vcf_line) # Join everything together with a tab + ".", + ".", + ";".join(info), + ] + return "\t".join(vcf_line) # Join everything together with a tab # A class definition for a lookup sequence in Illumina format @@ -299,19 +339,12 @@ class Lookup(object): """ # A dictionary of IUPAC codes for SNPs - IUPAC_CODES = { - 'R' : 'AG', - 'Y' : 'CT', - 'S' : 'CG', - 'W' : 'AT', - 'K' : 'GT', - 'M' : 'AC' - } + IUPAC_CODES = {"R": "AG", "Y": "CT", "S": "CG", "W": "AT", "K": "GT", "M": "AC"} def __init__(self, snpid, sequence): # We're given the SNP ID and sequence when making the object, everything else # can be made with _capture_snp() and _find_iupac() - try: # Type checking + try: # Type checking assert isinstance(snpid, str) assert isinstance(sequence, str) except AssertionError: @@ -328,13 +361,13 @@ def __init__(self, snpid, sequence): self._find_iupac() def __repr__(self): - return self._snpid + ':' + self._code + return self._snpid + ":" + self._code def __eq__(self, other): if isinstance(other, Lookup): return self._snp == other._snpid and self._code == other._code elif isinstance(other, str): - if len(other) is 1: + if len(other) == 1: return self._code == other.upper() elif len(other) > 1: return self._snpid == other @@ -346,20 +379,24 @@ def __eq__(self, other): def _capture_snp(self): """Capture the SNP and it's position from the start and end of the sequence""" # Get the forward position - self._forward_position = self._sequence.find('[') + self._forward_position = self._sequence.find("[") # Get the reverse position - self._reverse_position = len(self._sequence) - self._sequence.find(']') + self._reverse_position = len(self._sequence) - self._sequence.find("]") # Get the SNP - self._snp = self._sequence[self._forward_position:self._sequence.find(']') + 1] + self._snp = self._sequence[ + self._forward_position : self._sequence.find("]") + 1 + ] def _find_iupac(self): """Create an IUPAC version of the sequence and calculate it's length""" # Create a string of the two states of the SNP in alphabetical order - ordered_snp = ''.join(sorted(re.findall('[ACGT]', self._snp))) + ordered_snp = "".join(sorted(re.findall("[ACGT]", self._snp))) # Find the IUPAC code for the SNP - self._code = ''.join([c for c, o in self.IUPAC_CODES.items() if ordered_snp == o]) + self._code = "".join( + [c for c, o in self.IUPAC_CODES.items() if ordered_snp == o] + ) # Create the IUPAC version of the sequence - self._iupac = re.sub(r'\[%s\]' % self._snp[1:-1], self._code, self._sequence) + self._iupac = re.sub(r"\[%s\]" % self._snp[1:-1], self._code, self._sequence) def get_snpid(self): """Get the SNP ID""" @@ -391,21 +428,24 @@ def get_sequence(self, iupac=False): # Search the IUPAC codes for an alternate allele of a SNP def get_alternate(self, reference): """Get the alternate allele given an IUPAC code and reference allele""" - ref = re.compile(u'(%s)' % reference) # Regex to ensure that our found reference allele is covered by the IUPAC code - alt = re.compile(u'([^%s])' % reference) # Regex to find the alternate allele - if ref.search(self.IUPAC_CODES[self._code]): # If our reference allele is plausible given our IUPCA code - alternate = alt.search(self.IUPAC_CODES[self._code]).group() # Get the alternate + ref = re.compile( + "(%s)" % reference + ) # Regex to ensure that our found reference allele is covered by the IUPAC code + alt = re.compile("([^%s])" % reference) # Regex to find the alternate allele + if ref.search( + self.IUPAC_CODES[self._code] + ): # If our reference allele is plausible given our IUPCA code + alternate = alt.search( + self.IUPAC_CODES[self._code] + ).group() # Get the alternate return alternate - else: # Otherwise, give an 'N' - return 'N' + else: # Otherwise, give an 'N' + return "N" def format_fasta(self): """Return this lookup in FASTA format""" - fasta = [ - '>' + self._snpid, - self._iupac - ] - return '\n'.join(fasta) + fasta = [">" + self._snpid, self._iupac] + return "\n".join(fasta) # A class for a Plink map file @@ -468,20 +508,17 @@ def format_map(self, map_distance=None, physical_position=None): """Create a map file for this Map, will update the Map with new distances""" try: assert isinstance(map_distance, float) or isinstance(map_distance, None) - assert isinstance(physical_position, int) or isinstance(physical_position, None) + assert isinstance(physical_position, int) or isinstance( + physical_position, None + ) except AssertionError: raise TypeError if map_distance is not None: self.update_distance(map_distance) if physical_position is not None: self.update_position(physical_position) - map_file = [ - self._chrom, - self._name, - str(self._mapd), - str(self._pos) - ] - return '\t'.join(map_file) + map_file = [self._chrom, self._name, str(self._mapd), str(self._pos)] + return "\t".join(map_file) def update_distance(self, map_distance): """Update the genetic map distance for this Map""" @@ -509,7 +546,7 @@ def read_map(mapfile): raise TypeError map_dict = dict() try: - with open(mapfile, 'r') as mf: + with open(mapfile, "r") as mf: print("Using genetic map", mapfile, file=sys.stderr) for line in mf: split = line.strip().split() @@ -517,7 +554,7 @@ def read_map(mapfile): chrom=split[0], name=split[1], map_distance=float(split[2]), - physical_position=int(split[3]) + physical_position=int(split[3]), ) map_dict[m.get_name()] = m except IndexError: @@ -528,7 +565,7 @@ def read_map(mapfile): # Filter snps using a genetic map -def filter_snps(snp_list, genetic_map, altchr='ALTCHR', altpos='ALTPOS'): +def filter_snps(snp_list, genetic_map, altchr="ALTCHR", altpos="ALTPOS"): """Filter a list or tuple of potential SNPs for the same marker by genetic map chromosome""" try: assert isinstance(snp_list, (list, tuple)) @@ -541,7 +578,7 @@ def filter_snps(snp_list, genetic_map, altchr='ALTCHR', altpos='ALTPOS'): raise TypeError try: snpids = {snp.get_snpid() for snp in snp_list} - assert len(snpids) is 1 + assert len(snpids) == 1 assert list(snpids)[0] == genetic_map except AssertionError: raise ValueError diff --git a/Objects/wrappers.py b/Objects/wrappers.py index 6f0c61f..9e836db 100644 --- a/Objects/wrappers.py +++ b/Objects/wrappers.py @@ -3,7 +3,8 @@ """Wrapper for various NCBI Blast+ applications not provided by BioPython""" import sys -if sys.version_info.major is not 3: + +if sys.version_info.major != 3: (sys.exit("Please use Python 3 for this modul, " + __name__)) @@ -21,57 +22,59 @@ class NcbiblastdbcmdCommandline(AbstractCommandline): # Possible output formats _OUTFMTS = OrderedDict( - [('%f', "Sequence in FASTA format"), - ('%s', "Sequence data without defline"), - ('%a', "Accession"), - ('%g', "GI"), - ('%o', "Original ID (OID)"), - ('%t', "Sequence title"), - ('%l', "Sequence length"), - ('%h', "Sequence hash value"), - ('%T', "Taxid"), - ('%X', "Leaf-node taxids"), - ('%e', "Memebership integer"), - ('%L', "Common taxonomic name"), - ('%C', "Common taxonomic name for leaf-node taxids"), - ('%S', "Scientific name"), - ('%N', "Scientific name for leaf-node taxids"), - ('%B', "BLAST name"), - ('%K', "Taxonomic super kingdom"), - ('%P', "PIG"), - ('%m', "Sequence masking data")] + [ + ("%f", "Sequence in FASTA format"), + ("%s", "Sequence data without defline"), + ("%a", "Accession"), + ("%g", "GI"), + ("%o", "Original ID (OID)"), + ("%t", "Sequence title"), + ("%l", "Sequence length"), + ("%h", "Sequence hash value"), + ("%T", "Taxid"), + ("%X", "Leaf-node taxids"), + ("%e", "Memebership integer"), + ("%L", "Common taxonomic name"), + ("%C", "Common taxonomic name for leaf-node taxids"), + ("%S", "Scientific name"), + ("%N", "Scientific name for leaf-node taxids"), + ("%B", "BLAST name"), + ("%K", "Taxonomic super kingdom"), + ("%P", "PIG"), + ("%m", "Sequence masking data"), + ] ) - def __init__(self, cmd='blastdbcmd', **kwargs): - if cmd is not 'blastdbcmd': + def __init__(self, cmd="blastdbcmd", **kwargs): + if cmd != "blastdbcmd": raise ValueError("This wrapper is specific for 'blastdbcmd'") outfmtmsg = "Output format where:\n" for item in self._OUTFMTS.items(): - outfmtmsg += '\t' + ' means '.join(item) + '\n' + outfmtmsg += "\t" + " means ".join(item) + "\n" self.parameters = [ _Option( - names=['-db', 'db'], + names=["-db", "db"], description="BLAST database name", is_required=True, filename=True, - equate=False + equate=False, ), _Option( - names=['-dbtype', 'dbtype'], + names=["-dbtype", "dbtype"], description="Molecule type stored in BLAST database", - checker_function=lambda value: value in ['guess', 'nucl', 'prot'], - equate=False + checker_function=lambda value: value in ["guess", "nucl", "prot"], + equate=False, ), _Option( - names=['-entry', 'entry'], + names=["-entry", "entry"], description="Search string for sequence identifier", - equate=False + equate=False, ), _Option( - names=['-outfmt', 'outfmt'], + names=["-outfmt", "outfmt"], description=outfmtmsg, checker_function=lambda value: value in self._OUTFMTS, - equate=False - ) + equate=False, + ), ] super(self.__class__, self).__init__(cmd, **kwargs) diff --git a/Utilities/arguments.py b/Utilities/arguments.py index 14411d6..b912125 100644 --- a/Utilities/arguments.py +++ b/Utilities/arguments.py @@ -3,14 +3,15 @@ """Create an argument parser for SNP Utils""" import sys -if sys.version_info.major is not 3: + +if sys.version_info.major != 3: sys.exit("Please use Python 3 for this module") import os import argparse # Three constants -BLAST_CONFIG = os.getcwd() + '/blast_config.ini' +BLAST_CONFIG = os.getcwd() + "/blast_config.ini" LOOKUP_TABLE = """\ Specify the path for lookup table of SNP contextual sequences. This table should be a two-column tab-delimeted table where the @@ -20,7 +21,8 @@ SNP1 ACGTCGACAGTCAGA[G/A]CGACGTTCAAGGCTCA SNP2 TGCAGACCGTTGCAC[A/T]TGCCGATGCGATGACC """ -FILTER_OPTS = ('bychrom', 'bydistance') +FILTER_OPTS = ("bychrom", "bydistance") + # A function to validate filter arguments def validate_filters(args, parser): @@ -31,16 +33,18 @@ def validate_filters(args, parser): except AssertionError: raise TypeError try: - assert 'bychrom' in args - assert 'bydistance' in args + assert "bychrom" in args + assert "bydistance" in args except AssertionError: raise ValueError (bychrom, bydistance) = FILTER_OPTS if args[bychrom] or args[bydistance]: try: - assert 'map' in args + assert "map" in args except AssertionError: - parser.error(message="Filtering by chromosome/contig and/or genetic map distance requires '-m | --genetic-map'") + parser.error( + message="Filtering by chromosome/contig and/or genetic map distance requires '-m | --genetic-map'" + ) # A function to check that a value is a positive integer @@ -62,48 +66,48 @@ def _filter_parser(parser): raise TypeError (bychrom, bydistance) = FILTER_OPTS filters = parser.add_argument_group( - title='Filtering options', - description="Choose some optional filtering options for deduplicating the final SNPs" + title="Filtering options", + description="Choose some optional filtering options for deduplicating the final SNPs", ) filters.add_argument( - '-m', - '--genetic-map', - dest='map', + "-m", + "--genetic-map", + dest="map", type=str, default=None, required=False, - metavar='GENETIC MAP', - help="Genetic map in Plink 1.9 MAP format, used in filtering SNPs on different chromosomes/contigs" + metavar="GENETIC MAP", + help="Genetic map in Plink 1.9 MAP format, used in filtering SNPs on different chromosomes/contigs", ) filters.add_argument( - '-b', - '--by-chrom', + "-b", + "--by-chrom", dest=str(bychrom), - action='store_const', + action="store_const", const=True, default=False, required=False, - help="Do we filter potential SNPs by chromosome/contig found in a genetic map? Requires '-m | --genetic-map'" + help="Do we filter potential SNPs by chromosome/contig found in a genetic map? Requires '-m | --genetic-map'", ) filters.add_argument( - '-d', - '--by-distance', + "-d", + "--by-distance", dest=str(bydistance), - action='store_const', + action="store_const", const=True, default=False, required=False, - help="Do we filter potential SNPs by proportional distance along a chromosome/contig as given by the genetic map? Requires '-m | --genetic-map'" + help="Do we filter potential SNPs by proportional distance along a chromosome/contig as given by the genetic map? Requires '-m | --genetic-map'", ) filters.add_argument( - '-t', - '--threshold', - dest='threshold', + "-t", + "--threshold", + dest="threshold", type=_positive_integer, default=None, required=False, - metavar='DISTANCE THRESHOLD', - help="Set a minimum distance for two potential SNPs on the same chromosome/contig. SNPs for the same marker within this distance will be combined into the leftmost position" + metavar="DISTANCE THRESHOLD", + help="Set a minimum distance for two potential SNPs on the same chromosome/contig. SNPs for the same marker within this distance will be combined into the leftmost position", ) @@ -113,19 +117,16 @@ def _lookup_parser(parser): assert isinstance(parser, argparse.ArgumentParser) except AssertionError: raise TypeError - lookup = parser.add_argument_group( - title='Lookup table', - description=LOOKUP_TABLE - ) + lookup = parser.add_argument_group(title="Lookup table", description=LOOKUP_TABLE) lookup.add_argument( - '-l', - '--lookup', - dest='lookup', + "-l", + "--lookup", + dest="lookup", type=str, default=None, required=True, - metavar='ILLUMINA LOOKUP TABLE', - help="SNP lookup table in Illumina format" + metavar="ILLUMINA LOOKUP TABLE", + help="SNP lookup table in Illumina format", ) @@ -133,203 +134,200 @@ def _lookup_parser(parser): def make_argument_parser(): """Create an argument parser for SNP Utils""" parser = argparse.ArgumentParser( - add_help=True, - formatter_class=argparse.RawTextHelpFormatter, - prog=sys.argv[0] + add_help=True, formatter_class=argparse.RawTextHelpFormatter, prog=sys.argv[0] ) # Subparser for BLAST- vs Alignment-based SNP prediction subparsers = parser.add_subparsers( - title='Subroutine', - description='Choose a subroutine', - dest='method', - help="'BLAST' means we are running a BLAST search to find SNPs, 'SAM' means we are using a SAM file, 'CONFIG' is used to configure a BLAST search" + title="Subroutine", + description="Choose a subroutine", + dest="method", + help="'BLAST' means we are running a BLAST search to find SNPs, 'SAM' means we are using a SAM file, 'CONFIG' is used to configure a BLAST search", ) # Config subparser - config = subparsers.add_parser('CONFIG') + config = subparsers.add_parser("CONFIG") ref_opts = config.add_argument_group( - title='Reference format', - description="Choose whether the reference is a sequence or BLAST database" + title="Reference format", + description="Choose whether the reference is a sequence or BLAST database", ) ref_db = ref_opts.add_mutually_exclusive_group(required=True) ref_db.add_argument( - '-r', - '--reference', - dest='subject', + "-r", + "--reference", + dest="subject", type=str, default=None, - metavar='REFERNCE SEQUENCE', - help="Reference sequence in FASTA format, incompatible with '-d | --database'" + metavar="REFERNCE SEQUENCE", + help="Reference sequence in FASTA format, incompatible with '-d | --database'", ) ref_db.add_argument( - '-d', - '--database', - dest='database', + "-d", + "--database", + dest="database", type=str, default=None, - metavar='REFERENCE DATABASE', - help="Reference BLAST nucleotide database, incompatible with '-r | --reference'" + metavar="REFERENCE DATABASE", + help="Reference BLAST nucleotide database, incompatible with '-r | --reference'", ) blast_opts = config.add_argument_group( - title='BLAST Options', - description="Options for configuring your BLASTing experience" + title="BLAST Options", + description="Options for configuring your BLASTing experience", ) blast_opts.add_argument( - '-e', - '--evalue', - dest='evalue', + "-e", + "--evalue", + dest="evalue", type=float, default=1e-1, required=False, - metavar='E-VALUE', - help="E-value threshold, defaults to '1e-1'" + metavar="E-VALUE", + help="E-value threshold, defaults to '1e-1'", ) blast_opts.add_argument( - '-s', - '--max-hits', - dest='max_hits', + "-s", + "--max-hits", + dest="max_hits", type=int, default=3, required=False, - metavar='MAX HITS', - help="Max hits to keep per query, defaults to '3'" + metavar="MAX HITS", + help="Max hits to keep per query, defaults to '3'", ) blast_opts.add_argument( - '-m', - '--max-hsps', - dest='max_hsps', + "-m", + "--max-hsps", + dest="max_hsps", type=int, default=3, required=False, - metavar='MAX HSPS', - help="Max hsps to keep per hit, defaults to '3'" + metavar="MAX HSPS", + help="Max hsps to keep per hit, defaults to '3'", ) blast_opts.add_argument( - '-i', - '--identity', - dest='identity', + "-i", + "--identity", + dest="identity", type=float, default=99, required=False, - metavar='PERCENT IDENTITY', - help="Percent identity to match, defaults to '99'" + metavar="PERCENT IDENTITY", + help="Percent identity to match, defaults to '99'", ) blast_opts.add_argument( - '-k', - '--keep-query', - dest='keep_query', - action='store_const', + "-k", + "--keep-query", + dest="keep_query", + action="store_const", const=True, default=False, required=False, - help="Do we keep the query FASTA file? Pass '-k | --keep-query' to say yes" + help="Do we keep the query FASTA file? Pass '-k | --keep-query' to say yes", ) config.add_argument( - '-c', - '--config', - dest='outfile', + "-c", + "--config", + dest="outfile", default=BLAST_CONFIG, required=False, - metavar='BLAST CONFIG FILE', - help="Name of BLAST config file to write to, defaults to '" + BLAST_CONFIG + "'" + metavar="BLAST CONFIG FILE", + help="Name of BLAST config file to write to, defaults to '" + + BLAST_CONFIG + + "'", ) # BLAST subparser blast = subparsers.add_parser( - 'BLAST', - formatter_class=argparse.RawDescriptionHelpFormatter + "BLAST", formatter_class=argparse.RawDescriptionHelpFormatter ) _lookup_parser(blast) blast_in = blast.add_argument_group( - title='Input options', - description="Choose to input a BLAST config or XML file" + title="Input options", description="Choose to input a BLAST config or XML file" ) blast_modes = blast_in.add_mutually_exclusive_group(required=True) blast_modes.add_argument( - '-c', - '--config', - dest='config', + "-c", + "--config", + dest="config", type=str, default=None, - metavar='BLAST CONFIG FILE', - help="Configuration file for running BLAST, incompatible with '-x | --xml'" + metavar="BLAST CONFIG FILE", + help="Configuration file for running BLAST, incompatible with '-x | --xml'", ) blast_modes.add_argument( - '-x', - '--xml', - dest='xml', - type=argparse.FileType('r'), + "-x", + "--xml", + dest="xml", + type=argparse.FileType("r"), default=None, required=False, - metavar='BLAST XML RESULTS', - help="Optional BLAST XML results already found, incompatible with '-c | --config'" + metavar="BLAST XML RESULTS", + help="Optional BLAST XML results already found, incompatible with '-c | --config'", ) _filter_parser(blast) blast_out = blast.add_argument_group( - title='Output options', - description="Control the output of " + os.path.basename(sys.argv[0]) + title="Output options", + description="Control the output of " + os.path.basename(sys.argv[0]), ) blast_out.add_argument( - '-o', - '--outname', - dest='outname', + "-o", + "--outname", + dest="outname", type=str, - default='output', + default="output", required=False, - metavar='OUTPUT NAME', - help="Name of output file, without suffix. Defaults to 'output'" + metavar="OUTPUT NAME", + help="Name of output file, without suffix. Defaults to 'output'", ) blast_out.add_argument( - '-r', - '--rank-snps', - dest='rank', - action='store_const', + "-r", + "--rank-snps", + dest="rank", + action="store_const", const=True, default=False, required=False, - help="Do we rank the SNPs and take the lowest e-value/highest scoring BLAST results? Pass '-r | --rank-snps' to say yes" + help="Do we rank the SNPs and take the lowest e-value/highest scoring BLAST results? Pass '-r | --rank-snps' to say yes", ) # Alignment subparser sam = subparsers.add_parser( - 'SAM', - formatter_class=argparse.RawDescriptionHelpFormatter + "SAM", formatter_class=argparse.RawDescriptionHelpFormatter ) _lookup_parser(sam) sam_input = sam.add_argument_group( - title='Input options', - description="Choose a SAM file and reference FASTA file to find SNPs" + title="Input options", + description="Choose a SAM file and reference FASTA file to find SNPs", ) sam_input.add_argument( - '-s', - '--sam-file', - dest='samfile', + "-s", + "--sam-file", + dest="samfile", type=str, default=None, required=True, - metavar='SAM FILE', - help="SAM File with aligned SNP contextual sequences" + metavar="SAM FILE", + help="SAM File with aligned SNP contextual sequences", ) sam_input.add_argument( - '-r', - '--reference', - dest='reference', + "-r", + "--reference", + dest="reference", type=str, default=None, required=True, - metavar='REFERENCE SEQUENCE', - help="Reference sequence in FASTA format" + metavar="REFERENCE SEQUENCE", + help="Reference sequence in FASTA format", ) _filter_parser(sam) sam_out = sam.add_argument_group( - title='Output options', - description="Control the output of " + os.path.basename(sys.argv[0]) + title="Output options", + description="Control the output of " + os.path.basename(sys.argv[0]), ) sam_out.add_argument( - '-o', - '--outname', - dest='outname', + "-o", + "--outname", + dest="outname", type=str, - default='output', + default="output", required=False, - metavar='OUTPUT NAME', - help="Name of output file, without suffix. Defaults to 'output'" + metavar="OUTPUT NAME", + help="Name of output file, without suffix. Defaults to 'output'", ) return parser diff --git a/Utilities/filters.py b/Utilities/filters.py index 3006908..f30006a 100644 --- a/Utilities/filters.py +++ b/Utilities/filters.py @@ -3,7 +3,8 @@ """This module contains various functions to filter potential SNPs in hopes of removing duplicate potential SNPs""" import sys -if sys.version_info.major is not 3: + +if sys.version_info.major != 3: sys.exit("Please use Python 3 for this module: " + __name__) @@ -11,7 +12,7 @@ from Objects.snp import SNP, read_map, filter_snps from Objects.wrappers import NcbiblastdbcmdCommandline -from . utilities import INFO, closest_value +from .utilities import INFO, closest_value try: from Bio import SeqIO @@ -20,19 +21,17 @@ sys.exit("Please install " + error.name) # Two INFO objects for alternate SNP positions information -ALTCHR = INFO( # Create an INFO object for alternate chromosomes - infoid='ALTCHR', - number='.', - infotype='String', - description='Alternate chromosomal positions' +ALTCHR = INFO( # Create an INFO object for alternate chromosomes + infoid="ALTCHR", + number=".", + infotype="String", + description="Alternate chromosomal positions", ) -ALTPOS = INFO( # Create an info object for alternate positions - infoid='ALTPOS', - number='.', - infotype='Integer', - description='Alternate positions' +ALTPOS = INFO( # Create an info object for alternate positions + infoid="ALTPOS", number=".", infotype="Integer", description="Alternate positions" ) + # A function to sort a list of SNPs by ID and optionally chromosome/contig def _sort_snps(snplist, bychrom=False): """Sort SNPs by SNP ID and, optionally, chromosome/contig""" @@ -60,8 +59,8 @@ def _sort_snps(snplist, bychrom=False): def chrom_filter(args, vcfinfo, propersnps): """Filter potential SNPs by chromosomes/contigs using a genetic map""" # Create holding collections - mapped_snps = list() # List of mapped SNPs - try: # Type checking + mapped_snps = list() # List of mapped SNPs + try: # Type checking assert isinstance(args, dict) assert isinstance(vcfinfo, list) for info in vcfinfo: @@ -73,17 +72,17 @@ def chrom_filter(args, vcfinfo, propersnps): raise TypeError except: raise - try: # Value checking - assert 'map' in args.keys() + try: # Value checking + assert "map" in args.keys() except AssertionError: raise ValueError print("Filtering SNPs by hit chromsome/contig", file=sys.stderr) # Add our VCF header - infoheader = set(deepcopy(vcfinfo)) # NO SIDE EFFECTS + infoheader = set(deepcopy(vcfinfo)) # NO SIDE EFFECTS infoheader.add(ALTCHR) infoheader.add(ALTPOS) # Open our map file - map_dict = read_map(args['map']) + map_dict = read_map(args["map"]) # Create our list of mapped snps for snpid, snplist in snp_dict.items(): # Only filter by map if there are multiple SNPs for the same marker and we have a map position for the marker @@ -92,9 +91,9 @@ def chrom_filter(args, vcfinfo, propersnps): snp_list=snplist, genetic_map=map_dict[snpid], altchr=ALTCHR.get_id(), - altpos=ALTPOS.get_id() + altpos=ALTPOS.get_id(), ) - else: # Otherwise, add the SNP list to our list of mapped SNPs + else: # Otherwise, add the SNP list to our list of mapped SNPs mapped_snps += snplist return (mapped_snps, list(infoheader)) @@ -103,9 +102,9 @@ def chrom_filter(args, vcfinfo, propersnps): def distance_filter(args, vcfinfo, propersnps): """Filter potential SNPs by a distance threshold""" # Create holding collections - filtered_snps = list() # List of filtered SNPs + filtered_snps = list() # List of filtered SNPs # mapped_snps = list() # List of mapped SNPs - try: # Type checking + try: # Type checking assert isinstance(args, dict) assert isinstance(vcfinfo, list) for info in vcfinfo: @@ -117,37 +116,45 @@ def distance_filter(args, vcfinfo, propersnps): raise TypeError except: raise - try: # Value checking - assert 'threshold' in args.keys() - assert isinstance(args['threshold'], int) and args['threshold'] >= 0 + try: # Value checking + assert "threshold" in args.keys() + assert isinstance(args["threshold"], int) and args["threshold"] >= 0 except AssertionError: raise ValueError - print("Filtering SNPs with a minimum distance threshold of", args['threshold'], file=sys.stderr) + print( + "Filtering SNPs with a minimum distance threshold of", + args["threshold"], + file=sys.stderr, + ) # Add our VCF header - infoheader = set(deepcopy(vcfinfo)) # NO SIDE EFFECTS + infoheader = set(deepcopy(vcfinfo)) # NO SIDE EFFECTS infoheader.add(ALTCHR) infoheader.add(ALTPOS) # Start filtering by distance threshold for snplist in snp_dict.values(): # Sort our SNP list (SNPs have a __lt__ attribute based on position so this works as is) sorted_snps = sorted(snplist) - left_snps = [sorted_snps.pop(0)] # First SNP in sorted list is the leftmost SNP - while sorted_snps: # The while loop stops when we're done with our list of sorted SNPs - left_index = len(left_snps) - 1 # Take the furthest right of our left SNPs - next_snp = sorted_snps.pop(0) # Get the next SNP - if next_snp - left_snps[left_index] < args['threshold']: # If the distance between the two SNPs are less than our minimum distance threshold + left_snps = [sorted_snps.pop(0)] # First SNP in sorted list is the leftmost SNP + while ( + sorted_snps + ): # The while loop stops when we're done with our list of sorted SNPs + left_index = len(left_snps) - 1 # Take the furthest right of our left SNPs + next_snp = sorted_snps.pop(0) # Get the next SNP + if ( + next_snp - left_snps[left_index] < args["threshold"] + ): # If the distance between the two SNPs are less than our minimum distance threshold # Add the next SNP as an alternate position to our left SNP left_snps[left_index].add_info( - key=ALTCHR.get_id(), - value=next_snp.get_chrom() + key=ALTCHR.get_id(), value=next_snp.get_chrom() ) left_snps[left_index].add_info( - key=ALTPOS.get_id(), - value=next_snp.get_position() + key=ALTPOS.get_id(), value=next_snp.get_position() ) - else: # Otherwise, add the next SNP to our list of left-most SNPs + else: # Otherwise, add the next SNP to our list of left-most SNPs left_snps.append(next_snp) - filtered_snps += left_snps # Add our list of left-most SNPs to the overall SNP list + filtered_snps += ( + left_snps # Add our list of left-most SNPs to the overall SNP list + ) return (filtered_snps, list(infoheader)) @@ -155,9 +162,9 @@ def distance_filter(args, vcfinfo, propersnps): def map_filter(args, vcfinfo, propersnps, refgen, method, bconf=None): """Filters potatoes""" # Create holding collections - chrom_lengths = dict() # Dictionary to hold chromosomal sequence lengths - closest_snps = list() # List of filtered SNPs - try: # Type checking + chrom_lengths = dict() # Dictionary to hold chromosomal sequence lengths + closest_snps = list() # List of filtered SNPs + try: # Type checking assert isinstance(args, dict) assert isinstance(vcfinfo, list) for info in vcfinfo: @@ -170,48 +177,47 @@ def map_filter(args, vcfinfo, propersnps, refgen, method, bconf=None): assert isinstance(bconf, dict) or bconf is None except AssertionError: raise TypeError - try: # Value checking - assert 'map' in args - assert method in {'BLAST', 'SAM'} - if method == 'BLAST': + try: # Value checking + assert "map" in args + assert method in {"BLAST", "SAM"} + if method == "BLAST": assert bconf is not None if bconf: - assert 'subject' in bconf or 'database' in bconf + assert "subject" in bconf or "database" in bconf except AssertionError: raise ValueError print("Filtering SNPs by relative location on the genetic map", file=sys.stderr) # Add our VCF header - infoheader = set(deepcopy(vcfinfo)) # NO SIDE EFFECTS + infoheader = set(deepcopy(vcfinfo)) # NO SIDE EFFECTS infoheader.add(ALTCHR) infoheader.add(ALTPOS) # Read in the genetic map - map_dict = read_map(args['map']) + map_dict = read_map(args["map"]) # Get the length of each chromosome in the reference - if method == 'SAM' or 'subject' in bconf: + if method == "SAM" or "subject" in bconf: try: - print("Reading in reference FASTA", refgen, "This might take a while...", file=sys.stderr) - reference = SeqIO.to_dict(SeqIO.parse(refgen, 'fasta')) + print( + "Reading in reference FASTA", + refgen, + "This might take a while...", + file=sys.stderr, + ) + reference = SeqIO.to_dict(SeqIO.parse(refgen, "fasta")) for chrom, seq in reference.items(): chrom_lengths[chrom] = len(seq) except FileNotFoundError as error: sys.exit("Failed to find " + error.filename) - elif 'database' in bconf: + elif "database" in bconf: try: - print("Parsing blast database", bconf['database'], file=sys.stderr) + print("Parsing blast database", bconf["database"], file=sys.stderr) accessioncmd = NcbiblastdbcmdCommandline( - db=bconf['database'], - dbtype='nucl', - entry='all', - outfmt='%a' + db=bconf["database"], dbtype="nucl", entry="all", outfmt="%a" ) accout = accessioncmd()[0] accessions = (acc for acc in accout.strip().split()) for acc in accessions: lengthcmd = NcbiblastdbcmdCommandline( - db=bconf['database'], - dbtype='nucl', - entry=acc, - outfmt='%l' + db=bconf["database"], dbtype="nucl", entry=acc, outfmt="%l" ) length = lengthcmd()[0] chrom_lengths[acc] = int(length) @@ -219,29 +225,33 @@ def map_filter(args, vcfinfo, propersnps, refgen, method, bconf=None): sys.exit(error) else: raise NotImplementedError - print("Found", len(chrom_lengths), 'chromosomes', file=sys.stderr) + print("Found", len(chrom_lengths), "chromosomes", file=sys.stderr) print("Filtering", len(snp_dict), "SNP IDs", file=sys.stderr) for key, snplist in snp_dict.items(): (snpid, chrom) = key try: assert len(snplist) > 1 these_snps = deepcopy(snplist) - map_length = max([m.get_map_distance() for m in map_dict.values() if m.get_chrom() == chrom]) + map_length = max( + [ + m.get_map_distance() + for m in map_dict.values() + if m.get_chrom() == chrom + ] + ) map_proportion = map_dict[snpid].get_map_distance() / map_length - length_dict = {snp / chrom_lengths[chrom] : snp for snp in these_snps} - closest = closest_value(iterable=length_dict, value=map_proportion, return_item=True) + length_dict = {snp / chrom_lengths[chrom]: snp for snp in these_snps} + closest = closest_value( + iterable=length_dict, value=map_proportion, return_item=True + ) these_snps.remove(closest) for remaining in these_snps: - closest.add_info( - key=ALTCHR.get_id(), - value=remaining.get_chrom() - ) - closest.add_info( - key=ALTPOS.get_id(), - value=remaining.get_position() - ) + closest.add_info(key=ALTCHR.get_id(), value=remaining.get_chrom()) + closest.add_info(key=ALTPOS.get_id(), value=remaining.get_position()) closest_snps.append(closest) - except AssertionError: # Don't need to do all this crap and spit out error messages + except ( + AssertionError + ): # Don't need to do all this crap and spit out error messages closest_snps.extend(snplist) except KeyError: print("No map found for", snpid, file=sys.stderr) @@ -250,10 +260,3 @@ def map_filter(args, vcfinfo, propersnps, refgen, method, bconf=None): print("No genetic map for chromosome", chrom, file=sys.stderr) closest_snps.extend(snplist) return (closest_snps, list(infoheader)) - - - - - - - diff --git a/Utilities/utilities.py b/Utilities/utilities.py index 4f0d6dd..2971b0e 100644 --- a/Utilities/utilities.py +++ b/Utilities/utilities.py @@ -2,13 +2,15 @@ """Extra utilties""" import sys -if sys.version_info.major is not 3: + +if sys.version_info.major != 3: sys.exit("Please use Python 3 for this module: " + __name__) import os from datetime import datetime + # A class to hold values for an ##INFO declaration class INFO(object): """A class for a VCF ##INFO declaration @@ -19,23 +21,23 @@ class INFO(object): INFO Description (str) """ - _VALID_TYPES = {'Integer', 'Float', 'Flag', 'Character', 'String'} - _CHAR_NUMS = {'A', 'G', 'R', '.'} + _VALID_TYPES = {"Integer", "Float", "Flag", "Character", "String"} + _CHAR_NUMS = {"A", "G", "R", "."} def __init__(self, infoid, number, infotype, description): - try: # Type checking + try: # Type checking assert isinstance(infoid, str) assert isinstance(number, (int, str)) assert isinstance(infotype, str) assert isinstance(description, str) except AssertionError: raise TypeError - try: # Value checking + try: # Value checking assert infotype in self._VALID_TYPES - if infotype == 'Flag': - assert int(number) is 0 + if infotype == "Flag": + assert int(number) == 0 if isinstance(number, str): - assert len(number) is 1 + assert len(number) == 1 assert number.upper() in self._CHAR_NUMS except AssertionError: raise ValueError @@ -72,13 +74,13 @@ def __hash__(self): def __call__(self): """Format the VCF ##INFO declaration for printing""" - vals = ( # Assemble the meat of the INFO declaration - 'ID=' + self._infoid, - 'Number=' + self._number, - 'Type=' + self._type, - 'Description="' + self._description + '"' + vals = ( # Assemble the meat of the INFO declaration + "ID=" + self._infoid, + "Number=" + self._number, + "Type=" + self._type, + 'Description="' + self._description + '"', ) - info = '##INFO=<' + ','.join(vals) + '>' # Create the full declaration + info = "##INFO=<" + ",".join(vals) + ">" # Create the full declaration return info def get_id(self): @@ -110,7 +112,7 @@ def no_error(func, errval=None, *args, **kwargs): # A function to remove duplicates from a list def deduplicate_list(with_dups, key): """Deduplicate a list, or tuple given a list, set, or tuple of values that should not be in the first list - with_dups is the list with values found in key""" + with_dups is the list with values found in key""" try: assert isinstance(with_dups, (list, tuple)) assert isinstance(key, list) or isinstance(key, set) or isinstance(key, tuple) @@ -132,33 +134,35 @@ def rank_remove(data, lowest=False): assert isinstance(data, (list, tuple)) assert isinstance(lowest, bool) except AssertionError: - raise TypeError("'data' must be of type 'list' or 'tuple', 'lowest' must be of type 'bool'") + raise TypeError( + "'data' must be of type 'list' or 'tuple', 'lowest' must be of type 'bool'" + ) # Create our first result try: result = [data[0]] - except KeyError: # There must be at least one data entry + except KeyError: # There must be at least one data entry raise ValueError("'data' must have at least one value") try: - for value in data[1:]: # For every other data point - if lowest: # If we're looking for the lowest rank - if value < result[0]: # If lower - result = [value] # Replace - elif value == result[0]: # If equal - result.append(value) # Add - else: # Otherwise - continue # Pass - else: # If we're looking for the highest rank - if value > result[0]: # If greater - result = [value] # Replace - elif value == result[0]: # If equal - result.append(value) # Add - else: # Otherwise - continue # Pass - except KeyError: # If there is only one data point (taken above) - pass # Skip this, we return the one point - except: # Other exceptions - raise # Let someone else deal with it - return result # Return our list + for value in data[1:]: # For every other data point + if lowest: # If we're looking for the lowest rank + if value < result[0]: # If lower + result = [value] # Replace + elif value == result[0]: # If equal + result.append(value) # Add + else: # Otherwise + continue # Pass + else: # If we're looking for the highest rank + if value > result[0]: # If greater + result = [value] # Replace + elif value == result[0]: # If equal + result.append(value) # Add + else: # Otherwise + continue # Pass + except KeyError: # If there is only one data point (taken above) + pass # Skip this, we return the one point + except: # Other exceptions + raise # Let someone else deal with it + return result # Return our list # A function to find the closest value @@ -175,7 +179,9 @@ def closest_value(iterable, value, return_item=True): except AssertionError: raise TypeError closest = lambda val: abs(val - value) - if isinstance(iterable, (list, tuple)) or (isinstance(iterable, dict) and not return_item): + if isinstance(iterable, (list, tuple)) or ( + isinstance(iterable, dict) and not return_item + ): return min(iterable, key=closest) elif isinstance(iterable, dict) and return_item: return iterable[min(iterable, key=closest)] @@ -186,38 +192,24 @@ def closest_value(iterable, value, return_item=True): # A function to make a VCF header def vcf_header(ref, info=None): """Make a header for a VCF file given a reference genome and an optional list of INFO objects""" - try: # Type checking + try: # Type checking assert isinstance(ref, str) assert isinstance(info, list) or info is None for i in info: assert isinstance(i, INFO) except TypeError: - pass # Ignore TypeErrors if 'info' is None - except AssertionError: # Catch assertions and raise as TypeError + pass # Ignore TypeErrors if 'info' is None + except AssertionError: # Catch assertions and raise as TypeError raise TypeError # The first several header lines - fileformat = '##fileformat=VCFv4.2' - filedate = '##fileDate=' + datetime.now().strftime("%Y%m%d") - source = '##source=' + os.path.basename(sys.argv[0]) - reference = '##reference=' + os.path.basename(ref) - colnames = ( - '#CHROM', - 'POS', - 'ID', - 'REF', - 'ALT', - 'QUAL', - 'FILTER', - 'INFO' - ) - header = [ - fileformat, - filedate, - source, - reference - ] + fileformat = "##fileformat=VCFv4.2" + filedate = "##fileDate=" + datetime.now().strftime("%Y%m%d") + source = "##source=" + os.path.basename(sys.argv[0]) + reference = "##reference=" + os.path.basename(ref) + colnames = ("#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO") + header = [fileformat, filedate, source, reference] # Add lines from INFO objects if info: header.extend([i() for i in info]) - header.append('\t'.join(colnames)) - return '\n'.join(header) + header.append("\t".join(colnames)) + return "\n".join(header) diff --git a/snp_utils.py b/snp_utils.py index 1d9b8bf..30d03b7 100755 --- a/snp_utils.py +++ b/snp_utils.py @@ -2,6 +2,7 @@ """A script to find SNP positions""" import sys + if sys.version_info.major != 3: sys.exit("Please use Python 3 for this script") @@ -20,10 +21,12 @@ from BLAST import runblastn from BLAST import configure except ImportError as error: - sys.exit("Please make sure you are in the 'SNP_Utils' directory to load custom modules:" + error.name) + sys.exit( + "Please make sure you are in the 'SNP_Utils' directory to load custom modules:" + + error.name + ) -from distutils import spawn from os.path import basename try: @@ -36,70 +39,76 @@ REFERENCE_DEFAULT = basename(sys.argv[0]) + # BLAST-based def blast_based(args, lookup_dict): """Run SNP_Utils using a BLAST-based method""" - try: # Type checking + try: # Type checking assert isinstance(args, dict) assert isinstance(lookup_dict, dict) except AssertionError: raise TypeError # If we weren't provided a BLAST XML file - if 'xml' not in args.keys(): + if "xml" not in args.keys(): # Run BLASTn print("Running BLAST", file=sys.stderr) # Configure BLAST - bconf = configure.parse_config(args['config']) + bconf = configure.parse_config(args["config"]) # Make our FASTA file - bconf['query'] = runblastn.write_fasta(lookup_dict, args['lookup']) + bconf["query"] = runblastn.write_fasta(lookup_dict, args["lookup"]) # Run BLAST blast_out = runblastn.run_blastn(bconf=bconf) # Open the XML file - blast_xml = open(blast_out, 'r') + blast_xml = open(blast_out, "r") else: # Read the XML file provided print("Loading BLAST XML file", file=sys.stderr) - blast_xml = args['xml'] - bconf = dict() # In case of filtering + blast_xml = args["xml"] + bconf = dict() # In case of filtering try: # Make soup out of the XML - blast_soup = BeautifulSoup(blast_xml, 'xml') + blast_soup = BeautifulSoup(blast_xml, "xml") except FeatureNotFound: sys.exit("Please install 'lxml' to properly parse the BLAST results") # Holding collections - no_hits = set() # A set to hold no hits - snp_list = [] # A list to hold all SNPs found - hsps = [] # A list of HSPs from the XML file - with_snps = [] # A list of HSPs with SNPs + no_hits = set() # A set to hold no hits + snp_list = [] # A list to hold all SNPs found + hsps = [] # A list of HSPs from the XML file + with_snps = [] # A list of HSPs with SNPs # Get reference genome information for filtering - ref_gen = blast.get_value(tag=blast_soup, value='BlastOutput_db') - bconf['database'] = ref_gen - if not ref_gen: # If our referenge genome is None - ref_gen = REFERENCE_DEFAULT # Give it a default value - bconf = None # Make bconf a NoneType for filtering functions + ref_gen = blast.get_value(tag=blast_soup, value="BlastOutput_db") + bconf["database"] = ref_gen + if not ref_gen: # If our referenge genome is None + ref_gen = REFERENCE_DEFAULT # Give it a default value + bconf = None # Make bconf a NoneType for filtering functions # Start parsing queries from the BLAST XML file - for query in blast_soup.findAll('Iteration'): - snpid = blast.get_value(tag=query, value='Iteration_query-def') + for query in blast_soup.findAll("Iteration"): + snpid = blast.get_value(tag=query, value="Iteration_query-def") # Ask if no hits were found - if blast.get_value(tag=query, value='Iteration_message').capitalize() == blast.NO_HIT_MESSAGE: - print('No hit for', snpid, file=sys.stderr) + if ( + blast.get_value(tag=query, value="Iteration_message").capitalize() + == blast.NO_HIT_MESSAGE + ): + print("No hit for", snpid, file=sys.stderr) no_hits.add(snpid) else: # For every hit found - for hit in query.findAll('Hit'): - hit_num = blast.get_value(tag=hit, value='Hit_num') - this_hsps = blast.parse_hit(snpid=snpid, hit=hit) # Parse the HSPs from this Hit - try: # blast.parse_hit() returns a list or a None + for hit in query.findAll("Hit"): + hit_num = blast.get_value(tag=hit, value="Hit_num") + this_hsps = blast.parse_hit( + snpid=snpid, hit=hit + ) # Parse the HSPs from this Hit + try: # blast.parse_hit() returns a list or a None hsps += this_hsps - except TypeError: # If this_hsps is a None - print('No HSPs for', snpid, 'hit number:', hit_num, file=sys.stderr) - no_hits.add(snpid) # Log as a failure + except TypeError: # If this_hsps is a None + print("No HSPs for", snpid, "hit number:", hit_num, file=sys.stderr) + no_hits.add(snpid) # Log as a failure # For the parsed HSPs for hsp in hsps: # Get the name of the SNP and corresponding Lookup information snpid = hsp.get_name() lookup = lookup_dict[snpid] - try: # Try to make a SNP out of this HSP + try: # Try to make a SNP out of this HSP hsp.add_snp(lookup=lookup) with_snps.append(hsp) # If no SNP, log and remove @@ -108,17 +117,17 @@ def blast_based(args, lookup_dict): no_hits.add(snpid) # hsps.remove(hsp) except NotABaseError: - print('We captured something weird for SNP', hsp, file=sys.stderr) + print("We captured something weird for SNP", hsp, file=sys.stderr) no_hits.add(snpid) # hsps.remove(hsp) except NoMatchError: - print('The lookup provided does not match with SNP', hsp, file=sys.stderr) + print("The lookup provided does not match with SNP", hsp, file=sys.stderr) no_hits.add(snpid) # hsps.remove(hsp) # Close the XML file blast_xml.close() # Rank, if asked for - if 'rank' in args.keys(): + if "rank" in args.keys(): final_hsps = blast.rank_hsps(hsps=with_snps) else: # final_hsps = {h.get_name() : h for h in hsps} @@ -140,13 +149,13 @@ def blast_based(args, lookup_dict): # Clean out false failures from no_hits hit_snps = {s.get_snpid() for s in snp_list} no_hits -= hit_snps - return(snp_list, no_hits, ref_gen, bconf) + return (snp_list, no_hits, ref_gen, bconf) # Alignment-blast_based def alignment_based(args, lookup_dict): """Run SNP_Utils using a SAM file""" - try: # Type checking + try: # Type checking assert isinstance(args, dict) assert isinstance(lookup_dict, dict) except AssertionError: @@ -156,13 +165,18 @@ def alignment_based(args, lookup_dict): snp_list = list() try: # Read in the reference FASTA file - print("Reading in reference FASTA", args['reference'], "This might take a while...", file=sys.stderr) - reference = SeqIO.to_dict(SeqIO.parse(args['reference'], 'fasta')) + print( + "Reading in reference FASTA", + args["reference"], + "This might take a while...", + file=sys.stderr, + ) + reference = SeqIO.to_dict(SeqIO.parse(args["reference"], "fasta")) # Read in the SAM alignment - print("Reading in SAM file", args['samfile'], file=sys.stderr) - with open(args['samfile'], 'r') as s: + print("Reading in SAM file", args["samfile"], file=sys.stderr) + with open(args["samfile"], "r") as s: for line in s: - if line.startswith('@'): + if line.startswith("@"): continue else: a = alignment.Alignment(line) @@ -171,18 +185,23 @@ def alignment_based(args, lookup_dict): except FileNotFoundError as error: sys.exit("Failed to find " + error.filename) for snpid in lookup_dict: - if snpid in alignment_dict.keys() and alignment_dict[snpid].get_contig() != '*': + if snpid in alignment_dict.keys() and alignment_dict[snpid].get_contig() != "*": s = snp.SNP( lookup=lookup_dict[snpid], alignment=alignment_dict[snpid], - reference=reference + reference=reference, ) snp_list.append(s) else: print("No map for", snpid) no_hits.add(snpid) if len(snp_list) < 1: - sys.exit("Failed to find any SNPs in " + args['samfile'] + " that were listed in " + args['lookup']) + sys.exit( + "Failed to find any SNPs in " + + args["samfile"] + + " that were listed in " + + args["lookup"] + ) no_hits -= {s.get_snpid() for s in snp_list} return (snp_list, no_hits) @@ -200,37 +219,39 @@ def write_outputs(args, snp_filter, masked_filter, no_snps, ref_gen, vcf_info): except AssertionError: raise TypeError header = utilities.vcf_header(ref_gen, sorted(vcf_info)) - outfile = args['outname'] + '.vcf' - maskedfile = args['outname'] + '_masked.vcf' - failedfile = args['outname'] + '_failed.log' + outfile = args["outname"] + ".vcf" + maskedfile = args["outname"] + "_masked.vcf" + failedfile = args["outname"] + "_failed.log" snp_list = list(snp_filter) snp_list.sort(key=lambda s: (s.get_chrom(), s.get_position())) if len(snp_list) < 1: print("Failed to find any SNPs", file=sys.stderr) else: print("Writing", len(snp_list), "SNPs to", outfile, file=sys.stderr) - with open(outfile, 'w') as o: + with open(outfile, "w") as o: o.write(header) - o.write('\n') + o.write("\n") for s in snp_list: o.write(s.format_vcf()) - o.write('\n') + o.write("\n") print("Removing masked SNPs that were actually found", file=sys.stderr) masked_list = list(masked_filter) if len(masked_list) > 0: - print("Writing", len(masked_list), "masked SNPs to", maskedfile, file=sys.stderr) - with open(maskedfile, 'w') as m: + print( + "Writing", len(masked_list), "masked SNPs to", maskedfile, file=sys.stderr + ) + with open(maskedfile, "w") as m: m.write(header) - m.write('\n') + m.write("\n") for s in masked_list: m.write(s.format_vcf()) - m.write('\n') + m.write("\n") if len(no_snps) > 0: print("Writing", len(no_snps), "failed SNPs to", failedfile, file=sys.stderr) - with open(failedfile, 'w') as f: + with open(failedfile, "w") as f: for s in sorted(no_snps): f.write(s) - f.write('\n') + f.write("\n") # Run the program @@ -239,47 +260,79 @@ def main(): parser = arguments.make_argument_parser() if not sys.argv[1:]: sys.exit(parser.print_help()) - args = {key : value for key, value in vars(parser.parse_args()).items() if value is not None} - if args['method'] == 'CONFIG': + args = { + key: value + for key, value in vars(parser.parse_args()).items() + if value is not None + } + if args["method"] == "CONFIG": configure.make_config(args) else: - arguments.validate_filters(args=args, parser=parser) # Validate filtering options + arguments.validate_filters( + args=args, parser=parser + ) # Validate filtering options # Two holding collections lookup_dict = {} vcf_info = [] # Read in the lookup table - with open(args['lookup'], 'r') as lk: - print("Reading in lookup table", args['lookup'], file=sys.stderr) + with open(args["lookup"], "r") as lk: + print("Reading in lookup table", args["lookup"], file=sys.stderr) for line in lk: split = line.strip().split() l = snp.Lookup(split[0], split[1]) lookup_dict[l.get_snpid()] = l # Find SNPs given our method of BLAST or SAM - if args['method'] == 'BLAST': - method = 'BLAST' - if not spawn.find_executable('blastn'): - sys.exit("Please install BLASTn from NCBI") + if args["method"] == "BLAST": + method = "BLAST" snp_list, no_snps, ref_gen, bconf = blast_based(args, lookup_dict) - vcf_info.append(INFO(infoid='B', number=0, infotype='Flag', description='Variant Calculated from BLAST')) - elif args['method'] == 'SAM': - method = 'SAM' - ref_gen = args['reference'] + vcf_info.append( + INFO( + infoid="B", + number=0, + infotype="Flag", + description="Variant Calculated from BLAST", + ) + ) + elif args["method"] == "SAM": + method = "SAM" + ref_gen = args["reference"] bconf = None snp_list, no_snps = alignment_based(args, lookup_dict) - vcf_info.append(INFO(infoid='S', number=0, infotype='Flag', description='Variant Calculated from SAM')) + vcf_info.append( + INFO( + infoid="S", + number=0, + infotype="Flag", + description="Variant Calculated from SAM", + ) + ) # Start our filtering masked = filter(lambda s: s.check_masked(), snp_list) proper_snps = tuple(filter(lambda s: not s.check_masked(), snp_list)) - if 'map' in args and args['bychrom']: - proper_snps, vcf_info = filters.chrom_filter(args=args, vcfinfo=vcf_info, propersnps=proper_snps) - if 'threshold' in args.keys(): - proper_snps, vcf_info = filters.distance_filter(args=args, vcfinfo=vcf_info, propersnps=proper_snps) - if 'map' in args and args['bydistance'] and ref_gen is not REFERENCE_DEFAULT: - proper_snps, vcf_info = filters.map_filter(args=args, vcfinfo=vcf_info, propersnps=proper_snps, refgen=ref_gen, method=method, bconf=bconf) + if "map" in args and args["bychrom"]: + proper_snps, vcf_info = filters.chrom_filter( + args=args, vcfinfo=vcf_info, propersnps=proper_snps + ) + if "threshold" in args.keys(): + proper_snps, vcf_info = filters.distance_filter( + args=args, vcfinfo=vcf_info, propersnps=proper_snps + ) + if "map" in args and args["bydistance"] and ref_gen is not REFERENCE_DEFAULT: + proper_snps, vcf_info = filters.map_filter( + args=args, + vcfinfo=vcf_info, + propersnps=proper_snps, + refgen=ref_gen, + method=method, + bconf=bconf, + ) elif ref_gen is REFERENCE_DEFAULT: - print("Could not determine reference genome, not filtering by genetic map distance", file=sys.stderr) + print( + "Could not determine reference genome, not filtering by genetic map distance", + file=sys.stderr, + ) write_outputs(args, proper_snps, masked, list(no_snps), ref_gen, vcf_info) -if __name__ == '__main__': +if __name__ == "__main__": main() From 07bd351184c6329a089f1cbae3e39ca4581ddc73 Mon Sep 17 00:00:00 2001 From: Stefan Kranz Date: Tue, 13 May 2025 11:47:44 +0200 Subject: [PATCH 2/3] add dependency files for pipenv --- Pipfile | 17 +++ Pipfile.lock | 410 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 427 insertions(+) create mode 100644 Pipfile create mode 100644 Pipfile.lock diff --git a/Pipfile b/Pipfile new file mode 100644 index 0000000..5573584 --- /dev/null +++ b/Pipfile @@ -0,0 +1,17 @@ +[[source]] +url = "https://pypi.org/simple" +verify_ssl = true +name = "pypi" + +[packages] +biopython = "*" +beautifulsoup4 = "*" +lxml = "*" +overload = "*" +black = "*" +pylint = "*" + +[dev-packages] + +[requires] +python_version = "3.12" diff --git a/Pipfile.lock b/Pipfile.lock new file mode 100644 index 0000000..4b91dce --- /dev/null +++ b/Pipfile.lock @@ -0,0 +1,410 @@ +{ + "_meta": { + "hash": { + "sha256": "c40feacb0d7d48b2856ec48c1a3c9ee973ba5351aa5316f72c3417a1870fa5a7" + }, + "pipfile-spec": 6, + "requires": { + "python_version": "3.12" + }, + "sources": [ + { + "name": "pypi", + "url": "https://pypi.org/simple", + "verify_ssl": true + } + ] + }, + "default": { + "astroid": { + "hashes": [ + "sha256:104fb9cb9b27ea95e847a94c003be03a9e039334a8ebca5ee27dafaf5c5711eb", + "sha256:c332157953060c6deb9caa57303ae0d20b0fbdb2e59b4a4f2a6ba49d0a7961ce" + ], + "markers": "python_full_version >= '3.9.0'", + "version": "==3.3.10" + }, + "beautifulsoup4": { + "hashes": [ + "sha256:9bbbb14bfde9d79f38b8cd5f8c7c85f4b8f2523190ebed90e950a8dea4cb1c4b", + "sha256:dbb3c4e1ceae6aefebdaf2423247260cd062430a410e38c66f2baa50a8437195" + ], + "index": "pypi", + "markers": "python_full_version >= '3.7.0'", + "version": "==4.13.4" + }, + "biopython": { + "hashes": [ + "sha256:0ffb03cd982cb3a79326b84e789f2093880175c44eea10f3030c632f98de24f6", + "sha256:1b61593765e9ebdb71d73307d55fd4b46eb976608d329ae6803c084d90ed34c7", + "sha256:327184048b5a50634ae0970119bcb8a35b76d7cefb2658a01f772915f2fb7686", + "sha256:434dd23e972b0c89e128f2ebbd16b38075d609184f4f1fd16368035f923019c2", + "sha256:5916eb56df7ecd4a3babc07a48d4894c40cfb45dc18ccda1c148d0131017ce04", + "sha256:5a236ab1e2797c7dcf1577d80fdaafabead2908bc338eaed0aa1509dab769fef", + "sha256:5dafab74059de4e78f49f6b5684eddae6e7ce46f09cfa059c1d1339e8b1ea0a6", + "sha256:66b08905fb1b3f5194f908aaf04bbad474c4e3eaebad6d9f889a04e64dd1faf4", + "sha256:6985e17a2911defcbd30275a12f5ed5de2765e4bc91a60439740d572fdbfdf43", + "sha256:6fe47d704c2d3afac99aeb461219ec5f00273120d2d99835dc0a9a617f520141", + "sha256:7c8326cd2825ba166abecaf72843b9b15823affd6cec04fde65f0d2526767da4", + "sha256:8208bf2d87ade066fafe9a63a2eb77486c233bc1bdda2cbf721ebee54715f1bf", + "sha256:8469095907a17f156c76b6644829227efdf4996164f7726e6f4ca15039329776", + "sha256:86e487b6fe0f20a2b0138cb53f3d4dc26a7e4060ac4cb6fb1d19e194d445ef46", + "sha256:8e2bbe58cc1a592b239ef6d68396745d3fbfaafc668ce38283871d8ff070dbab", + "sha256:9a08d082e85778259a83501063871e00ba57336136403b75050eea14d523c00a", + "sha256:9a630b3804f6c3fcae2f9b7485d7a05425e143fc570f25babbc5a4b3d3db00d7", + "sha256:a6308053a61f3bdbb11504ece4cf24e264c6f1d6fad278f7e59e6b84b0d9a7b4", + "sha256:b0cec8833bf3036049129944ee5382dd576dac9670c3814ff2314b52aa94f199", + "sha256:b1e4918e6399ab0183dd863527fec18b53b7c61b6f0ef95db84b4adfa430ce75", + "sha256:bae6f17262f20e9587d419ba5683e9dc93a31ee1858b98fa0cff203694d5b786", + "sha256:cc5b981b9e3060db7c355b6145dfe3ce0b6572e1601b31211f6d742b10543874", + "sha256:cf88a4c8d8af13138be115949639a5e4a201618185a72ff09adbe175b7946b28", + "sha256:cffc15ac46688cd4cf662b24d03037234ce00b571df67be45a942264f101f990", + "sha256:d024ad48997ad53d53a77da24b072aaba8a550bd816af8f2e7e606a9918a3b43", + "sha256:d3c99db65d57ae4fc5034e42ac6cd8ddce069e664903f04c8a4f684d7609d6fa", + "sha256:d6f8efb2db03f2ec115c5e8c764dbadf635e0c9ecd4c0e91fc8216c1b62f85f5", + "sha256:d76e44b46f555da2e72ac36e757efd327f7f5f690e9f00ede6f723b48538b6d5", + "sha256:d93464e629950e4df87d810125dc4e904538a4344b924f340908ea5bc95db986", + "sha256:db8822adab0cd75a6e6ae845acf312addd8eab5f9b731c191454b961fc2c2cdc", + "sha256:e54e495239e623660ad367498c2f7a1a294b1997ba603f2ceafb36fd18f0eba6", + "sha256:f2f45ab3f1e43fdaa697fd753148999090298623278097c19c2c3c0ba134e57c" + ], + "index": "pypi", + "markers": "python_version >= '3.9'", + "version": "==1.85" + }, + "black": { + "hashes": [ + "sha256:030b9759066a4ee5e5aca28c3c77f9c64789cdd4de8ac1df642c40b708be6171", + "sha256:055e59b198df7ac0b7efca5ad7ff2516bca343276c466be72eb04a3bcc1f82d7", + "sha256:0e519ecf93120f34243e6b0054db49c00a35f84f195d5bce7e9f5cfc578fc2da", + "sha256:172b1dbff09f86ce6f4eb8edf9dede08b1fce58ba194c87d7a4f1a5aa2f5b3c2", + "sha256:1e2978f6df243b155ef5fa7e558a43037c3079093ed5d10fd84c43900f2d8ecc", + "sha256:33496d5cd1222ad73391352b4ae8da15253c5de89b93a80b3e2c8d9a19ec2666", + "sha256:3b48735872ec535027d979e8dcb20bf4f70b5ac75a8ea99f127c106a7d7aba9f", + "sha256:4b60580e829091e6f9238c848ea6750efed72140b91b048770b64e74fe04908b", + "sha256:759e7ec1e050a15f89b770cefbf91ebee8917aac5c20483bc2d80a6c3a04df32", + "sha256:8f0b18a02996a836cc9c9c78e5babec10930862827b1b724ddfe98ccf2f2fe4f", + "sha256:95e8176dae143ba9097f351d174fdaf0ccd29efb414b362ae3fd72bf0f710717", + "sha256:96c1c7cd856bba8e20094e36e0f948718dc688dba4a9d78c3adde52b9e6c2299", + "sha256:a1ee0a0c330f7b5130ce0caed9936a904793576ef4d2b98c40835d6a65afa6a0", + "sha256:a22f402b410566e2d1c950708c77ebf5ebd5d0d88a6a2e87c86d9fb48afa0d18", + "sha256:a39337598244de4bae26475f77dda852ea00a93bd4c728e09eacd827ec929df0", + "sha256:afebb7098bfbc70037a053b91ae8437c3857482d3a690fefc03e9ff7aa9a5fd3", + "sha256:bacabb307dca5ebaf9c118d2d2f6903da0d62c9faa82bd21a33eecc319559355", + "sha256:bce2e264d59c91e52d8000d507eb20a9aca4a778731a08cfff7e5ac4a4bb7096", + "sha256:d9e6827d563a2c820772b32ce8a42828dc6790f095f441beef18f96aa6f8294e", + "sha256:db8ea9917d6f8fc62abd90d944920d95e73c83a5ee3383493e35d271aca872e9", + "sha256:ea0213189960bda9cf99be5b8c8ce66bb054af5e9e861249cd23471bd7b0b3ba", + "sha256:f3df5f1bf91d36002b0a75389ca8663510cf0531cca8aa5c1ef695b46d98655f" + ], + "index": "pypi", + "markers": "python_version >= '3.9'", + "version": "==25.1.0" + }, + "click": { + "hashes": [ + "sha256:6b303f0b2aa85f1cb4e5303078fadcbcd4e476f114fab9b5007005711839325c", + "sha256:f5452aeddd9988eefa20f90f05ab66f17fce1ee2a36907fd30b05bbb5953814d" + ], + "markers": "python_version >= '3.10'", + "version": "==8.2.0" + }, + "dill": { + "hashes": [ + "sha256:0633f1d2df477324f53a895b02c901fb961bdbf65a17122586ea7019292cbcf0", + "sha256:44f54bf6412c2c8464c14e8243eb163690a9800dbe2c367330883b19c7561049" + ], + "markers": "python_version >= '3.11'", + "version": "==0.4.0" + }, + "isort": { + "hashes": [ + "sha256:1cb5df28dfbc742e490c5e41bad6da41b805b0a8be7bc93cd0fb2a8a890ac450", + "sha256:2dc5d7f65c9678d94c88dfc29161a320eec67328bc97aad576874cb4be1e9615" + ], + "markers": "python_full_version >= '3.9.0'", + "version": "==6.0.1" + }, + "lxml": { + "hashes": [ + "sha256:00b8686694423ddae324cf614e1b9659c2edb754de617703c3d29ff568448df5", + "sha256:073eb6dcdf1f587d9b88c8c93528b57eccda40209cf9be549d469b942b41d70b", + "sha256:09846782b1ef650b321484ad429217f5154da4d6e786636c38e434fa32e94e49", + "sha256:0a01ce7d8479dce84fc03324e3b0c9c90b1ece9a9bb6a1b6c9025e7e4520e78c", + "sha256:0be91891bdb06ebe65122aa6bf3fc94489960cf7e03033c6f83a90863b23c58b", + "sha256:0cef4feae82709eed352cd7e97ae062ef6ae9c7b5dbe3663f104cd2c0e8d94ba", + "sha256:0e108352e203c7afd0eb91d782582f00a0b16a948d204d4dec8565024fafeea5", + "sha256:0ea0252b51d296a75f6118ed0d8696888e7403408ad42345d7dfd0d1e93309a7", + "sha256:0fce1294a0497edb034cb416ad3e77ecc89b313cff7adbee5334e4dc0d11f422", + "sha256:1320091caa89805df7dcb9e908add28166113dcd062590668514dbd510798c88", + "sha256:142accb3e4d1edae4b392bd165a9abdee8a3c432a2cca193df995bc3886249c8", + "sha256:14479c2ad1cb08b62bb941ba8e0e05938524ee3c3114644df905d2331c76cd57", + "sha256:151d6c40bc9db11e960619d2bf2ec5829f0aaffb10b41dcf6ad2ce0f3c0b2325", + "sha256:15a665ad90054a3d4f397bc40f73948d48e36e4c09f9bcffc7d90c87410e478a", + "sha256:1a42b3a19346e5601d1b8296ff6ef3d76038058f311902edd574461e9c036982", + "sha256:1af80c6316ae68aded77e91cd9d80648f7dd40406cef73df841aa3c36f6907c8", + "sha256:1b717b00a71b901b4667226bba282dd462c42ccf618ade12f9ba3674e1fabc55", + "sha256:1dc4ca99e89c335a7ed47d38964abcb36c5910790f9bd106f2a8fa2ee0b909d2", + "sha256:20e16c08254b9b6466526bc1828d9370ee6c0d60a4b64836bc3ac2917d1e16df", + "sha256:226046e386556a45ebc787871d6d2467b32c37ce76c2680f5c608e25823ffc84", + "sha256:24974f774f3a78ac12b95e3a20ef0931795ff04dbb16db81a90c37f589819551", + "sha256:24f6df5f24fc3385f622c0c9d63fe34604893bc1a5bdbb2dbf5870f85f9a404a", + "sha256:27a9ded0f0b52098ff89dd4c418325b987feed2ea5cc86e8860b0f844285d740", + "sha256:29f451a4b614a7b5b6c2e043d7b64a15bd8304d7e767055e8ab68387a8cacf4e", + "sha256:2b31a3a77501d86d8ade128abb01082724c0dfd9524f542f2f07d693c9f1175f", + "sha256:2c62891b1ea3094bb12097822b3d44b93fc6c325f2043c4d2736a8ff09e65f60", + "sha256:2dc191e60425ad70e75a68c9fd90ab284df64d9cd410ba8d2b641c0c45bc006e", + "sha256:31e63621e073e04697c1b2d23fcb89991790eef370ec37ce4d5d469f40924ed6", + "sha256:32697d2ea994e0db19c1df9e40275ffe84973e4232b5c274f47e7c1ec9763cdd", + "sha256:3a3178b4873df8ef9457a4875703488eb1622632a9cee6d76464b60e90adbfcd", + "sha256:3b9c2754cef6963f3408ab381ea55f47dabc6f78f4b8ebb0f0b25cf1ac1f7609", + "sha256:3d3c30ba1c9b48c68489dc1829a6eede9873f52edca1dda900066542528d6b20", + "sha256:3e6d5557989cdc3ebb5302bbdc42b439733a841891762ded9514e74f60319ad6", + "sha256:4025bf2884ac4370a3243c5aa8d66d3cb9e15d3ddd0af2d796eccc5f0244390e", + "sha256:4291d3c409a17febf817259cb37bc62cb7eb398bcc95c1356947e2871911ae61", + "sha256:4329422de653cdb2b72afa39b0aa04252fca9071550044904b2e7036d9d97fe4", + "sha256:43d549b876ce64aa18b2328faff70f5877f8c6dede415f80a2f799d31644d776", + "sha256:460508a4b07364d6abf53acaa0a90b6d370fafde5693ef37602566613a9b0779", + "sha256:47fb24cc0f052f0576ea382872b3fc7e1f7e3028e53299ea751839418ade92a6", + "sha256:48b4afaf38bf79109bb060d9016fad014a9a48fb244e11b94f74ae366a64d252", + "sha256:497cab4d8254c2a90bf988f162ace2ddbfdd806fce3bda3f581b9d24c852e03c", + "sha256:4aa412a82e460571fad592d0f93ce9935a20090029ba08eca05c614f99b0cc92", + "sha256:4b7ce10634113651d6f383aa712a194179dcd496bd8c41e191cec2099fa09de5", + "sha256:4cd915c0fb1bed47b5e6d6edd424ac25856252f09120e3e8ba5154b6b921860e", + "sha256:4d885698f5019abe0de3d352caf9466d5de2baded00a06ef3f1216c1a58ae78f", + "sha256:4f5322cf38fe0e21c2d73901abf68e6329dc02a4994e483adbcf92b568a09a54", + "sha256:50441c9de951a153c698b9b99992e806b71c1f36d14b154592580ff4a9d0d877", + "sha256:529024ab3a505fed78fe3cc5ddc079464e709f6c892733e3f5842007cec8ac6e", + "sha256:53370c26500d22b45182f98847243efb518d268374a9570409d2e2276232fd37", + "sha256:53d9469ab5460402c19553b56c3648746774ecd0681b1b27ea74d5d8a3ef5590", + "sha256:56dbdbab0551532bb26c19c914848d7251d73edb507c3079d6805fa8bba5b706", + "sha256:5a99d86351f9c15e4a901fc56404b485b1462039db59288b203f8c629260a142", + "sha256:5cca36a194a4eb4e2ed6be36923d3cffd03dcdf477515dea687185506583d4c9", + "sha256:5f11a1526ebd0dee85e7b1e39e39a0cc0d9d03fb527f56d8457f6df48a10dc0c", + "sha256:61c7bbf432f09ee44b1ccaa24896d21075e533cd01477966a5ff5a71d88b2f56", + "sha256:639978bccb04c42677db43c79bdaa23785dc7f9b83bfd87570da8207872f1ce5", + "sha256:63e7968ff83da2eb6fdda967483a7a023aa497d85ad8f05c3ad9b1f2e8c84987", + "sha256:664cdc733bc87449fe781dbb1f309090966c11cc0c0cd7b84af956a02a8a4729", + "sha256:67ed8a40665b84d161bae3181aa2763beea3747f748bca5874b4af4d75998f87", + "sha256:67f779374c6b9753ae0a0195a892a1c234ce8416e4448fe1e9f34746482070a7", + "sha256:6854f8bd8a1536f8a1d9a3655e6354faa6406621cf857dc27b681b69860645c7", + "sha256:696ea9e87442467819ac22394ca36cb3d01848dad1be6fac3fb612d3bd5a12cf", + "sha256:6ef80aeac414f33c24b3815ecd560cee272786c3adfa5f31316d8b349bfade28", + "sha256:72ac9762a9f8ce74c9eed4a4e74306f2f18613a6b71fa065495a67ac227b3056", + "sha256:75133890e40d229d6c5837b0312abbe5bac1c342452cf0e12523477cd3aa21e7", + "sha256:7605c1c32c3d6e8c990dd28a0970a3cbbf1429d5b92279e37fda05fb0c92190e", + "sha256:773e27b62920199c6197130632c18fb7ead3257fce1ffb7d286912e56ddb79e0", + "sha256:795f61bcaf8770e1b37eec24edf9771b307df3af74d1d6f27d812e15a9ff3872", + "sha256:79d5bfa9c1b455336f52343130b2067164040604e41f6dc4d8313867ed540079", + "sha256:7a62cc23d754bb449d63ff35334acc9f5c02e6dae830d78dab4dd12b78a524f4", + "sha256:7be701c24e7f843e6788353c055d806e8bd8466b52907bafe5d13ec6a6dbaecd", + "sha256:7ca56ebc2c474e8f3d5761debfd9283b8b18c76c4fc0967b74aeafba1f5647f9", + "sha256:7ce1a171ec325192c6a636b64c94418e71a1964f56d002cc28122fceff0b6121", + "sha256:891f7f991a68d20c75cb13c5c9142b2a3f9eb161f1f12a9489c82172d1f133c0", + "sha256:8f82125bc7203c5ae8633a7d5d20bcfdff0ba33e436e4ab0abc026a53a8960b7", + "sha256:91505d3ddebf268bb1588eb0f63821f738d20e1e7f05d3c647a5ca900288760b", + "sha256:942a5d73f739ad7c452bf739a62a0f83e2578afd6b8e5406308731f4ce78b16d", + "sha256:9454b8d8200ec99a224df8854786262b1bd6461f4280064c807303c642c05e76", + "sha256:9459e6892f59ecea2e2584ee1058f5d8f629446eab52ba2305ae13a32a059530", + "sha256:9776af1aad5a4b4a1317242ee2bea51da54b2a7b7b48674be736d463c999f37d", + "sha256:97dac543661e84a284502e0cf8a67b5c711b0ad5fb661d1bd505c02f8cf716d7", + "sha256:98a3912194c079ef37e716ed228ae0dcb960992100461b704aea4e93af6b0bb9", + "sha256:9b4a3bd174cc9cdaa1afbc4620c049038b441d6ba07629d89a83b408e54c35cd", + "sha256:9c886b481aefdf818ad44846145f6eaf373a20d200b5ce1a5c8e1bc2d8745410", + "sha256:9ceaf423b50ecfc23ca00b7f50b64baba85fb3fb91c53e2c9d00bc86150c7e40", + "sha256:a11a96c3b3f7551c8a8109aa65e8594e551d5a84c76bf950da33d0fb6dfafab7", + "sha256:a3bcdde35d82ff385f4ede021df801b5c4a5bcdfb61ea87caabcebfc4945dc1b", + "sha256:a7fb111eef4d05909b82152721a59c1b14d0f365e2be4c742a473c5d7372f4f5", + "sha256:a81e1196f0a5b4167a8dafe3a66aa67c4addac1b22dc47947abd5d5c7a3f24b5", + "sha256:a8c9b7f16b63e65bbba889acb436a1034a82d34fa09752d754f88d708eca80e1", + "sha256:a8ef956fce64c8551221f395ba21d0724fed6b9b6242ca4f2f7beb4ce2f41997", + "sha256:ab339536aa798b1e17750733663d272038bf28069761d5be57cb4a9b0137b4f8", + "sha256:ac7ba71f9561cd7d7b55e1ea5511543c0282e2b6450f122672a2694621d63b7e", + "sha256:aea53d51859b6c64e7c51d522c03cc2c48b9b5d6172126854cc7f01aa11f52bc", + "sha256:aea7c06667b987787c7d1f5e1dfcd70419b711cdb47d6b4bb4ad4b76777a0563", + "sha256:aefe1a7cb852fa61150fcb21a8c8fcea7b58c4cb11fbe59c97a0a4b31cae3c8c", + "sha256:b0989737a3ba6cf2a16efb857fb0dfa20bc5c542737fddb6d893fde48be45433", + "sha256:b108134b9667bcd71236c5a02aad5ddd073e372fb5d48ea74853e009fe38acb6", + "sha256:b12cb6527599808ada9eb2cd6e0e7d3d8f13fe7bbb01c6311255a15ded4c7ab4", + "sha256:b5aff6f3e818e6bdbbb38e5967520f174b18f539c2b9de867b1e7fde6f8d95a4", + "sha256:b67319b4aef1a6c56576ff544b67a2a6fbd7eaee485b241cabf53115e8908b8f", + "sha256:b7c86884ad23d61b025989d99bfdd92a7351de956e01c61307cb87035960bcb1", + "sha256:b92b69441d1bd39f4940f9eadfa417a25862242ca2c396b406f9272ef09cdcaa", + "sha256:bcb7a1096b4b6b24ce1ac24d4942ad98f983cd3810f9711bcd0293f43a9d8b9f", + "sha256:bda3ea44c39eb74e2488297bb39d47186ed01342f0022c8ff407c250ac3f498e", + "sha256:be2ba4c3c5b7900246a8f866580700ef0d538f2ca32535e991027bdaba944063", + "sha256:c5681160758d3f6ac5b4fea370495c48aac0989d6a0f01bb9a72ad8ef5ab75c4", + "sha256:c5d32f5284012deaccd37da1e2cd42f081feaa76981f0eaa474351b68df813c5", + "sha256:c6364038c519dffdbe07e3cf42e6a7f8b90c275d4d1617a69bb59734c1a2d571", + "sha256:c70e93fba207106cb16bf852e421c37bbded92acd5964390aad07cb50d60f5cf", + "sha256:ca755eebf0d9e62d6cb013f1261e510317a41bf4650f22963474a663fdfe02aa", + "sha256:cccd007d5c95279e529c146d095f1d39ac05139de26c098166c4beb9374b0f4d", + "sha256:ce31158630a6ac85bddd6b830cffd46085ff90498b397bd0a259f59d27a12188", + "sha256:ce9c671845de9699904b1e9df95acfe8dfc183f2310f163cdaa91a3535af95de", + "sha256:d12832e1dbea4be280b22fd0ea7c9b87f0d8fc51ba06e92dc62d52f804f78ebd", + "sha256:d2ed1b3cb9ff1c10e6e8b00941bb2e5bb568b307bfc6b17dffbbe8be5eecba86", + "sha256:d5663bc1b471c79f5c833cffbc9b87d7bf13f87e055a5c86c363ccd2348d7e82", + "sha256:d90b729fd2732df28130c064aac9bb8aff14ba20baa4aee7bd0795ff1187545f", + "sha256:dc0af80267edc68adf85f2a5d9be1cdf062f973db6790c1d065e45025fa26140", + "sha256:de5b4e1088523e2b6f730d0509a9a813355b7f5659d70eb4f319c76beea2e250", + "sha256:de6f6bb8a7840c7bf216fb83eec4e2f79f7325eca8858167b68708b929ab2172", + "sha256:df53330a3bff250f10472ce96a9af28628ff1f4efc51ccba351a8820bca2a8ba", + "sha256:e094ec83694b59d263802ed03a8384594fcce477ce484b0cbcd0008a211ca751", + "sha256:e794f698ae4c5084414efea0f5cc9f4ac562ec02d66e1484ff822ef97c2cadff", + "sha256:e7bc6df34d42322c5289e37e9971d6ed114e3776b45fa879f734bded9d1fea9c", + "sha256:eaf24066ad0b30917186420d51e2e3edf4b0e2ea68d8cd885b14dc8afdcf6556", + "sha256:ecf4c4b83f1ab3d5a7ace10bafcb6f11df6156857a3c418244cef41ca9fa3e44", + "sha256:ef5a7178fcc73b7d8c07229e89f8eb45b2908a9238eb90dcfc46571ccf0383b8", + "sha256:f5cb182f6396706dc6cc1896dd02b1c889d644c081b0cdec38747573db88a7d7", + "sha256:fa0e294046de09acd6146be0ed6727d1f42ded4ce3ea1e9a19c11b6774eea27c", + "sha256:fb54f7c6bafaa808f27166569b1511fc42701a7713858dddc08afdde9746849e", + "sha256:fd3be6481ef54b8cfd0e1e953323b7aa9d9789b94842d0e5b142ef4bb7999539" + ], + "index": "pypi", + "markers": "python_version >= '3.6'", + "version": "==5.4.0" + }, + "mccabe": { + "hashes": [ + "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325", + "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e" + ], + "markers": "python_version >= '3.6'", + "version": "==0.7.0" + }, + "mypy-extensions": { + "hashes": [ + "sha256:1be4cccdb0f2482337c4743e60421de3a356cd97508abadd57d47403e94f5505", + "sha256:52e68efc3284861e772bbcd66823fde5ae21fd2fdb51c62a211403730b916558" + ], + "markers": "python_version >= '3.8'", + "version": "==1.1.0" + }, + "numpy": { + "hashes": [ + "sha256:0255732338c4fdd00996c0421884ea8a3651eea555c3a56b84892b66f696eb70", + "sha256:02f226baeefa68f7d579e213d0f3493496397d8f1cff5e2b222af274c86a552a", + "sha256:059b51b658f4414fff78c6d7b1b4e18283ab5fa56d270ff212d5ba0c561846f4", + "sha256:0bcb1d057b7571334139129b7f941588f69ce7c4ed15a9d6162b2ea54ded700c", + "sha256:0cd48122a6b7eab8f06404805b1bd5856200e3ed6f8a1b9a194f9d9054631beb", + "sha256:19f4718c9012e3baea91a7dba661dcab2451cda2550678dc30d53acb91a7290f", + "sha256:1a161c2c79ab30fe4501d5a2bbfe8b162490757cf90b7f05be8b80bc02f7bb8e", + "sha256:1f4a922da1729f4c40932b2af4fe84909c7a6e167e6e99f71838ce3a29f3fe26", + "sha256:261a1ef047751bb02f29dfe337230b5882b54521ca121fc7f62668133cb119c9", + "sha256:262d23f383170f99cd9191a7c85b9a50970fe9069b2f8ab5d786eca8a675d60b", + "sha256:2ba321813a00e508d5421104464510cc962a6f791aa2fca1c97b1e65027da80d", + "sha256:2c1a1c6ccce4022383583a6ded7bbcda22fc635eb4eb1e0a053336425ed36dfa", + "sha256:352d330048c055ea6db701130abc48a21bec690a8d38f8284e00fab256dc1376", + "sha256:369e0d4647c17c9363244f3468f2227d557a74b6781cb62ce57cf3ef5cc7c610", + "sha256:36ab5b23915887543441efd0417e6a3baa08634308894316f446027611b53bf1", + "sha256:37e32e985f03c06206582a7323ef926b4e78bdaa6915095ef08070471865b906", + "sha256:3a801fef99668f309b88640e28d261991bfad9617c27beda4a3aec4f217ea073", + "sha256:3d14b17b9be5f9c9301f43d2e2a4886a33b53f4e6fdf9ca2f4cc60aeeee76372", + "sha256:422cc684f17bc963da5f59a31530b3936f57c95a29743056ef7a7903a5dbdf88", + "sha256:4520caa3807c1ceb005d125a75e715567806fed67e315cea619d5ec6e75a4191", + "sha256:47834cde750d3c9f4e52c6ca28a7361859fcaf52695c7dc3cc1a720b8922683e", + "sha256:47f9ed103af0bc63182609044b0490747e03bd20a67e391192dde119bf43d52f", + "sha256:498815b96f67dc347e03b719ef49c772589fb74b8ee9ea2c37feae915ad6ebda", + "sha256:54088a5a147ab71a8e7fdfd8c3601972751ded0739c6b696ad9cb0343e21ab73", + "sha256:55f09e00d4dccd76b179c0f18a44f041e5332fd0e022886ba1c0bbf3ea4a18d0", + "sha256:5a0ac90e46fdb5649ab6369d1ab6104bfe5854ab19b645bf5cda0127a13034ae", + "sha256:6411f744f7f20081b1b4e7112e0f4c9c5b08f94b9f086e6f0adf3645f85d3a4d", + "sha256:6413d48a9be53e183eb06495d8e3b006ef8f87c324af68241bbe7a39e8ff54c3", + "sha256:7451f92eddf8503c9b8aa4fe6aa7e87fd51a29c2cfc5f7dbd72efde6c65acf57", + "sha256:8b4c0773b6ada798f51f0f8e30c054d32304ccc6e9c5d93d46cb26f3d385ab19", + "sha256:8dfa94b6a4374e7851bbb6f35e6ded2120b752b063e6acdd3157e4d2bb922eba", + "sha256:97c8425d4e26437e65e1d189d22dff4a079b747ff9c2788057bfb8114ce1e133", + "sha256:9d75f338f5f79ee23548b03d801d28a505198297534f62416391857ea0479571", + "sha256:9de6832228f617c9ef45d948ec1cd8949c482238d68b2477e6f642c33a7b0a54", + "sha256:a4cbdef3ddf777423060c6f81b5694bad2dc9675f110c4b2a60dc0181543fac7", + "sha256:a9c0d994680cd991b1cb772e8b297340085466a6fe964bc9d4e80f5e2f43c291", + "sha256:aa70fdbdc3b169d69e8c59e65c07a1c9351ceb438e627f0fdcd471015cd956be", + "sha256:abe38cd8381245a7f49967a6010e77dbf3680bd3627c0fe4362dd693b404c7f8", + "sha256:b13f04968b46ad705f7c8a80122a42ae8f620536ea38cf4bdd374302926424dd", + "sha256:b4ea7e1cff6784e58fe281ce7e7f05036b3e1c89c6f922a6bfbc0a7e8768adbe", + "sha256:b6f91524d31b34f4a5fee24f5bc16dcd1491b668798b6d85585d836c1e633a6a", + "sha256:c26843fd58f65da9491165072da2cccc372530681de481ef670dcc8e27cfb066", + "sha256:c42365005c7a6c42436a54d28c43fe0e01ca11eb2ac3cefe796c25a5f98e5e9b", + "sha256:c8b82a55ef86a2d8e81b63da85e55f5537d2157165be1cb2ce7cfa57b6aef38b", + "sha256:ced69262a8278547e63409b2653b372bf4baff0870c57efa76c5703fd6543282", + "sha256:d2e3bdadaba0e040d1e7ab39db73e0afe2c74ae277f5614dad53eadbecbbb169", + "sha256:d403c84991b5ad291d3809bace5e85f4bbf44a04bdc9a88ed2bb1807b3360bb8", + "sha256:d7543263084a85fbc09c704b515395398d31d6395518446237eac219eab9e55e", + "sha256:d8882a829fd779f0f43998e931c466802a77ca1ee0fe25a3abe50278616b1471", + "sha256:e4f0b035d9d0ed519c813ee23e0a733db81ec37d2e9503afbb6e54ccfdee0fa7", + "sha256:e8b025c351b9f0e8b5436cf28a07fa4ac0204d67b38f01433ac7f9b870fa38c6", + "sha256:eb7fd5b184e5d277afa9ec0ad5e4eb562ecff541e7f60e69ee69c8d59e9aeaba", + "sha256:ec31367fd6a255dc8de4772bd1658c3e926d8e860a0b6e922b615e532d320ddc", + "sha256:ee461a4eaab4f165b68780a6a1af95fb23a29932be7569b9fab666c407969051", + "sha256:f5045039100ed58fa817a6227a356240ea1b9a1bc141018864c306c1a16d4175" + ], + "markers": "python_version >= '3.10'", + "version": "==2.2.5" + }, + "overload": { + "hashes": [ + "sha256:215740c5c374b6039e1fb196e127316d3203b76795e2aa137a7f9376621b5592" + ], + "index": "pypi", + "version": "==1.1" + }, + "packaging": { + "hashes": [ + "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484", + "sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f" + ], + "markers": "python_version >= '3.8'", + "version": "==25.0" + }, + "pathspec": { + "hashes": [ + "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08", + "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712" + ], + "markers": "python_version >= '3.8'", + "version": "==0.12.1" + }, + "platformdirs": { + "hashes": [ + "sha256:3d512d96e16bcb959a814c9f348431070822a6496326a4be0911c40b5a74c2bc", + "sha256:ff7059bb7eb1179e2685604f4aaf157cfd9535242bd23742eadc3c13542139b4" + ], + "markers": "python_version >= '3.9'", + "version": "==4.3.8" + }, + "pylint": { + "hashes": [ + "sha256:2b11de8bde49f9c5059452e0c310c079c746a0a8eeaa789e5aa966ecc23e4559", + "sha256:43860aafefce92fca4cf6b61fe199cdc5ae54ea28f9bf4cd49de267b5195803d" + ], + "index": "pypi", + "markers": "python_full_version >= '3.9.0'", + "version": "==3.3.7" + }, + "soupsieve": { + "hashes": [ + "sha256:6e60cc5c1ffaf1cebcc12e8188320b72071e922c2e897f737cadce79ad5d30c4", + "sha256:ad282f9b6926286d2ead4750552c8a6142bc4c783fd66b0293547c8fe6ae126a" + ], + "markers": "python_version >= '3.8'", + "version": "==2.7" + }, + "tomlkit": { + "hashes": [ + "sha256:7a974427f6e119197f670fbbbeae7bef749a6c14e793db934baefc1b5f03efde", + "sha256:fff5fe59a87295b278abd31bec92c15d9bc4a06885ab12bcea52c71119392e79" + ], + "markers": "python_version >= '3.8'", + "version": "==0.13.2" + }, + "typing-extensions": { + "hashes": [ + "sha256:a439e7c04b49fec3e5d3e2beaa21755cadbbdc391694e28ccdd36ca4a1408f8c", + "sha256:e6c81219bd689f51865d9e372991c540bda33a0379d5573cddb9a3a23f7caaef" + ], + "markers": "python_version >= '3.8'", + "version": "==4.13.2" + } + }, + "develop": {} +} From 92706c866ae4394f73ca3fb245d9425ada795984 Mon Sep 17 00:00:00 2001 From: Stefan Kranz Date: Tue, 13 May 2025 12:03:19 +0200 Subject: [PATCH 3/3] requirements.txt if venv is used instead of pipenv --- requirements.txt | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..a23d54e --- /dev/null +++ b/requirements.txt @@ -0,0 +1,20 @@ +-i https://pypi.org/simple +astroid==3.3.10; python_full_version >= '3.9.0' +beautifulsoup4==4.13.4; python_full_version >= '3.7.0' +biopython==1.85; python_version >= '3.9' +black==25.1.0; python_version >= '3.9' +click==8.2.0; python_version >= '3.10' +dill==0.4.0; python_version >= '3.11' +isort==6.0.1; python_full_version >= '3.9.0' +lxml==5.4.0; python_version >= '3.6' +mccabe==0.7.0; python_version >= '3.6' +mypy-extensions==1.1.0; python_version >= '3.8' +numpy==2.2.5; python_version >= '3.10' +overload==1.1 +packaging==25.0; python_version >= '3.8' +pathspec==0.12.1; python_version >= '3.8' +platformdirs==4.3.8; python_version >= '3.9' +pylint==3.3.7; python_full_version >= '3.9.0' +soupsieve==2.7; python_version >= '3.8' +tomlkit==0.13.2; python_version >= '3.8' +typing-extensions==4.13.2; python_version >= '3.8'