Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions BioSeqTools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from Modules.filter_fastq import run_filter_fastq
from Modules.dna_rna_tools import run_dna_rna_tools
from Modules.aminoacids_tools import run_aminoacid_tools

def run_BioSeqTools(tool_name, *args):
if tool_name == "run_aminoacid_tools":
if len(args) < 2:
raise ValueError("Not enough arguments for run_aminoacid_tools")
return run_aminoacid_tools(*args[:-1], operation=args[-1])
elif tool_name == "run_dna_rna_tools":
return run_dna_rna_tools(*args)
elif tool_name == "run_filter_fastq":
return run_filter_fastq(*args)
else:
raise ValueError("Invalid tool_name")
Binary file added Modules/.DS_Store
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added Modules/__pycache__/dna_rna_tools.cpython-310.pyc
Binary file not shown.
Binary file added Modules/__pycache__/dna_rna_tools.cpython-311.pyc
Binary file not shown.
Binary file added Modules/__pycache__/filter_fastq.cpython-310.pyc
Binary file not shown.
Binary file added Modules/__pycache__/filter_fastq.cpython-311.pyc
Binary file not shown.
224 changes: 224 additions & 0 deletions Modules/aminoacids_tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
from typing import Dict


def calculate_percentage(seq: str) -> str:
"""
Calculates the percentage of amino acids in the entered amino acid
sequence
Arguments:
- seq (str): amino acid sequences to be analyzed
Return:
- str: a string with the percentage of each amino acid
"""
amino_acid_counts: Dict[str, int] = {} # dict to store count of each amino acid
for amino_acid in seq:
if amino_acid in amino_acid_counts:
amino_acid_counts[amino_acid] += 1
else:
amino_acid_counts[amino_acid] = 1
total_amino_acids = len(seq)
amino_acid_percentages = {} # dict to store each amino acid and its %
for amino_acid, count in amino_acid_counts.items():
percentage = round(((count / total_amino_acids) * 100), 2)
amino_acid_percentages[amino_acid] = percentage
return f'Amino acids percentage of the sequence {seq}: {amino_acid_percentages}'

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Почему возвращаешь строку? ))



def calculate_molecular_weight(seq: str) -> str:
"""
Calculates the molecular weight of entered amino acid sequence
Arguments:
- seq (str): amino acid sequences to be analyzed
Return:
- str: a string with the molecular weight value for amino acid
sequence
"""
amino_acid_weights = {
'G': 57.051, 'A': 71.078, 'S': 87.077, 'P': 97.115, 'V': 99.131,
'T': 101.104, 'C': 103.143, 'I': 113.158, 'L': 113.158, 'N':
114.103,
'D': 115.087, 'Q': 128.129, 'K': 128.172, 'E': 129.114, 'M':
131.196,
'H': 137.139, 'F': 147.174, 'R': 156.186, 'Y': 163.173, 'W':
186.210
}
weight = 18.02 # for the H and OH at the termini
for amino_acid in seq:
weight += amino_acid_weights[amino_acid]
return f'Molecular weight of the sequence {seq}: {round(weight, 2)} Da'

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Почему возвращаешь строку? ))



def calculate_hydrophobicity_eisenberg(sequence):

# Amino acid hydrophilicity/hydrophobicity scale by Eisengerg
hydrophobicity_values = {
'A': 0.5, 'R': 0.65, 'N': 1.0, 'D': 1.3, 'C': -0.15,
'Q': 1.0, 'E': 1.5, 'G': 0.75, 'H': 0.7, 'I': -1.3,
'L': -1.3, 'K': 0.75, 'M': -1.1, 'F': -1.9, 'P': 0.55,
'S': 0.6, 'T': 0.3, 'W': -0.5, 'Y': -1.65, 'V': -0.9
}

# Calculate sum of hydrophilicities for all amino acids in the sequence
hydrophobicity_sum = sum(hydrophobicity_values.get(aa, 0) for aa in sequence)

# Determine hydrophilicity/hydrophobicity of sequence
if hydrophobicity_sum > 0:
return f"Sequence {sequence}: Hydrophilic"
elif hydrophobicity_sum < 0:
return f"Sequence {sequence}: Hydrophobic"
else:
return f"Sequence {sequence}: Neutral"


def calculate_pI(sequence):
"""Create a dictionary of pK values (COO-, NH3+, R) information taken
from source
http://www.sev-chem.narod.ru/spravochnik/piaminoacid.htm"""
pK_values = {
'A': (2.34, 9.60),
'R': (2.17, 9.04, 12.48),
'N': (2.02, 8.80),
'D': (2.09, 9.82, 3.86),
'C': (1.71, 8.33, 10.30),
'Q': (2.17, 9.13),
'E': (2.19, 9.76, 4.25),
'G': (2.34, 9.60),
'H': (1.82, 9.17, 6.00),
'I': (2.32, 9.76),
'L': (2.36, 9.60),
'K': (2.18, 8.95, 10.5),
'M': (2.28, 9.21),
'F': (2.58, 9.24),
'P': (2.00, 10.60),
'S': (2.21, 9.15),
'T': (2.63, 10.43),
'W': (1.22, 9.39),
'Y': (2.20, 9.11, 10.10),
'V': (2.29, 9.72)
}

# Initialization of variables for leftmost and rightmost elements
N_end_pK = None
C_end_pK = None

# Find the marginal elements and their corresponding pKs
for amino_acid in sequence:
if amino_acid in pK_values:
pK_list = pK_values[amino_acid]
if len(pK_list) >= 2:
if N_end_pK is None:
N_end_pK = pK_list[1] # Второй pK
C_end_pK = pK_list[0] # Первый pK

# If no amino acid sequence is specified - return None
if N_end_pK is None or C_end_pK is None:
return None

# Calculate pI
total_pK = N_end_pK + C_end_pK
count = 2 # We take into account the found pKs - there are at least 2

# Also add pK of AA radicals - the dictionary contains 3 pK values
for amino_acid in sequence:
if amino_acid in pK_values:
pK_list = pK_values[amino_acid]
if len(pK_list) >= 3:
total_pK += pK_list[2] # Третий pK
count += 1

# Substitute all found values into the formula and calculate pI
pI = total_pK / count
return f"Isoelectric point for the sequence {sequence}: {pI}"

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Почему возвращаешь строку? ))



def find_cleavage_sites(seq: str, motif: list) -> list:
"""Find cleavage sites for motif-specific proteases.
Arguments:
- seq - string sequence to be analyzed
- motif - subsequence to be found in a sequence. Subsequence is
specified as list of lists.
Each nested list means more than one possible aminoacid at a single
position (checked by OR condition).
Return:
- list of cleavage sites coordinates (C-end aminoacid of *potentially*
cleaved sequence)
"""
cleavage_sites = []
seq_idx = 0
while seq_idx < len(seq):
motif_idx = 0
chars_at_motif_idx = motif[motif_idx]
seq_char = seq[seq_idx]
if seq_char in chars_at_motif_idx:
motif_idx += 1
while motif_idx < len(motif):
chars_at_motif_idx = motif[motif_idx]
seq_char = seq[seq_idx+motif_idx]
if seq_char in chars_at_motif_idx:
motif_idx += 1
else:
break
if motif_idx == len(motif):
cleavage_sites.append(seq_idx + motif_idx)
seq_idx += 1
return cleavage_sites


motif_dict = {
'Caspase 3': [['D'], ['M'], ['Q'], ['D']],
'Caspase 6': [['V'], ['E'], ['H', 'I'], ['D']],
'Caspase 7': [['D'], ['E'], ['V'], ['D']],
'Enterokinase': [['D', 'E'], ['D', 'E'], ['D', 'E'], ['K']]
}


def get_cleavage_sites(seq: str) -> str:
"Return amount and coordinates of cleavage sites for proteases, specified in motif_dict"
output = f'{seq}\n'
for motif_name, motif_value in motif_dict.items():
sites = find_cleavage_sites(seq, motif_value)
output += f'{len(sites)} protease cleavage site(s) for {motif_name}: {sites}\n'
return output


all_aminoacids = {
'A', 'R', 'N', 'D', 'C', 'H', 'G', 'Q', 'E', 'I',
'L', 'K', 'M', 'P', 'S', 'Y', 'T', 'W', 'F', 'V'
}


def is_peptide(seq: str) -> bool:
"Check whether the incoming sequence is an aminoacid"
if set(seq).issubset(all_aminoacids): # if set(seq) <= all_aminoacids
return True
raise ValueError(f'Incoming sequence {seq} is not a peptide')


operation_dict = {
'get_cleavage_sites': get_cleavage_sites,
'calculate_molecular_weight': calculate_molecular_weight,
'calculate_percentage': calculate_percentage,
'calculate_pI': calculate_pI,
'calculate_hydrophobicity_eisenberg':
calculate_hydrophobicity_eisenberg
}


def run_aminoacid_tools(*seqs: str, operation: str) -> str:
"""Run AminoAcid Tools
Arguments:
- *seqs - one or more string sequences to be analyzed
- operation - action to be done with sequence(s)
Return:
- string that contains incoming sequence and result of operation"""
if operation == '':
raise ValueError('Operation value is not specified')
if operation not in operation_dict:
raise ValueError(f'Incorrect operation value\nSupported operations: {list(operation_dict.keys())}')
for seq in seqs:
is_peptide(seq)
output = ''
for seq in seqs:
output += operation_dict[operation](seq)
output += '\n\n'
return output
53 changes: 53 additions & 0 deletions Modules/dna_rna_tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
def transcribe(dna_sequence):
rna_sequence = ''
for base in dna_sequence:
if base == 'T':
rna_sequence += 'U'
elif base == 't':
rna_sequence += 'u'
elif base == 'U':
rna_sequence += 'T'
elif base == 'u':
rna_sequence += 't'
else:
rna_sequence += base
return rna_sequence

def reverse(sequence):
return sequence[::-1]

def complement(sequence):
complement_dict = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'a': 't',
't': 'a', 'c': 'g', 'g': 'c'}
complement_sequence = ''.join(complement_dict.get(base, base)
for base in sequence)
return complement_sequence

def reverse_complement(dna_sequence):
# можно также "complement_sequence = complement(dna_sequence)"
complement_dict = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'a': 't',
't': 'a', 'c': 'g', 'g': 'c'}
complement_sequence = ''.join(complement_dict.get(base, base)
for base in dna_sequence)
return complement_sequence[::-1]

def run_dna_rna_tools(*args):
if not args:
return "There are no function arguments"
action = args[-1].lower()
sequences = args[:-1]
results = []
for sequence in sequences:
if not all(base in 'ACGTU' for base in sequence.upper()):
results.append(f"Invalid sequence: {sequence}")
elif action == 'transcribe':
results.append(transcribe(sequence))
elif action == 'reverse':
results.append(reverse(sequence))
elif action == 'complement':
results.append(complement(sequence))
elif action == 'reverse_complement':
results.append(reverse_complement(sequence))
else:
results.append(f"Invalid action: {action}")
return results[0] if len(results) == 1 else results
20 changes: 20 additions & 0 deletions Modules/filter_fastq.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
def run_filter_fastq(seqs, gc_bounds=(0, 100), length_bounds=(0, 2**32), quality_threshold=0):
filtered_seqs = {}

for name, (sequence, quality) in seqs.items():
# Estimate GC-content
gc_content = (sequence.count('G') + sequence.count('C')) / len(sequence) * 100

# Estimate average quality
avg_quality = sum(ord(q) - 33 for q in quality) / len(quality)

# Filtering by user`s conditions
if (
gc_bounds[0] <= gc_content <= gc_bounds[1] and
length_bounds[0] <= len(sequence) <= length_bounds[1] and
avg_quality >= quality_threshold
):
filtered_seqs[name] = (sequence, quality)

return filtered_seqs

Loading