-
Notifications
You must be signed in to change notification settings - Fork 0
Completed HW6_Files #2
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
DAChernikov
wants to merge
14
commits into
main
Choose a base branch
from
HW6_ChernikovDA
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
14 commits
Select commit
Hold shift + click to select a range
516f05d
Add HW5 files
DAChernikov fe455ee
Update README.md
DAChernikov 109d69a
Add files HW6
DAChernikov a1794cd
Move example data to dir
DAChernikov 7b12a66
Delete example_blast_results.txt
DAChernikov 0c0be1c
Delete example_fastq.fastq
DAChernikov 17e3faa
Delete example_gbk.gbk
DAChernikov f2134bb
Delete example_multiline_fasta.fasta
DAChernikov 78a6310
Delete example_multiline_fasta_converted.fasta
DAChernikov 76fa555
Delete filtered_output.fastq
DAChernikov 15e78af
Delete output.fasta
DAChernikov 0162896
Delete output_selected_genes.fasta
DAChernikov 21dbae7
Delete parsed_blast_results.txt
DAChernikov 26b79e4
Update README.md
DAChernikov File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,15 @@ | ||
| from Modules.filter_fastq import run_filter_fastq | ||
| from Modules.dna_rna_tools import run_dna_rna_tools | ||
| from Modules.aminoacids_tools import run_aminoacid_tools | ||
|
|
||
def run_BioSeqTools(tool_name, *args):
    """Dispatch a call to one of the BioSeqTools entry points.

    Arguments:
    - tool_name: name of the tool to run; one of "run_aminoacid_tools",
      "run_dna_rna_tools" or "run_filter_fastq"
    - *args: positional arguments forwarded to the selected tool. For
      "run_aminoacid_tools" the last argument is passed as the
      ``operation`` keyword and all preceding ones as sequences.

    Return:
    - whatever the selected tool returns

    Raises:
    - ValueError: if tool_name is not one of the supported names, or if
      fewer than two arguments are supplied for "run_aminoacid_tools"
    """
    if tool_name == "run_filter_fastq":
        return run_filter_fastq(*args)
    if tool_name == "run_dna_rna_tools":
        return run_dna_rna_tools(*args)
    if tool_name != "run_aminoacid_tools":
        raise ValueError("Invalid tool_name")

    # At least one sequence plus the trailing operation name is required.
    if len(args) < 2:
        raise ValueError("Not enough arguments for run_aminoacid_tools")
    *sequences, operation = args
    return run_aminoacid_tools(*sequences, operation=operation)
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,224 @@ | ||
| from typing import Dict | ||
|
|
||
|
|
||
def calculate_percentage(seq: str) -> str:
    """
    Calculate the percentage of each amino acid in an amino acid sequence.

    Arguments:
    - seq (str): amino acid sequence to be analyzed

    Return:
    - str: a string containing the input sequence and a dict that maps
      every residue found in it to its share of the sequence in percent,
      rounded to two decimals. Residues are listed in order of first
      appearance. An empty sequence yields an empty dict.
    """
    # Local import keeps this function self-contained.
    from collections import Counter

    total_amino_acids = len(seq)
    # Counter keeps first-seen order, so the report order is unchanged
    # compared with counting by hand.
    amino_acid_percentages = {
        amino_acid: round((count / total_amino_acids) * 100, 2)
        for amino_acid, count in Counter(seq).items()
    }
    return f'Amino acids percentage of the sequence {seq}: {amino_acid_percentages}'
|
|
||
|
|
||
def calculate_molecular_weight(seq: str) -> str:
    """
    Calculate the molecular weight of an amino acid sequence.

    The weight is the sum of the per-residue weights from the table below
    plus 18.02 for the terminal H and OH.

    Arguments:
    - seq (str): amino acid sequence to be analyzed

    Return:
    - str: a string with the input sequence and its molecular weight
      rounded to two decimals, followed by the unit "Da"

    Raises:
    - KeyError: if the sequence contains a character missing from the
      weight table
    """
    residue_weights = dict(
        G=57.051, A=71.078, S=87.077, P=97.115, V=99.131,
        T=101.104, C=103.143, I=113.158, L=113.158, N=114.103,
        D=115.087, Q=128.129, K=128.172, E=129.114, M=131.196,
        H=137.139, F=147.174, R=156.186, Y=163.173, W=186.210,
    )
    total = 18.02  # for the H and OH at the termini
    for residue in seq:
        total = total + residue_weights[residue]
    rounded_total = round(total, 2)
    return f'Molecular weight of the sequence {seq}: {rounded_total} Da'
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Почему возвращаешь строку? )) |
||
|
|
||
|
|
||
def calculate_hydrophobicity_eisenberg(sequence):
    """
    Classify a sequence as hydrophilic, hydrophobic or neutral.

    Per-residue values from the table below are summed; characters that
    are not in the table contribute 0. A positive sum is reported as
    hydrophilic, a negative sum as hydrophobic and a zero sum as neutral.

    Arguments:
    - sequence (str): amino acid sequence to be analyzed

    Return:
    - str: "Sequence <sequence>: Hydrophilic", "... Hydrophobic" or
      "... Neutral"
    """
    # Amino acid hydrophilicity/hydrophobicity scale by Eisenberg
    hydrophobicity_values = {
        'A': 0.5, 'R': 0.65, 'N': 1.0, 'D': 1.3, 'C': -0.15,
        'Q': 1.0, 'E': 1.5, 'G': 0.75, 'H': 0.7, 'I': -1.3,
        'L': -1.3, 'K': 0.75, 'M': -1.1, 'F': -1.9, 'P': 0.55,
        'S': 0.6, 'T': 0.3, 'W': -0.5, 'Y': -1.65, 'V': -0.9
    }

    # Sum the scale values for all amino acids in the sequence. The table
    # has at most two decimals, so rounding removes binary floating-point
    # residue (e.g. 0.3 + 0.6 - 0.9 != 0.0) that would otherwise turn a
    # balanced sequence into a false hydrophobic/hydrophilic verdict.
    hydrophobicity_sum = round(
        sum(hydrophobicity_values.get(aa, 0) for aa in sequence), 6
    )

    # Determine hydrophilicity/hydrophobicity of the sequence
    if hydrophobicity_sum > 0:
        return f"Sequence {sequence}: Hydrophilic"
    elif hydrophobicity_sum < 0:
        return f"Sequence {sequence}: Hydrophobic"
    else:
        return f"Sequence {sequence}: Neutral"
|
|
||
|
|
||
def calculate_pI(sequence):
    """
    Estimate the isoelectric point of a sequence as a mean of pK values.

    The mean is taken over: the second pK of the first recognised residue
    (N-end), the first pK of the last recognised residue (C-end), and the
    third (side-chain) pK of every recognised residue that has one.
    Characters missing from the pK table are ignored.

    Arguments:
    - sequence (str): amino acid sequence to be analyzed

    Return:
    - str: a string with the input sequence and the computed value, or
      None when the sequence contains no recognised residue
    """
    # pK values per residue in the order (COO-, NH3+, R); taken from
    # http://www.sev-chem.narod.ru/spravochnik/piaminoacid.htm
    pK_values = {
        'A': (2.34, 9.60),
        'R': (2.17, 9.04, 12.48),
        'N': (2.02, 8.80),
        'D': (2.09, 9.82, 3.86),
        'C': (1.71, 8.33, 10.30),
        'Q': (2.17, 9.13),
        'E': (2.19, 9.76, 4.25),
        'G': (2.34, 9.60),
        'H': (1.82, 9.17, 6.00),
        'I': (2.32, 9.76),
        'L': (2.36, 9.60),
        'K': (2.18, 8.95, 10.5),
        'M': (2.28, 9.21),
        'F': (2.58, 9.24),
        'P': (2.00, 10.60),
        'S': (2.21, 9.15),
        'T': (2.63, 10.43),
        'W': (1.22, 9.39),
        'Y': (2.20, 9.11, 10.10),
        'V': (2.29, 9.72)
    }

    # Keep only the residues the table knows about, in sequence order.
    recognised = [pK_values[residue] for residue in sequence
                  if residue in pK_values]

    # Nothing to average: no amino acid sequence was supplied.
    if not recognised:
        return None

    # Terminal groups: second pK of the first residue, first pK of the last.
    terms = [recognised[0][1], recognised[-1][0]]
    # Side-chain (third) pK of every residue that carries one.
    terms.extend(pks[2] for pks in recognised if len(pks) >= 3)

    # Accumulate left to right so the floating-point result is stable.
    total_pK = terms[0] + terms[1]
    for side_chain_pK in terms[2:]:
        total_pK += side_chain_pK

    pI = total_pK / len(terms)
    return f"Isoelectric point for the sequence {sequence}: {pI}"
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Почему возвращаешь строку? )) |
||
|
|
||
|
|
||
def find_cleavage_sites(seq: str, motif: list) -> list:
    """Find cleavage sites for motif-specific proteases.

    Arguments:
    - seq - string sequence to be analyzed
    - motif - subsequence to be found in a sequence. Subsequence is
      specified as list of lists. Each nested list means more than one
      possible aminoacid at a single position (checked by OR condition).

    Return:
    - list of cleavage sites coordinates (C-end aminoacid of *potentially*
      cleaved sequence), i.e. match start index + motif length, in
      ascending order. Overlapping matches are all reported. An empty
      motif or a sequence shorter than the motif yields an empty list.
    """
    motif_length = len(motif)
    if motif_length == 0:
        return []

    # Only start positions that leave room for the whole motif are tried,
    # so a partial match at the end of the sequence can never index past
    # the last character.
    last_start = len(seq) - motif_length
    return [
        start + motif_length
        for start in range(last_start + 1)
        if all(
            seq[start + offset] in allowed
            for offset, allowed in enumerate(motif)
        )
    ]
|
|
||
|
|
||
# Recognition motifs used by get_cleavage_sites, keyed by protease name.
# Each motif is a list of positions; each position is the list of residues
# accepted at that position (any one of them matches).
motif_dict = {
    'Caspase 3': [['D'], ['M'], ['Q'], ['D']],
    'Caspase 6': [['V'], ['E'], ['H', 'I'], ['D']],
    'Caspase 7': [['D'], ['E'], ['V'], ['D']],
    'Enterokinase': [['D', 'E'], ['D', 'E'], ['D', 'E'], ['K']]
}
|
|
||
|
|
||
def get_cleavage_sites(seq: str) -> str:
    """Report cleavage sites for every protease listed in motif_dict.

    Arguments:
    - seq - string sequence to be analyzed

    Return:
    - multi-line string: the sequence itself on the first line, then one
      line per protease with the number of sites found and their
      coordinates. Every line ends with a newline.
    """
    report_lines = [seq]
    for protease, motif in motif_dict.items():
        found = find_cleavage_sites(seq, motif)
        report_lines.append(
            f'{len(found)} protease cleavage site(s) for {protease}: {found}'
        )
    return '\n'.join(report_lines) + '\n'
|
|
||
|
|
||
# One-letter codes (20 of them, upper case) that is_peptide accepts as
# valid residues.
all_aminoacids = {
    'A', 'R', 'N', 'D', 'C', 'H', 'G', 'Q', 'E', 'I',
    'L', 'K', 'M', 'P', 'S', 'Y', 'T', 'W', 'F', 'V'
}
|
|
||
|
|
||
def is_peptide(seq: str) -> bool:
    """Check that the incoming sequence consists only of known residues.

    Arguments:
    - seq - string sequence to be checked

    Return:
    - True when every character of seq is in all_aminoacids

    Raises:
    - ValueError: when seq contains any other character
    """
    if not set(seq) <= all_aminoacids:
        raise ValueError(f'Incoming sequence {seq} is not a peptide')
    return True
|
|
||
|
|
||
# Maps the operation name accepted by run_aminoacid_tools to the function
# that implements it.
operation_dict = {
    'get_cleavage_sites': get_cleavage_sites,
    'calculate_molecular_weight': calculate_molecular_weight,
    'calculate_percentage': calculate_percentage,
    'calculate_pI': calculate_pI,
    'calculate_hydrophobicity_eisenberg':
        calculate_hydrophobicity_eisenberg
}
|
|
||
|
|
||
def run_aminoacid_tools(*seqs: str, operation: str) -> str:
    """Run AminoAcid Tools.

    Arguments:
    - *seqs - one or more string sequences to be analyzed
    - operation - action to be done with sequence(s); must be a key of
      operation_dict

    Return:
    - string with the result of the operation for each sequence, each
      result followed by a blank line

    Raises:
    - ValueError: if operation is empty or unsupported, or if any
      sequence is rejected by is_peptide
    """
    if operation == '':
        raise ValueError('Operation value is not specified')
    if operation not in operation_dict:
        raise ValueError(f'Incorrect operation value\nSupported operations: {list(operation_dict.keys())}')

    # Validate every sequence up front so nothing is computed when one of
    # them is invalid.
    for candidate in seqs:
        is_peptide(candidate)

    handler = operation_dict[operation]
    return ''.join(handler(candidate) + '\n\n' for candidate in seqs)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,53 @@ | ||
# Swaps thymine and uracil in both directions, preserving letter case.
_TRANSCRIPTION_TABLE = str.maketrans('TtUu', 'UuTt')


def transcribe(dna_sequence):
    """Swap T and U in a nucleic acid sequence, preserving case.

    'T'/'t' become 'U'/'u' and 'U'/'u' become 'T'/'t'; every other
    character is copied unchanged.

    Arguments:
    - dna_sequence (str): sequence to convert

    Return:
    - str: the converted sequence
    """
    # A single C-level pass replaces the per-character if/elif chain and
    # repeated string concatenation.
    return dna_sequence.translate(_TRANSCRIPTION_TABLE)
|
|
||
def reverse(sequence):
    """Return the sequence with its elements in reverse order."""
    backwards = slice(None, None, -1)
    return sequence[backwards]
|
|
||
def complement(sequence):
    """Return the complementary sequence, preserving letter case.

    For a sequence without uracil, A<->T and C<->G are swapped. If the
    sequence contains 'U' or 'u' it is treated as RNA: A is paired with U
    instead of T. Characters without a known partner are copied unchanged.

    Arguments:
    - sequence (str): nucleic acid sequence

    Return:
    - str: the complementary sequence, same length and order as the input
    """
    pairs = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C',
             'a': 't', 't': 'a', 'c': 'g', 'g': 'c'}
    if 'U' in sequence or 'u' in sequence:
        # RNA: adenine pairs with uracil, so A must not become T.
        pairs.update({'A': 'U', 'U': 'A', 'a': 'u', 'u': 'a'})
    return ''.join(pairs.get(base, base) for base in sequence)


def reverse_complement(dna_sequence):
    """Return the reverse complement of a sequence.

    Arguments:
    - dna_sequence (str): nucleic acid sequence

    Return:
    - str: complement(dna_sequence) read in the opposite direction
    """
    # Delegates to complement() so both functions share one pairing table.
    return complement(dna_sequence)[::-1]
|
|
||
def run_dna_rna_tools(*args):
    """Apply one action to one or more nucleic acid sequences.

    Arguments:
    - *args: any number of sequences followed by the action name as the
      last argument. The action is matched case-insensitively against
      'transcribe', 'reverse', 'complement' and 'reverse_complement'.

    Return:
    - the message "There are no function arguments" when called without
      arguments
    - otherwise one result per sequence: the processed sequence, or
      "Invalid sequence: <sequence>" if it contains characters other than
      A, C, G, T, U (any case), or "Invalid action: <action>" for an
      unknown action. A single result is returned bare; zero or several
      results are returned as a list.
    """
    if not args:
        return "There are no function arguments"

    *sequences, action = args
    action = action.lower()
    allowed_bases = set('ACGTU')

    results = []
    for sequence in sequences:
        # Sequence validity is checked before the action is looked at.
        if set(sequence.upper()) - allowed_bases:
            outcome = f"Invalid sequence: {sequence}"
        elif action == 'transcribe':
            outcome = transcribe(sequence)
        elif action == 'reverse':
            outcome = reverse(sequence)
        elif action == 'complement':
            outcome = complement(sequence)
        elif action == 'reverse_complement':
            outcome = reverse_complement(sequence)
        else:
            outcome = f"Invalid action: {action}"
        results.append(outcome)

    if len(results) == 1:
        return results[0]
    return results
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,20 @@ | ||
def run_filter_fastq(seqs, gc_bounds=(0, 100), length_bounds=(0, 2**32), quality_threshold=0):
    """Filter reads by GC content, length and mean quality.

    Arguments:
    - seqs (dict): maps read name to a (sequence, quality) pair of strings
    - gc_bounds: (min, max) GC content in percent, both inclusive
    - length_bounds: (min, max) sequence length, both inclusive
    - quality_threshold: minimum mean quality; each quality character is
      scored as ord(char) - 33

    Return:
    - dict with the same structure as seqs, containing only the reads
      that satisfy all three conditions

    Notes:
    - G and C are counted case-insensitively.
    - An empty sequence has GC content 0 and an empty quality string has
      mean quality 0, instead of raising ZeroDivisionError.
    """
    filtered_seqs = {}

    for name, (sequence, quality) in seqs.items():
        # Estimate GC-content (percent); guard against an empty read.
        if sequence:
            upper_sequence = sequence.upper()
            gc_count = upper_sequence.count('G') + upper_sequence.count('C')
            gc_content = gc_count / len(sequence) * 100
        else:
            gc_content = 0.0

        # Estimate average quality; guard against an empty quality string.
        if quality:
            avg_quality = sum(ord(q) - 33 for q in quality) / len(quality)
        else:
            avg_quality = 0.0

        # Filtering by user's conditions
        if (
            gc_bounds[0] <= gc_content <= gc_bounds[1]
            and length_bounds[0] <= len(sequence) <= length_bounds[1]
            and avg_quality >= quality_threshold
        ):
            filtered_seqs[name] = (sequence, quality)

    return filtered_seqs
|
|
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Почему возвращаешь строку? ))