Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
323 changes: 14 additions & 309 deletions README.md

Large diffs are not rendered by default.

401 changes: 401 additions & 0 deletions Showcases.ipynb

Large diffs are not rendered by default.

606 changes: 606 additions & 0 deletions additional_modules.py

Large diffs are not rendered by default.

85 changes: 81 additions & 4 deletions bio_files_processor.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,76 @@
import os
from dataclasses import dataclass


@dataclass
class FastaRecord:
    """A single FASTA entry: identifier, description and sequence."""

    # Identifier stored without the leading ">" marker.
    fasta_id: str
    # Free text that follows the identifier on the header line.
    description: str
    # Sequence text of the record.
    seq: str

    def __repr__(self):
        """Return a three-line, human-readable summary of the record."""
        id_part = f"FastaRecord:\nid = {self.fasta_id} \n"
        description_part = f"description = {self.description} \n"
        sequence_part = f"sequence = {self.seq}"
        return id_part + description_part + sequence_part


class OpenFasta:
    """Context manager and iterator yielding ``FastaRecord`` objects.

    Usage::

        with OpenFasta("genes.fasta") as fasta:
            for record in fasta:
                ...

    The file is read lazily, one record per ``read_record`` call. The header
    of the *next* record is consumed while the current one is being read, so
    it is cached in ``_id`` / ``_description`` between calls.
    """

    def __init__(self, file_path, mode='r'):
        """Remember the path and mode; the file is opened in ``__enter__``."""
        self.file_path = file_path
        self.mode = mode
        self.file = None
        # Last meaningful line read; None means "nothing read yet",
        # an empty string means "end of file reached".
        self._current_line = None
        # Header parts of the record that will be returned next.
        self._id = None
        self._description = None

    def __enter__(self):
        """Open the file and return this reader."""
        self.file = open(self.file_path, self.mode)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the file if it was opened; exceptions are not suppressed."""
        if self.file is not None:
            self.file.close()

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next record or raise ``StopIteration`` at end of file."""
        record = self.read_record()
        if record is None:
            raise StopIteration
        return record

    def _next_content_line(self):
        """Return the next non-blank line, stripped, or "" at end of file.

        ``readline`` returns an empty string only at end of file, while a
        blank line still carries its newline. Checking the raw value before
        stripping keeps blank lines from being mistaken for end of file.
        """
        while True:
            raw_line = self.file.readline()
            if not raw_line:
                return ""
            stripped = raw_line.strip()
            if stripped:
                return stripped

    def _store_header(self, header):
        """Split a ``>id description`` line into the cached id/description.

        ``partition`` tolerates headers without a description, leaving the
        description as an empty string instead of failing to unpack.
        """
        self._id, _, self._description = header.partition(" ")

    def read_record(self):
        """Read and return the next ``FastaRecord``.

        Returns:
            The next record, or ``None`` once the end of the file is reached.

        Raises:
            ValueError: if the first non-blank line is not a ``>`` header.
        """
        if self._current_line is None:
            # First call: prime the reader with the first header line.
            self._current_line = self._next_content_line()
            if self._current_line.startswith(">"):
                self._store_header(self._current_line)
        if not self._current_line:
            return None
        if self._id is None:
            raise ValueError(
                f"{self.file_path} does not start with a FASTA header "
                f"line ('>')."
            )

        # Capture the current header before the loop overwrites the cache
        # with the header of the following record.
        fasta_id = self._id[1:]  # drop the leading ">"
        description = self._description
        seq_chunks = []
        while True:
            self._current_line = self._next_content_line()
            if not self._current_line:
                break
            if self._current_line.startswith(">"):
                self._store_header(self._current_line)
                break
            seq_chunks.append(self._current_line)
        return FastaRecord(fasta_id=fasta_id,
                           description=description,
                           seq="".join(seq_chunks))

    def read_records(self):
        """Read all remaining records and return them as a list."""
        # iter(callable, sentinel) calls read_record until it returns None.
        return list(iter(self.read_record, None))


def convert_multiline_fasta_to_oneline(input_fasta: str,
Expand All @@ -17,14 +89,18 @@ def convert_multiline_fasta_to_oneline(input_fasta: str,
output_fasta = output_fasta.split('.')
output_fasta = f'{output_fasta[0]}_oneline.{output_fasta[1]}'

first_line = True
with open(input_fasta, 'r', encoding='utf-8') as input_file:
try:
with open(output_fasta, 'x', encoding='utf-8') as output_file:
output_file.write(input_file.readline())
for line in input_file.readlines():
if line.startswith('>'):
output_file.write('\n')
output_file.write(line)
if first_line:
output_file.write(line)
first_line = False
else:
output_file.write('\n')
output_file.write(line)
else:
output_file.write(line.strip())
except FileExistsError:
Expand Down Expand Up @@ -93,7 +169,8 @@ def select_genes_from_gbk_to_fasta(input_gbk: str,
if right_border >= len(input_dict_position):
right_border = len(input_dict_position)
for key, value in input_dict_position.items():
if value in range(left_border, right_border+1) and key != gene:
if value in range(left_border,
right_border + 1) and key != gene:
neighbours_of_gene[key] = input_dict_sequences[key]
else:
raise ValueError(f'The gene {gene} is not in the data.')
Expand Down
Loading