yvolko · yvolko · Apr 24, 2024 · Apr 24, 2024 · Apr 24, 2024 · Apr 25, 2024
diff --git a/README.md b/README.md
diff --git a/Showcases.ipynb b/Showcases.ipynb
diff --git a/additional_modules.py b/additional_modules.py
diff --git a/bio_files_processor.py b/bio_files_processor.py
@@ -1,4 +1,76 @@
 import os
+from dataclasses import dataclass
+
+
+@dataclass
+class FastaRecord:
+    """A single FASTA entry: identifier, free-text description and sequence."""
+
+    fasta_id: str
+    description: str
+    seq: str
+
+    def __repr__(self) -> str:
+        """Return a multi-line, human-readable summary of the record."""
+        return (
+            f"FastaRecord:\nid = {self.fasta_id} \n"
+            f"description = {self.description} \n"
+            f"sequence = {self.seq}"
+        )
+
+
+class OpenFasta:
+    """Context manager and iterator that reads a FASTA file record by record.
+
+    Typical use::
+
+        with OpenFasta("sequences.fasta") as fasta:
+            for record in fasta:
+                print(record)
+
+    A line starting with ``>`` opens a record. Its text up to the first
+    space is the identifier and the remainder is the description. Every
+    following line up to the next header (or end of file) is concatenated
+    into the record's sequence. Blank lines are ignored.
+    """
+
+    def __init__(self, file_path, mode='r'):
+        """Remember where and how to open the file; nothing is opened yet.
+
+        Args:
+            file_path: path of the FASTA file to read.
+            mode: mode string passed to ``open``.
+        """
+        self.file_path = file_path
+        self.mode = mode
+        self.file = None
+        # Header of the record to be returned next: None before the first
+        # read, "" once the end of the file has been reached.
+        self._current_line = None
+        self._id = None
+        self._description = None
+
+    def __enter__(self):
+        """Open the file and return this reader."""
+        self.file = open(self.file_path, self.mode)
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Close the file; exceptions from the ``with`` body propagate."""
+        if self.file is not None:
+            self.file.close()
+
+    def __iter__(self):
+        """Return the reader itself; records are produced by ``__next__``."""
+        return self
+
+    def __next__(self):
+        """Return the next record or raise ``StopIteration`` at end of file."""
+        record = self.read_record()
+        if record is None:
+            raise StopIteration
+        return record
+
+    def _next_content_line(self):
+        """Return the next non-blank line, stripped, or "" at end of file."""
+        for raw_line in self.file:
+            line = raw_line.strip()
+            if line:
+                return line
+        return ""
+
+    def _parse_header(self, header):
+        """Split a ``>`` header line into the stored id and description."""
+        # partition() never fails: a header without a description simply
+        # yields an empty description instead of raising on unpacking.
+        self._id, _, self._description = header.partition(" ")
+
+    def read_record(self):
+        """Read one record from the current position in the file.
+
+        Returns:
+            A ``FastaRecord``, or ``None`` when no records are left.
+
+        Raises:
+            ValueError: if the first non-blank line of the file is not a
+                header line starting with ``>``.
+        """
+        if self._current_line is None:
+            # First call: position the reader on the first header.
+            self._current_line = self._next_content_line()
+            if self._current_line:
+                if not self._current_line.startswith(">"):
+                    raise ValueError(
+                        f"{self.file_path} does not start with a FASTA "
+                        f"header line ('>')"
+                    )
+                self._parse_header(self._current_line)
+        if not self._current_line:
+            return None
+
+        # Capture the pending header before the loop below overwrites it
+        # with the header of the following record.
+        fasta_id = self._id[1:]
+        description = self._description
+        seq_parts = []
+        while True:
+            self._current_line = self._next_content_line()
+            if not self._current_line:
+                break
+            if self._current_line.startswith(">"):
+                self._parse_header(self._current_line)
+                break
+            seq_parts.append(self._current_line)
+        return FastaRecord(fasta_id=fasta_id,
+                           description=description,
+                           seq="".join(seq_parts))
+
+    def read_records(self):
+        """Read all remaining records and return them as a list."""
+        records = []
+        while True:
+            record = self.read_record()
+            if record is None:
+                break
+            records.append(record)
+        return records
 
 
 def convert_multiline_fasta_to_oneline(input_fasta: str,
@@ -17,14 +89,18 @@ def convert_multiline_fasta_to_oneline(input_fasta: str,
         output_fasta = output_fasta.split('.')
         output_fasta = f'{output_fasta[0]}_oneline.{output_fasta[1]}'
 
+    first_line = True
     with open(input_fasta, 'r', encoding='utf-8') as input_file:
         try:
             with open(output_fasta, 'x', encoding='utf-8') as output_file:
-                output_file.write(input_file.readline())
                 for line in input_file.readlines():
                     if line.startswith('>'):
-                        output_file.write('\n')
-                        output_file.write(line)
+                        if first_line:
+                            output_file.write(line)
+                            first_line = False
+                        else:
+                            output_file.write('\n')
+                            output_file.write(line)
                     else:
                         output_file.write(line.strip())
         except FileExistsError:
@@ -93,7 +169,8 @@ def select_genes_from_gbk_to_fasta(input_gbk: str,
             if right_border >= len(input_dict_position):
                 right_border = len(input_dict_position)
             for key, value in input_dict_position.items():
-                if value in range(left_border, right_border+1) and key != gene:
+                if value in range(left_border,
+                                  right_border + 1) and key != gene:
                     neighbours_of_gene[key] = input_dict_sequences[key]
         else:
             raise ValueError(f'The gene {gene} is not in the data.')