LabStrangeLoop · ayeganov · Aug 29, 2025 · Aug 29, 2025 · Sep 1, 2025 · Sep 1, 2025
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -0,0 +1,15 @@
+name: Tests
+on: [pull_request]
+jobs:
+  unit_tests:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: astral-sh/setup-uv@v4
+        with:
+          version: "latest"
+      - uses: actions/setup-python@v4
+        with:
+          python-version: "3.12"
+      - run: uv sync --group dev
+      - run: uv run pytest tests
diff --git a/README.md b/README.md
@@ -19,8 +19,8 @@ repo is educational, so the aim is to keep the code as legible as possible.
 
 [x] Switch to uv
 [x] Make it easy to modify with a config file
-[] Extract the loss calculation from the model
-[] Rename main to train
+[x] Extract the loss calculation from the model
+[x] Rename main to train
 [] Create an easy to use interface
 [] Create or check tokenizer interface
 [] Make it into a package

diff --git a/pyproject.toml b/pyproject.toml
@@ -28,7 +28,6 @@ dev = [
 ]
 
 [tool.pytest.ini_options]
-asyncio_mode = "auto"
 
 [tool.mypy]
 python_version = "3.12"
@@ -73,6 +72,6 @@ requires = ["hatchling"]
 build-backend = "hatchling.build"
 
 [project.scripts]
-train = "scratchgpt.main:main"
+train = "scratchgpt.train:main"
 infer = "scratchgpt.infer:main"
 tiktoken = "scratchgpt.tokenizer.tiktoken:main"
diff --git a/scratchgpt/dataloader.py b/scratchgpt/dataloader.py
@@ -1,10 +1,12 @@
 from abc import ABC, abstractmethod
 from pathlib import Path
-from typing import Literal, override
+from typing import override
 
+import numpy as np
 import torch
 from torch import Tensor
 from torch.utils.data import Dataset
+from tqdm import tqdm
 
 from .tokenizer.base_tokenizer import Tokenizer
 
@@ -40,17 +42,13 @@ def __init__(self, dir_path: Path) -> None:
             raise ValueError(f"Directory path {dir_path} is not a directory")
 
         self._data = ""
+        file_paths = list(dir_path.rglob("*"))
         print(f"Loading data from {dir_path}")
-        total_read: int = 0
-        for idx, file_path in enumerate(dir_path.rglob("*")):
+        for file_path in tqdm(file_paths, desc="Reading data files", unit="file"):
             if file_path.is_file() and not file_path.name.startswith("."):
                 with open(file_path, encoding="utf-8") as f:
                     self._data += f.read() + "\n"
 
-            if idx % 500 == 1:
-                total_read += 500
-                print(f"Read {total_read} files")
-
         print("Data Loaded")
 
     @override
@@ -64,32 +62,39 @@ def __init__(
         text_provider: TextProvider,
         tokenizer: Tokenizer,
         block_size: int,
-        split: Literal["train", "validation", "test"],
-        train_ratio: float = 0.8,
-        val_ratio: float = 0.1,
     ) -> None:
         self.tokenizer = tokenizer
         self.block_size = block_size
 
         self.data = torch.tensor(self.tokenizer.encode(text_provider.get_text()), dtype=torch.long)
 
-        total_size = len(self.data)
-        train_size = int(total_size * train_ratio)
-        val_size = int(total_size * val_ratio)
-
-        if split == "train":
-            self.data = self.data[:train_size]
-        elif split == "validation":
-            self.data = self.data[train_size : train_size + val_size]
-        elif split == "test":
-            self.data = self.data[train_size + val_size :]
-        else:
-            raise ValueError(f"Invalid split: {split}. Must be 'train', 'validation', or 'test'.")
-
     def __len__(self) -> int:
         return len(self.data) - self.block_size
 
     def __getitem__(self, idx: int) -> tuple[Tensor, Tensor]:
         block = self.data[idx : idx + self.block_size]
         target = self.data[idx + 1 : idx + self.block_size + 1]
         return block, target
+
+
+class PretokenizedDataset(Dataset[tuple[Tensor, Tensor]]):
+    def __init__(
+        self,
+        token_file: Path,
+        block_size: int,
+        dtype: np.dtype,
+    ) -> None:
+        super().__init__()
+        self.block_size = block_size
+
+        all_tokens = np.memmap(token_file, dtype=dtype, mode="c")
+        self.data = torch.from_numpy(all_tokens)
+
+    def __len__(self) -> int:
+        return max(0, len(self.data) - self.block_size)
+
+    def __getitem__(self, idx: int) -> tuple[Tensor, Tensor]:
+        block = self.data[idx : idx + self.block_size]
+        target = self.data[idx + 1 : idx + self.block_size + 1]
+
+        return block.long(), target.long()
diff --git a/scratchgpt/preprocess.py b/scratchgpt/preprocess.py
@@ -0,0 +1,137 @@
+import io
+from pathlib import Path
+from typing import Any, Protocol
+
+import numpy as np
+from numpy.typing import DTypeLike
+from tqdm import tqdm
+
+from .tokenizer.base_tokenizer import Tokenizer
+
+
+class SupportsUpdate(Protocol):
+    def update(self, n: int) -> Any: ...
+
+
+class Preprocessor(Protocol):
+    """
+    Preprocessor protocol for handling dataset conversion using a specific tokenizer.
+    """
+
+    def __call__(
+        self,
+        source: io.TextIOBase,
+        sink: io.BufferedIOBase,
+        chunk_size: int,
+        pbar: SupportsUpdate | None = None,
+    ) -> None:
+        """
+        Process the input text source and write the result to the binary sink.
+        Optionally updates a tqdm progress bar.
+        """
+
+
+class FilePreprocessor(Protocol):
+    """
+    Preprocessor that deals specifically with file system io.
+    """
+
+    def __call__(self, input_path: Path, output_path: Path, chunk_size: int = 10 * 1024 * 1024) -> None:
+        """
+        Process input and output paths
+        """
+
+
+class TokenizerPreprocessor(Preprocessor):
+    """
+    Default pre-processor. Tokenizes a text stream and writes the output
+    to a binary stream, managing progress updates internally.
+    """
+
+    def __init__(self, tokenizer: Tokenizer) -> None:
+        self.tokenizer = tokenizer
+        vocab_size = self.tokenizer.vocab_size
+        if vocab_size < 2**8:
+            self.dtype: DTypeLike = np.uint8
+        elif vocab_size < 2**16:
+            self.dtype = np.uint16
+        elif vocab_size < 2**32:
+            self.dtype = np.uint32
+        else:
+            self.dtype = np.uint64
+        print(f"Preprocessor initialized. Selected {np.dtype(self.dtype).name} for token storage.")
+
+    def __call__(
+        self,
+        source: io.TextIOBase,
+        sink: io.BufferedIOBase,
+        chunk_size: int = 10 * 1024 * 1024,
+        pbar: SupportsUpdate | None = None,
+    ) -> None:
+        """
+        Reads from the source stream, tokenizes content in chunks, writes to the
+        sink stream, and updates the provided progress bar.
+        """
+        while chunk := source.read(chunk_size):
+            tokens = self.tokenizer.encode(chunk)
+            token_array = np.array(tokens, dtype=self.dtype)
+            sink.write(token_array.tobytes())
+            if pbar:
+                pbar.update(len(chunk.encode("utf-8", errors="ignore")))
+
+
+class File2FileTokenizerPreprocessor:
+    """
+    Orchestrates preprocessing for a single source file to a single destination file.
+    """
+
+    def __init__(self, tokenizer: Tokenizer) -> None:
+        self._preprocessor = TokenizerPreprocessor(tokenizer)
+
+    def __call__(self, input_path: Path, output_path: Path, chunk_size: int = 10 * 1024 * 1024) -> None:
+        if not input_path.is_file():
+            raise ValueError(f"Input path must be a file: {input_path}")
+        if output_path.exists():
+            raise FileExistsError(f"Output path already exists: {output_path}")
+
+        total_size = input_path.stat().st_size
+
+        with (
+            open(input_path, encoding="utf-8", errors="ignore") as source,
+            open(output_path, "wb") as sink,
+            tqdm(total=total_size, unit="B", unit_scale=True, desc=f"Tokenizing {input_path.name}") as pbar,
+        ):
+            self._preprocessor(source, sink, chunk_size, pbar)
+
+        print(f"Successfully preprocessed '{input_path}' to '{output_path}'")
+
+
+class Folder2FileTokenizerPreprocessor:
+    """
+    Orchestrates preprocessing for a directory of source files to a single destination file.
+    """
+
+    def __init__(self, tokenizer: Tokenizer) -> None:
+        self._preprocessor = TokenizerPreprocessor(tokenizer)
+
+    def __call__(self, input_path: Path, output_path: Path, chunk_size: int = 10 * 1024 * 1024) -> None:
+        if not input_path.is_dir():
+            raise ValueError(f"Input path must be a directory: {input_path}")
+        if output_path.exists():
+            raise FileExistsError(f"Output path already exists: {output_path}")
+
+        files_to_process = [p for p in input_path.rglob("*") if p.is_file() and not p.name.startswith(".")]
+        total_size = sum(p.stat().st_size for p in files_to_process)
+
+        print(f"Found {len(files_to_process)} files to process.")
+
+        with (
+            open(output_path, "wb") as sink,
+            tqdm(total=total_size, unit="B", unit_scale=True, desc=f"Tokenizing Folder '{input_path.name}'") as pbar,
+        ):
+            for file_path in files_to_process:
+                pbar.set_postfix_str(f"Processing: {file_path.name}", refresh=True)
+                with open(file_path, encoding="utf-8", errors="ignore") as source:
+                    self._preprocessor(source, sink, chunk_size, pbar)
+
+        print(f"\nSuccessfully preprocessed folder '{input_path}' to '{output_path}'")