From 397f7d943de56016f941535f78b26715beef01b2 Mon Sep 17 00:00:00 2001 From: popiemon Date: Wed, 5 Mar 2025 01:42:41 +0000 Subject: [PATCH 01/32] add: uv pyproject.toml --- .python-version | 1 + pyproject.toml | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+) create mode 100644 .python-version create mode 100644 pyproject.toml diff --git a/.python-version b/.python-version new file mode 100644 index 0000000..c8cfe39 --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.10 diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..d1ff0ba --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,22 @@ +[project] +name = "semikong" +version = "0.1.0" +description = "Add your description here" +readme = "README.md" +authors = [ + { name = "popiemon", email = "kryo.over@gmail.com" } +] +requires-python = ">=3.10" +dependencies = [ + "datasets>=3.3.2", + "loguru>=0.7.3", + "numpy>=2.2.3", + "peft>=0.14.0", + "transformers>=4.49.0", + "trl>=0.15.2", + "vllm>=0.5.0.post1", +] + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" From b9884e3fe743e936f9a77f09e6155998ac5d8cf8 Mon Sep 17 00:00:00 2001 From: popiemon Date: Wed, 5 Mar 2025 01:43:24 +0000 Subject: [PATCH 02/32] add: src --- src/semikong/__init__.py | 2 ++ src/semikong/py.typed | 0 2 files changed, 2 insertions(+) create mode 100644 src/semikong/__init__.py create mode 100644 src/semikong/py.typed diff --git a/src/semikong/__init__.py b/src/semikong/__init__.py new file mode 100644 index 0000000..96900f8 --- /dev/null +++ b/src/semikong/__init__.py @@ -0,0 +1,2 @@ +def hello() -> str: + return "Hello from workspace!" 
diff --git a/src/semikong/py.typed b/src/semikong/py.typed new file mode 100644 index 0000000..e69de29 From 27bb4857f4bf55041faf134b789bb32166f243b0 Mon Sep 17 00:00:00 2001 From: popiemon Date: Wed, 5 Mar 2025 01:51:41 +0000 Subject: [PATCH 03/32] add: Dockerfile --- Dockerfile | 21 +++++++++++++++++++++ compose.yaml | 18 ++++++++++++++++++ 2 files changed, 39 insertions(+) create mode 100644 Dockerfile create mode 100644 compose.yaml diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..7b604d5 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,21 @@ +FROM pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime + +# The installer requires curl (and certificates) to download the release archive +RUN apt-get update && apt-get install -y --no-install-recommends \ + curl \ + ca-certificates \ + git \ + vim + +# Download the latest installer +ADD https://astral.sh/uv/install.sh /uv-installer.sh + +# Run the installer then remove it +RUN sh /uv-installer.sh && rm /uv-installer.sh + +# Ensure the installed binary is on the `PATH` +ENV PATH="/opt/conda/bin:$PATH" +ENV UV_PROJECT_ENVIRONMENT="/opt/conda" +ENV UV_SYSTEM_PYTHON=true + +WORKDIR /workspace \ No newline at end of file diff --git a/compose.yaml b/compose.yaml new file mode 100644 index 0000000..e443f56 --- /dev/null +++ b/compose.yaml @@ -0,0 +1,18 @@ +services: + semikong: + build: + context: . 
+ dockerfile: Dockerfile + image: semikong + container_name: semikong + tty: true + stdin_open: true + volumes: + - ./:/workspace + working_dir: /workspace + deploy: + resources: + reservations: + devices: + - driver: nvidia + capabilities: [gpu] \ No newline at end of file From 5965d75883b3983c1b2ea26767adef9110417090 Mon Sep 17 00:00:00 2001 From: popiemon Date: Wed, 5 Mar 2025 01:52:35 +0000 Subject: [PATCH 04/32] add: devcontainer.json --- .devcontainer/devcontainer.json | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 .devcontainer/devcontainer.json diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 0000000..e5bf5bb --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,24 @@ +{ + "name": "semikong", + "dockerComposeFile": "../compose.yaml", + "service": "semikong", + "workspaceFolder": "/workspace", + "customizations": { + "vscode": { + "extensions": [ + "GitHub.copilot-chat", + "codezombiech.gitignore", + "GitHub.copilot", + "saoudrizwan.claude-dev", + "njpwerner.autodocstring", + "ms-python.mypy-type-checker", + "ms-python.vscode-pylance", + "ms-python.python", + "ms-python.debugpy", + "ms-python.isort", + "ms-python.black-formatter" + ] + } + }, + } + From 83399c92092d3520f79749984eab6a7517af2dd2 Mon Sep 17 00:00:00 2001 From: popiemon Date: Wed, 5 Mar 2025 01:52:58 +0000 Subject: [PATCH 05/32] fix: toml --- pyproject.toml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d1ff0ba..56a1b5d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,6 +8,7 @@ authors = [ ] requires-python = ">=3.10" dependencies = [ + "bitsandbytes>=0.45.3", "datasets>=3.3.2", "loguru>=0.7.3", "numpy>=2.2.3", @@ -17,6 +18,3 @@ dependencies = [ "vllm>=0.5.0.post1", ] -[build-system] -requires = ["hatchling"] -build-backend = "hatchling.build" From 76f8caab363020128b084f1644f909d26c478db0 Mon Sep 17 00:00:00 2001 From: popiemon 
Date: Wed, 5 Mar 2025 02:14:13 +0000 Subject: [PATCH 06/32] feat: ignore uv.lock --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 82f9275..9e82ec1 100644 --- a/.gitignore +++ b/.gitignore @@ -160,3 +160,5 @@ cython_debug/ # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ + +*.lock From 0c0ed7675998de8745b79abb2bbf1eaad67860b2 Mon Sep 17 00:00:00 2001 From: popiemon Date: Wed, 5 Mar 2025 02:14:27 +0000 Subject: [PATCH 07/32] add: .env.sample --- .env.sample | 1 + 1 file changed, 1 insertion(+) create mode 100644 .env.sample diff --git a/.env.sample b/.env.sample new file mode 100644 index 0000000..900692b --- /dev/null +++ b/.env.sample @@ -0,0 +1 @@ +HF_TOKEN= \ No newline at end of file From d8d703f7d19e73899e4c230702f92c93bf31e940 Mon Sep 17 00:00:00 2001 From: popiemon Date: Wed, 5 Mar 2025 02:14:45 +0000 Subject: [PATCH 08/32] fix: delete src --- src/semikong/__init__.py | 2 -- src/semikong/py.typed | 0 2 files changed, 2 deletions(-) delete mode 100644 src/semikong/__init__.py delete mode 100644 src/semikong/py.typed diff --git a/src/semikong/__init__.py b/src/semikong/__init__.py deleted file mode 100644 index 96900f8..0000000 --- a/src/semikong/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -def hello() -> str: - return "Hello from workspace!" 
diff --git a/src/semikong/py.typed b/src/semikong/py.typed deleted file mode 100644 index e69de29..0000000 From 4b3442be7c3b25691a6abaf27485afb9513b7849 Mon Sep 17 00:00:00 2001 From: popiemon Date: Wed, 5 Mar 2025 02:15:07 +0000 Subject: [PATCH 09/32] add: update dependencies to include huggingface-hub --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 56a1b5d..9011cb6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,6 +10,7 @@ requires-python = ">=3.10" dependencies = [ "bitsandbytes>=0.45.3", "datasets>=3.3.2", + "huggingface-hub>=0.29.1", "loguru>=0.7.3", "numpy>=2.2.3", "peft>=0.14.0", From d8aa952993db961f42654ab4a46ce09e2f8504fc Mon Sep 17 00:00:00 2001 From: popiemon Date: Wed, 5 Mar 2025 03:51:19 +0000 Subject: [PATCH 10/32] feat: ignore results --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 9e82ec1..5b4a620 100644 --- a/.gitignore +++ b/.gitignore @@ -162,3 +162,4 @@ cython_debug/ #.idea/ *.lock +results/ From e5f0946f5447331f03ec7a0db3ff47737e49284e Mon Sep 17 00:00:00 2001 From: popiemon Date: Wed, 5 Mar 2025 04:12:48 +0000 Subject: [PATCH 11/32] feat: pft success --- configs/training-config.yaml | 10 +++++----- training.py | 20 +++++++++++--------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/configs/training-config.yaml b/configs/training-config.yaml index b6db6fe..a7494a0 100644 --- a/configs/training-config.yaml +++ b/configs/training-config.yaml @@ -1,18 +1,18 @@ model: - model_name: "model_path_folder_or_model_name_hf" - new_model: "semikong-8b" + model_name: "TinyLlama/TinyLlama-1.1B-Chat-v1.0" + new_model: "semikong-1.1b" use_4bit: True use_nested_quant: False bnb_4bit_compute_dtype: "float16" bnb_4bit_quant_type: "nf4" device_map: {"": 0} output_dir: "./results" - dataset_name: "dataset_path_folder_or_dataset_name_hf" + dataset_name: "pentagoniac/SemiKong_Training_Datset" training: num_train_epochs: 2 - 
per_device_train_batch_size: 4 - per_device_eval_batch_size: 4 + per_device_train_batch_size: 3 + per_device_eval_batch_size: 3 gradient_accumulation_steps: 1 learning_rate: 2e-4 max_grad_norm: 0.3 diff --git a/training.py b/training.py index 077572f..647535b 100644 --- a/training.py +++ b/training.py @@ -4,7 +4,8 @@ import yaml from datasets import load_dataset from peft import LoraConfig -from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig, TrainingArguments) from trl import SFTTrainer @@ -51,6 +52,7 @@ def configure_lora(config): r=lora_params["lora_r"], bias="none", task_type="CAUSAL_LM", + target_modules=["q_proj", "v_proj"], ) @@ -85,11 +87,14 @@ def main(): # Load and process the dataset dataset_name = config["model"]["dataset_name"] - dataset = load_dataset("json", data_files=dataset_name, split="train") + dataset_dict = load_dataset(dataset_name) + dataset = dataset_dict["train"] dataset = dataset.shuffle(seed=42) - dataset = dataset.select(range(50)) # Optional: select first 50 rows for demo - dataset = dataset.map(lambda sample: template_dataset(sample, tokenizer), remove_columns=list(dataset.features)) - + dataset = dataset.map( + lambda sample: template_dataset(sample, tokenizer), + remove_columns=["instruction", "input", "output"] # 必要に応じて追加 + ) + # Set up training arguments training_arguments = TrainingArguments( output_dir=config["model"]["output_dir"], @@ -98,7 +103,7 @@ def main(): optim=config["training"]["optim"], save_steps=config["training"]["save_steps"], logging_steps=config["training"]["logging_steps"], - learning_rate=config["training"]["learning_rate"], + learning_rate=float(config["training"]["learning_rate"]), fp16=config["training"]["fp16"], bf16=config["training"]["bf16"], max_grad_norm=config["training"]["max_grad_norm"], @@ -113,11 +118,8 @@ def main(): model=model, train_dataset=dataset, 
peft_config=peft_config, - dataset_text_field="text", - max_seq_length=config["training"]["max_seq_length"], tokenizer=tokenizer, args=training_arguments, - packing=config["training"]["packing"], ) trainer.train() From 643d9dd86be436bcc4d7c82258e4ac5fd50705b7 Mon Sep 17 00:00:00 2001 From: popiemon Date: Wed, 5 Mar 2025 04:14:37 +0000 Subject: [PATCH 12/32] add: dev group --- pyproject.toml | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 9011cb6..98aa265 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ version = "0.1.0" description = "Add your description here" readme = "README.md" authors = [ - { name = "popiemon", email = "kryo.over@gmail.com" } + { name = "", email = "" } ] requires-python = ">=3.10" dependencies = [ @@ -19,3 +19,12 @@ dependencies = [ "vllm>=0.5.0.post1", ] +[dependency-groups] +dev = [ + "black>=25.1.0", + "isort>=6.0.1", + "pre-commit>=4.1.0", + "pylint>=3.3.4", + "tqdm>=4.67.1", +] + From 645a25e8d9db0df3b32cbb6592fb7c00275545b3 Mon Sep 17 00:00:00 2001 From: popiemon Date: Wed, 5 Mar 2025 04:23:36 +0000 Subject: [PATCH 13/32] docs: Docker --- README.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 98bce2f..92cd7e9 100644 --- a/README.md +++ b/README.md @@ -323,7 +323,12 @@ You can perform inference with SEMIKONG chat or base models as below.

### Quick start - Docker -TBA +You should install Docker and VSCode. + +Install the Dev Containers extension. +Please select "Reopen in Container." + +You should now be able to work with the SEMIKONG project inside the Docker container using VSCode.

[ Back to top ⬆️ ] From 6de1bd621953bbc176dd2cbabbfd6031a255f0ad Mon Sep 17 00:00:00 2001 From: popiemon Date: Wed, 5 Mar 2025 12:51:59 +0000 Subject: [PATCH 14/32] feat: remove_columns --- training.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/training.py b/training.py index 647535b..8a607d1 100644 --- a/training.py +++ b/training.py @@ -92,9 +92,9 @@ def main(): dataset = dataset.shuffle(seed=42) dataset = dataset.map( lambda sample: template_dataset(sample, tokenizer), - remove_columns=["instruction", "input", "output"] # 必要に応じて追加 + remove_columns=["input"] ) - + # Set up training arguments training_arguments = TrainingArguments( output_dir=config["model"]["output_dir"], From 26c8119dbe7b3caf2a8cb2291783dc423fb5ce51 Mon Sep 17 00:00:00 2001 From: popiemon Date: Wed, 5 Mar 2025 12:52:45 +0000 Subject: [PATCH 15/32] feat: ignore results --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 5b4a620..ea74cc5 100644 --- a/.gitignore +++ b/.gitignore @@ -162,4 +162,4 @@ cython_debug/ #.idea/ *.lock -results/ +results* From 14287e4a8dd9a8c46b3daede7f9f41aed0a72a40 Mon Sep 17 00:00:00 2001 From: popiemon Date: Wed, 5 Mar 2025 13:37:21 +0000 Subject: [PATCH 16/32] del: .env --- .env.sample | 1 - 1 file changed, 1 deletion(-) delete mode 100644 .env.sample diff --git a/.env.sample b/.env.sample deleted file mode 100644 index 900692b..0000000 --- a/.env.sample +++ /dev/null @@ -1 +0,0 @@ -HF_TOKEN= \ No newline at end of file From 3f61dbe7c8388799450a50689f9a0b50ab9db445 Mon Sep 17 00:00:00 2001 From: popiemon Date: Wed, 5 Mar 2025 13:39:22 +0000 Subject: [PATCH 17/32] fix: model_name --- configs/training-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/training-config.yaml b/configs/training-config.yaml index a7494a0..6184d13 100644 --- a/configs/training-config.yaml +++ b/configs/training-config.yaml @@ -1,5 +1,5 @@ model: - model_name: 
"TinyLlama/TinyLlama-1.1B-Chat-v1.0" + model_name: "model_path_folder_or_model_name_hf" new_model: "semikong-1.1b" use_4bit: True use_nested_quant: False From a41e4c7f59f6c4b2d9869eefeae595c9cdf03697 Mon Sep 17 00:00:00 2001 From: popiemon Date: Wed, 5 Mar 2025 13:45:58 +0000 Subject: [PATCH 18/32] docs: uv sync --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 92cd7e9..d709685 100644 --- a/README.md +++ b/README.md @@ -330,6 +330,11 @@ Please select "Reopen in Container." You should now be able to work with the SEMIKONG project inside the Docker container using VSCode. +Run the following command to install all packages. +```sh +uv sync --all-extras +``` +

[ Back to top ⬆️ ]

From 8ba3fdd48cc86302d980113381e4946569e58b4b Mon Sep 17 00:00:00 2001 From: popiemon Date: Wed, 5 Mar 2025 13:54:47 +0000 Subject: [PATCH 19/32] fix: logging --- raw_inference.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/raw_inference.py b/raw_inference.py index 144bbff..ab36fde 100644 --- a/raw_inference.py +++ b/raw_inference.py @@ -1,10 +1,11 @@ import argparse -import logging import torch +import transformers import yaml from peft import LoraConfig, PeftModel -from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig, pipeline) def load_config(config_file="./configs/inference-config.yaml"): @@ -70,7 +71,7 @@ def text_gen_eval_wrapper(model, tokenizer, prompt, max_length=200, temperature= A wrapper function for inferencing, generating text based on a prompt. """ # Suppress logging - logging.set_verbosity(logging.CRITICAL) + transformers.logging.set_verbosity(transformers.logging.CRITICAL) # Initialize text generation pipeline pipe = pipeline( From 319fdbfaca3269c9d54591a7262359da850ee510 Mon Sep 17 00:00:00 2001 From: popiemon <85440427+popiemon@users.noreply.github.com> Date: Wed, 5 Mar 2025 23:02:16 +0900 Subject: [PATCH 20/32] Update Dockerfile Co-authored-by: codacy-production[bot] <61871480+codacy-production[bot]@users.noreply.github.com> --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 7b604d5..c546467 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,7 @@ FROM pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime # The installer requires curl (and certificates) to download the release archive -RUN apt-get update && apt-get install -y --no-install-recommends \ +RUN apt-get update && apt-get install -y --no-install-recommends curl ca-certificates git vim && rm -rf /var/lib/apt/lists/* curl \ ca-certificates \ git \ From 
d8d0bec17e5f674edf3803cce45b865c3e0dabdc Mon Sep 17 00:00:00 2001 From: popiemon <85440427+popiemon@users.noreply.github.com> Date: Wed, 5 Mar 2025 23:04:16 +0900 Subject: [PATCH 21/32] Update training.py Co-authored-by: codacy-production[bot] <61871480+codacy-production[bot]@users.noreply.github.com> --- training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training.py b/training.py index 8a607d1..9c106aa 100644 --- a/training.py +++ b/training.py @@ -88,7 +88,7 @@ def main(): # Load and process the dataset dataset_name = config["model"]["dataset_name"] dataset_dict = load_dataset(dataset_name) - dataset = dataset_dict["train"] + dataset = dataset_dict["train"] dataset = dataset.shuffle(seed=42) dataset = dataset.map( lambda sample: template_dataset(sample, tokenizer), From 856ac3ed09e71ae26da543b501c81511a61867db Mon Sep 17 00:00:00 2001 From: popiemon Date: Wed, 5 Mar 2025 14:08:23 +0000 Subject: [PATCH 22/32] fix: devcontainer.json --- .devcontainer/devcontainer.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index e5bf5bb..b829302 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -19,6 +19,6 @@ "ms-python.black-formatter" ] } - }, - } + } +} From 3e837e4845cf7c9832a3dfca398852f6ea534c81 Mon Sep 17 00:00:00 2001 From: popiemon Date: Wed, 5 Mar 2025 14:14:07 +0000 Subject: [PATCH 23/32] fix: Dockerfile --- Dockerfile | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/Dockerfile b/Dockerfile index c546467..c0fe9ca 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,10 +2,6 @@ FROM pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime # The installer requires curl (and certificates) to download the release archive RUN apt-get update && apt-get install -y --no-install-recommends curl ca-certificates git vim && rm -rf /var/lib/apt/lists/* - curl \ - ca-certificates \ - git \ - vim # Download the latest 
installer ADD https://astral.sh/uv/install.sh /uv-installer.sh @@ -18,4 +14,4 @@ ENV PATH="/opt/conda/bin:$PATH" ENV UV_PROJECT_ENVIRONMENT="/opt/conda" ENV UV_SYSTEM_PYTHON=true -WORKDIR /workspace \ No newline at end of file +WORKDIR /workspace From 8b8e1bbbbee6cbd08be2b332c93b00c6ec264194 Mon Sep 17 00:00:00 2001 From: popiemon Date: Wed, 5 Mar 2025 14:21:24 +0000 Subject: [PATCH 24/32] feat: package version --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index c0fe9ca..f3fc16d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,7 @@ FROM pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime # The installer requires curl (and certificates) to download the release archive -RUN apt-get update && apt-get install -y --no-install-recommends curl ca-certificates git vim && rm -rf /var/lib/apt/lists/* +RUN apt-get update && apt-get install -y --no-install-recommends curl=7.81.0-1ubuntu1.20 ca-certificates=20240203~22.04.1 git=1:2.34.1-1ubuntu1.12 vim=2:8.2.3995-1ubuntu2.23 && rm -rf /var/lib/apt/lists/* # Download the latest installer ADD https://astral.sh/uv/install.sh /uv-installer.sh From be7ee11f6b1912be03a7eeaf41189675f128bebb Mon Sep 17 00:00:00 2001 From: popiemon Date: Wed, 5 Mar 2025 14:56:50 +0000 Subject: [PATCH 25/32] revert training-config --- configs/training-config.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/configs/training-config.yaml b/configs/training-config.yaml index 6184d13..b1d3b56 100644 --- a/configs/training-config.yaml +++ b/configs/training-config.yaml @@ -1,6 +1,6 @@ model: model_name: "model_path_folder_or_model_name_hf" - new_model: "semikong-1.1b" + new_model: "semikong-8b" use_4bit: True use_nested_quant: False bnb_4bit_compute_dtype: "float16" @@ -11,8 +11,8 @@ model: training: num_train_epochs: 2 - per_device_train_batch_size: 3 - per_device_eval_batch_size: 3 + per_device_train_batch_size: 4 + per_device_eval_batch_size: 4 gradient_accumulation_steps: 1 
learning_rate: 2e-4 max_grad_norm: 0.3 From d03094328f7706e3001f778d5363c69628cc4943 Mon Sep 17 00:00:00 2001 From: popiemon Date: Fri, 7 Mar 2025 15:02:02 +0000 Subject: [PATCH 26/32] fix: dataset.map --- training.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/training.py b/training.py index 9c106aa..e8a4856 100644 --- a/training.py +++ b/training.py @@ -87,14 +87,13 @@ def main(): # Load and process the dataset dataset_name = config["model"]["dataset_name"] - dataset_dict = load_dataset(dataset_name) - dataset = dataset_dict["train"] + extension = dataset_name.split(".")[-1] + dataset = load_dataset(extension, data_files=dataset_name, split="train") dataset = dataset.shuffle(seed=42) - dataset = dataset.map( - lambda sample: template_dataset(sample, tokenizer), - remove_columns=["input"] - ) - + if config["demo"]: + dataset = dataset.select(range(50)) # Optional: select first 50 rows for demo + dataset = dataset.map(lambda sample: template_dataset(sample, tokenizer), remove_columns=config["model"]["remove_columns"]) + # Set up training arguments training_arguments = TrainingArguments( output_dir=config["model"]["output_dir"], From 5bfcc5fb80e7f838331ed7305b715674b549e6b2 Mon Sep 17 00:00:00 2001 From: popiemon Date: Fri, 7 Mar 2025 15:02:21 +0000 Subject: [PATCH 27/32] feat: ignore data --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index ea74cc5..018577f 100644 --- a/.gitignore +++ b/.gitignore @@ -163,3 +163,4 @@ cython_debug/ *.lock results* +data/* From be908e32e59396fefb69adda1d6aaf3d38842a52 Mon Sep 17 00:00:00 2001 From: popiemon Date: Fri, 7 Mar 2025 15:07:42 +0000 Subject: [PATCH 28/32] feat: training config --- configs/training-config.yaml | 5 ++++- training.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/configs/training-config.yaml b/configs/training-config.yaml index b1d3b56..478a3db 100644 --- a/configs/training-config.yaml +++ 
b/configs/training-config.yaml @@ -1,3 +1,5 @@ +demo: True + model: model_name: "model_path_folder_or_model_name_hf" new_model: "semikong-8b" @@ -7,7 +9,8 @@ model: bnb_4bit_quant_type: "nf4" device_map: {"": 0} output_dir: "./results" - dataset_name: "pentagoniac/SemiKong_Training_Datset" + dataset_name: /workspace/data/SemiKong_Training_Dataset/train-00000-of-00001.parquet + remove_columns: ["input"] training: num_train_epochs: 2 diff --git a/training.py b/training.py index e8a4856..ffa97ee 100644 --- a/training.py +++ b/training.py @@ -73,7 +73,7 @@ def template_dataset(sample, tokenizer): def main(): # Parse command-line arguments parser = argparse.ArgumentParser(description="Fine-tune a model with LoRA and 4-bit precision.") - parser.add_argument("--config", type=str, default="config.yaml", help="Path to the YAML config file.") + parser.add_argument("--config", type=str, default="configs/training-config.yaml", help="Path to the YAML config file.") args = parser.parse_args() # Load configuration From 667d9558ffbc6b6db71c3adbe0ca69468263c5cf Mon Sep 17 00:00:00 2001 From: popiemon <85440427+popiemon@users.noreply.github.com> Date: Sat, 8 Mar 2025 00:08:58 +0900 Subject: [PATCH 29/32] Update training.py Co-authored-by: codacy-production[bot] <61871480+codacy-production[bot]@users.noreply.github.com> --- training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training.py b/training.py index ffa97ee..842f028 100644 --- a/training.py +++ b/training.py @@ -93,7 +93,7 @@ def main(): if config["demo"]: dataset = dataset.select(range(50)) # Optional: select first 50 rows for demo dataset = dataset.map(lambda sample: template_dataset(sample, tokenizer), remove_columns=config["model"]["remove_columns"]) - + fp16=config["training"]["fp16"], # Set up training arguments training_arguments = TrainingArguments( output_dir=config["model"]["output_dir"], From 7b0db2f5d52e60aa78d1ba15cae0925ed72b943a Mon Sep 17 00:00:00 2001 From: popiemon Date: Fri, 7 Mar 
2025 15:11:55 +0000 Subject: [PATCH 30/32] fix: training.py --- training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training.py b/training.py index 842f028..ffa97ee 100644 --- a/training.py +++ b/training.py @@ -93,7 +93,7 @@ def main(): if config["demo"]: dataset = dataset.select(range(50)) # Optional: select first 50 rows for demo dataset = dataset.map(lambda sample: template_dataset(sample, tokenizer), remove_columns=config["model"]["remove_columns"]) - fp16=config["training"]["fp16"], + # Set up training arguments training_arguments = TrainingArguments( output_dir=config["model"]["output_dir"], From 9a90c035fc0fab1a836e4fe7056a39a00a43ce1e Mon Sep 17 00:00:00 2001 From: popiemon Date: Fri, 7 Mar 2025 15:22:33 +0000 Subject: [PATCH 31/32] fix: training.py --- training.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/training.py b/training.py index ffa97ee..4dde15b 100644 --- a/training.py +++ b/training.py @@ -92,8 +92,7 @@ def main(): dataset = dataset.shuffle(seed=42) if config["demo"]: dataset = dataset.select(range(50)) # Optional: select first 50 rows for demo - dataset = dataset.map(lambda sample: template_dataset(sample, tokenizer), remove_columns=config["model"]["remove_columns"]) - + dataset = dataset.map(lambda sample: template_dataset(sample, tokenizer), remove_columns=config["model"]["remove_columns"]) # Set up training arguments training_arguments = TrainingArguments( output_dir=config["model"]["output_dir"], From f3c2dc81093002280972fd24d215a52214a72378 Mon Sep 17 00:00:00 2001 From: popiemon <85440427+popiemon@users.noreply.github.com> Date: Sat, 8 Mar 2025 00:27:48 +0900 Subject: [PATCH 32/32] Update training.py Co-authored-by: codacy-production[bot] <61871480+codacy-production[bot]@users.noreply.github.com> --- training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training.py b/training.py index 4dde15b..5d3ed30 100644 --- a/training.py +++ b/training.py @@ -92,7 +92,7 @@ 
def main(): dataset = dataset.shuffle(seed=42) if config["demo"]: dataset = dataset.select(range(50)) # Optional: select first 50 rows for demo - dataset = dataset.map(lambda sample: template_dataset(sample, tokenizer), remove_columns=config["model"]["remove_columns"]) + dataset = dataset.map(lambda sample: template_dataset(sample, tokenizer), remove_columns=config["model"]["remove_columns"]) # Set up training arguments training_arguments = TrainingArguments( output_dir=config["model"]["output_dir"],