From 397f7d943de56016f941535f78b26715beef01b2 Mon Sep 17 00:00:00 2001
From: popiemon
Date: Wed, 5 Mar 2025 01:42:41 +0000
Subject: [PATCH 01/32] add: uv pyproject.toml
---
.python-version | 1 +
pyproject.toml | 22 ++++++++++++++++++++++
2 files changed, 23 insertions(+)
create mode 100644 .python-version
create mode 100644 pyproject.toml
diff --git a/.python-version b/.python-version
new file mode 100644
index 0000000..c8cfe39
--- /dev/null
+++ b/.python-version
@@ -0,0 +1 @@
+3.10
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..d1ff0ba
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,22 @@
+[project]
+name = "semikong"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+authors = [
+ { name = "popiemon", email = "kryo.over@gmail.com" }
+]
+requires-python = ">=3.10"
+dependencies = [
+ "datasets>=3.3.2",
+ "loguru>=0.7.3",
+ "numpy>=2.2.3",
+ "peft>=0.14.0",
+ "transformers>=4.49.0",
+ "trl>=0.15.2",
+ "vllm>=0.5.0.post1",
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
From b9884e3fe743e936f9a77f09e6155998ac5d8cf8 Mon Sep 17 00:00:00 2001
From: popiemon
Date: Wed, 5 Mar 2025 01:43:24 +0000
Subject: [PATCH 02/32] add: src
---
src/semikong/__init__.py | 2 ++
src/semikong/py.typed | 0
2 files changed, 2 insertions(+)
create mode 100644 src/semikong/__init__.py
create mode 100644 src/semikong/py.typed
diff --git a/src/semikong/__init__.py b/src/semikong/__init__.py
new file mode 100644
index 0000000..96900f8
--- /dev/null
+++ b/src/semikong/__init__.py
@@ -0,0 +1,2 @@
+def hello() -> str:
+ return "Hello from workspace!"
diff --git a/src/semikong/py.typed b/src/semikong/py.typed
new file mode 100644
index 0000000..e69de29
From 27bb4857f4bf55041faf134b789bb32166f243b0 Mon Sep 17 00:00:00 2001
From: popiemon
Date: Wed, 5 Mar 2025 01:51:41 +0000
Subject: [PATCH 03/32] add: Dockerfile
---
Dockerfile | 21 +++++++++++++++++++++
compose.yaml | 18 ++++++++++++++++++
2 files changed, 39 insertions(+)
create mode 100644 Dockerfile
create mode 100644 compose.yaml
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..7b604d5
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,21 @@
+FROM pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime
+
+# The installer requires curl (and certificates) to download the release archive
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ curl \
+ ca-certificates \
+ git \
+ vim
+
+# Download the latest installer
+ADD https://astral.sh/uv/install.sh /uv-installer.sh
+
+# Run the installer then remove it
+RUN sh /uv-installer.sh && rm /uv-installer.sh
+
+# Ensure the installed binary is on the `PATH`
+ENV PATH="/opt/conda/bin:$PATH"
+ENV UV_PROJECT_ENVIRONMENT="/opt/conda"
+ENV UV_SYSTEM_PYTHON=true
+
+WORKDIR /workspace
\ No newline at end of file
diff --git a/compose.yaml b/compose.yaml
new file mode 100644
index 0000000..e443f56
--- /dev/null
+++ b/compose.yaml
@@ -0,0 +1,18 @@
+services:
+ semikong:
+ build:
+ context: .
+ dockerfile: Dockerfile
+ image: semikong
+ container_name: semikong
+ tty: true
+ stdin_open: true
+ volumes:
+ - ./:/workspace
+ working_dir: /workspace
+ deploy:
+ resources:
+ reservations:
+ devices:
+ - driver: nvidia
+ capabilities: [gpu]
\ No newline at end of file
From 5965d75883b3983c1b2ea26767adef9110417090 Mon Sep 17 00:00:00 2001
From: popiemon
Date: Wed, 5 Mar 2025 01:52:35 +0000
Subject: [PATCH 04/32] add: devcontainer.json
---
.devcontainer/devcontainer.json | 24 ++++++++++++++++++++++++
1 file changed, 24 insertions(+)
create mode 100644 .devcontainer/devcontainer.json
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
new file mode 100644
index 0000000..e5bf5bb
--- /dev/null
+++ b/.devcontainer/devcontainer.json
@@ -0,0 +1,24 @@
+{
+ "name": "semikong",
+ "dockerComposeFile": "../compose.yaml",
+ "service": "semikong",
+ "workspaceFolder": "/workspace",
+ "customizations": {
+ "vscode": {
+ "extensions": [
+ "GitHub.copilot-chat",
+ "codezombiech.gitignore",
+ "GitHub.copilot",
+ "saoudrizwan.claude-dev",
+ "njpwerner.autodocstring",
+ "ms-python.mypy-type-checker",
+ "ms-python.vscode-pylance",
+ "ms-python.python",
+ "ms-python.debugpy",
+ "ms-python.isort",
+ "ms-python.black-formatter"
+ ]
+ }
+ },
+ }
+
From 83399c92092d3520f79749984eab6a7517af2dd2 Mon Sep 17 00:00:00 2001
From: popiemon
Date: Wed, 5 Mar 2025 01:52:58 +0000
Subject: [PATCH 05/32] fix: toml
---
pyproject.toml | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/pyproject.toml b/pyproject.toml
index d1ff0ba..56a1b5d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,6 +8,7 @@ authors = [
]
requires-python = ">=3.10"
dependencies = [
+ "bitsandbytes>=0.45.3",
"datasets>=3.3.2",
"loguru>=0.7.3",
"numpy>=2.2.3",
@@ -17,6 +18,3 @@ dependencies = [
"vllm>=0.5.0.post1",
]
-[build-system]
-requires = ["hatchling"]
-build-backend = "hatchling.build"
From 76f8caab363020128b084f1644f909d26c478db0 Mon Sep 17 00:00:00 2001
From: popiemon
Date: Wed, 5 Mar 2025 02:14:13 +0000
Subject: [PATCH 06/32] feat: ignore uv.lock
---
.gitignore | 2 ++
1 file changed, 2 insertions(+)
diff --git a/.gitignore b/.gitignore
index 82f9275..9e82ec1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -160,3 +160,5 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
+
+*.lock
From 0c0ed7675998de8745b79abb2bbf1eaad67860b2 Mon Sep 17 00:00:00 2001
From: popiemon
Date: Wed, 5 Mar 2025 02:14:27 +0000
Subject: [PATCH 07/32] add: .env.sample
---
.env.sample | 1 +
1 file changed, 1 insertion(+)
create mode 100644 .env.sample
diff --git a/.env.sample b/.env.sample
new file mode 100644
index 0000000..900692b
--- /dev/null
+++ b/.env.sample
@@ -0,0 +1 @@
+HF_TOKEN=
\ No newline at end of file
From d8d703f7d19e73899e4c230702f92c93bf31e940 Mon Sep 17 00:00:00 2001
From: popiemon
Date: Wed, 5 Mar 2025 02:14:45 +0000
Subject: [PATCH 08/32] fix: delete src
---
src/semikong/__init__.py | 2 --
src/semikong/py.typed | 0
2 files changed, 2 deletions(-)
delete mode 100644 src/semikong/__init__.py
delete mode 100644 src/semikong/py.typed
diff --git a/src/semikong/__init__.py b/src/semikong/__init__.py
deleted file mode 100644
index 96900f8..0000000
--- a/src/semikong/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-def hello() -> str:
- return "Hello from workspace!"
diff --git a/src/semikong/py.typed b/src/semikong/py.typed
deleted file mode 100644
index e69de29..0000000
From 4b3442be7c3b25691a6abaf27485afb9513b7849 Mon Sep 17 00:00:00 2001
From: popiemon
Date: Wed, 5 Mar 2025 02:15:07 +0000
Subject: [PATCH 09/32] add: update dependencies to include huggingface-hub
---
pyproject.toml | 1 +
1 file changed, 1 insertion(+)
diff --git a/pyproject.toml b/pyproject.toml
index 56a1b5d..9011cb6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,6 +10,7 @@ requires-python = ">=3.10"
dependencies = [
"bitsandbytes>=0.45.3",
"datasets>=3.3.2",
+ "huggingface-hub>=0.29.1",
"loguru>=0.7.3",
"numpy>=2.2.3",
"peft>=0.14.0",
From d8aa952993db961f42654ab4a46ce09e2f8504fc Mon Sep 17 00:00:00 2001
From: popiemon
Date: Wed, 5 Mar 2025 03:51:19 +0000
Subject: [PATCH 10/32] feat: ignore results
---
.gitignore | 1 +
1 file changed, 1 insertion(+)
diff --git a/.gitignore b/.gitignore
index 9e82ec1..5b4a620 100644
--- a/.gitignore
+++ b/.gitignore
@@ -162,3 +162,4 @@ cython_debug/
#.idea/
*.lock
+results/
From e5f0946f5447331f03ec7a0db3ff47737e49284e Mon Sep 17 00:00:00 2001
From: popiemon
Date: Wed, 5 Mar 2025 04:12:48 +0000
Subject: [PATCH 11/32] feat: pft success
---
configs/training-config.yaml | 10 +++++-----
training.py | 20 +++++++++++---------
2 files changed, 16 insertions(+), 14 deletions(-)
diff --git a/configs/training-config.yaml b/configs/training-config.yaml
index b6db6fe..a7494a0 100644
--- a/configs/training-config.yaml
+++ b/configs/training-config.yaml
@@ -1,18 +1,18 @@
model:
- model_name: "model_path_folder_or_model_name_hf"
- new_model: "semikong-8b"
+ model_name: "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+ new_model: "semikong-1.1b"
use_4bit: True
use_nested_quant: False
bnb_4bit_compute_dtype: "float16"
bnb_4bit_quant_type: "nf4"
device_map: {"": 0}
output_dir: "./results"
- dataset_name: "dataset_path_folder_or_dataset_name_hf"
+ dataset_name: "pentagoniac/SemiKong_Training_Datset"
training:
num_train_epochs: 2
- per_device_train_batch_size: 4
- per_device_eval_batch_size: 4
+ per_device_train_batch_size: 3
+ per_device_eval_batch_size: 3
gradient_accumulation_steps: 1
learning_rate: 2e-4
max_grad_norm: 0.3
diff --git a/training.py b/training.py
index 077572f..647535b 100644
--- a/training.py
+++ b/training.py
@@ -4,7 +4,8 @@
import yaml
from datasets import load_dataset
from peft import LoraConfig
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
+from transformers import (AutoModelForCausalLM, AutoTokenizer,
+ BitsAndBytesConfig, TrainingArguments)
from trl import SFTTrainer
@@ -51,6 +52,7 @@ def configure_lora(config):
r=lora_params["lora_r"],
bias="none",
task_type="CAUSAL_LM",
+ target_modules=["q_proj", "v_proj"],
)
@@ -85,11 +87,14 @@ def main():
# Load and process the dataset
dataset_name = config["model"]["dataset_name"]
- dataset = load_dataset("json", data_files=dataset_name, split="train")
+ dataset_dict = load_dataset(dataset_name)
+ dataset = dataset_dict["train"]
dataset = dataset.shuffle(seed=42)
- dataset = dataset.select(range(50)) # Optional: select first 50 rows for demo
- dataset = dataset.map(lambda sample: template_dataset(sample, tokenizer), remove_columns=list(dataset.features))
-
+ dataset = dataset.map(
+ lambda sample: template_dataset(sample, tokenizer),
+ remove_columns=["instruction", "input", "output"] # 必要に応じて追加
+ )
+
# Set up training arguments
training_arguments = TrainingArguments(
output_dir=config["model"]["output_dir"],
@@ -98,7 +103,7 @@ def main():
optim=config["training"]["optim"],
save_steps=config["training"]["save_steps"],
logging_steps=config["training"]["logging_steps"],
- learning_rate=config["training"]["learning_rate"],
+ learning_rate=float(config["training"]["learning_rate"]),
fp16=config["training"]["fp16"],
bf16=config["training"]["bf16"],
max_grad_norm=config["training"]["max_grad_norm"],
@@ -113,11 +118,8 @@ def main():
model=model,
train_dataset=dataset,
peft_config=peft_config,
- dataset_text_field="text",
- max_seq_length=config["training"]["max_seq_length"],
tokenizer=tokenizer,
args=training_arguments,
- packing=config["training"]["packing"],
)
trainer.train()
From 643d9dd86be436bcc4d7c82258e4ac5fd50705b7 Mon Sep 17 00:00:00 2001
From: popiemon
Date: Wed, 5 Mar 2025 04:14:37 +0000
Subject: [PATCH 12/32] add: dev group
---
pyproject.toml | 11 ++++++++++-
1 file changed, 10 insertions(+), 1 deletion(-)
diff --git a/pyproject.toml b/pyproject.toml
index 9011cb6..98aa265 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ version = "0.1.0"
description = "Add your description here"
readme = "README.md"
authors = [
- { name = "popiemon", email = "kryo.over@gmail.com" }
+ { name = "", email = "" }
]
requires-python = ">=3.10"
dependencies = [
@@ -19,3 +19,12 @@ dependencies = [
"vllm>=0.5.0.post1",
]
+[dependency-groups]
+dev = [
+ "black>=25.1.0",
+ "isort>=6.0.1",
+ "pre-commit>=4.1.0",
+ "pylint>=3.3.4",
+ "tqdm>=4.67.1",
+]
+
From 645a25e8d9db0df3b32cbb6592fb7c00275545b3 Mon Sep 17 00:00:00 2001
From: popiemon
Date: Wed, 5 Mar 2025 04:23:36 +0000
Subject: [PATCH 13/32] docs: Docker
---
README.md | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/README.md b/README.md
index 98bce2f..92cd7e9 100644
--- a/README.md
+++ b/README.md
@@ -323,7 +323,12 @@ You can perform inference with SEMIKONG chat or base models as below.
### Quick start - Docker
-TBA
+You should install Docker and VSCode.
+
+Install the Dev Containers extension.
+Please select "Reopen in Container."
+
+You should now be able to work with the SEMIKONG project inside the Docker container using VSCode.
[
Back to top ⬆️ ]
From 6de1bd621953bbc176dd2cbabbfd6031a255f0ad Mon Sep 17 00:00:00 2001
From: popiemon
Date: Wed, 5 Mar 2025 12:51:59 +0000
Subject: [PATCH 14/32] feat: remove_columns
---
training.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/training.py b/training.py
index 647535b..8a607d1 100644
--- a/training.py
+++ b/training.py
@@ -92,9 +92,9 @@ def main():
dataset = dataset.shuffle(seed=42)
dataset = dataset.map(
lambda sample: template_dataset(sample, tokenizer),
- remove_columns=["instruction", "input", "output"] # 必要に応じて追加
+ remove_columns=["input"]
)
-
+
# Set up training arguments
training_arguments = TrainingArguments(
output_dir=config["model"]["output_dir"],
From 26c8119dbe7b3caf2a8cb2291783dc423fb5ce51 Mon Sep 17 00:00:00 2001
From: popiemon
Date: Wed, 5 Mar 2025 12:52:45 +0000
Subject: [PATCH 15/32] feat: ignore results
---
.gitignore | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.gitignore b/.gitignore
index 5b4a620..ea74cc5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -162,4 +162,4 @@ cython_debug/
#.idea/
*.lock
-results/
+results*
From 14287e4a8dd9a8c46b3daede7f9f41aed0a72a40 Mon Sep 17 00:00:00 2001
From: popiemon
Date: Wed, 5 Mar 2025 13:37:21 +0000
Subject: [PATCH 16/32] del: .env
---
.env.sample | 1 -
1 file changed, 1 deletion(-)
delete mode 100644 .env.sample
diff --git a/.env.sample b/.env.sample
deleted file mode 100644
index 900692b..0000000
--- a/.env.sample
+++ /dev/null
@@ -1 +0,0 @@
-HF_TOKEN=
\ No newline at end of file
From 3f61dbe7c8388799450a50689f9a0b50ab9db445 Mon Sep 17 00:00:00 2001
From: popiemon
Date: Wed, 5 Mar 2025 13:39:22 +0000
Subject: [PATCH 17/32] fix: model_name
---
configs/training-config.yaml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/configs/training-config.yaml b/configs/training-config.yaml
index a7494a0..6184d13 100644
--- a/configs/training-config.yaml
+++ b/configs/training-config.yaml
@@ -1,5 +1,5 @@
model:
- model_name: "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+ model_name: "model_path_folder_or_model_name_hf"
new_model: "semikong-1.1b"
use_4bit: True
use_nested_quant: False
From a41e4c7f59f6c4b2d9869eefeae595c9cdf03697 Mon Sep 17 00:00:00 2001
From: popiemon
Date: Wed, 5 Mar 2025 13:45:58 +0000
Subject: [PATCH 18/32] docs: uv sync
---
README.md | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/README.md b/README.md
index 92cd7e9..d709685 100644
--- a/README.md
+++ b/README.md
@@ -330,6 +330,11 @@ Please select "Reopen in Container."
You should now be able to work with the SEMIKONG project inside the Docker container using VSCode.
+Run the following command to install all packages.
+```sh
+uv sync --all-extras
+```
+
[
Back to top ⬆️ ]
From 8ba3fdd48cc86302d980113381e4946569e58b4b Mon Sep 17 00:00:00 2001
From: popiemon
Date: Wed, 5 Mar 2025 13:54:47 +0000
Subject: [PATCH 19/32] fix: logging
---
raw_inference.py | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/raw_inference.py b/raw_inference.py
index 144bbff..ab36fde 100644
--- a/raw_inference.py
+++ b/raw_inference.py
@@ -1,10 +1,11 @@
import argparse
-import logging
import torch
+import transformers
import yaml
from peft import LoraConfig, PeftModel
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
+from transformers import (AutoModelForCausalLM, AutoTokenizer,
+ BitsAndBytesConfig, pipeline)
def load_config(config_file="./configs/inference-config.yaml"):
@@ -70,7 +71,7 @@ def text_gen_eval_wrapper(model, tokenizer, prompt, max_length=200, temperature=
A wrapper function for inferencing, generating text based on a prompt.
"""
# Suppress logging
- logging.set_verbosity(logging.CRITICAL)
+ transformers.logging.set_verbosity(transformers.logging.CRITICAL)
# Initialize text generation pipeline
pipe = pipeline(
From 319fdbfaca3269c9d54591a7262359da850ee510 Mon Sep 17 00:00:00 2001
From: popiemon <85440427+popiemon@users.noreply.github.com>
Date: Wed, 5 Mar 2025 23:02:16 +0900
Subject: [PATCH 20/32] Update Dockerfile
Co-authored-by: codacy-production[bot] <61871480+codacy-production[bot]@users.noreply.github.com>
---
Dockerfile | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/Dockerfile b/Dockerfile
index 7b604d5..c546467 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,7 +1,7 @@
FROM pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime
# The installer requires curl (and certificates) to download the release archive
-RUN apt-get update && apt-get install -y --no-install-recommends \
+RUN apt-get update && apt-get install -y --no-install-recommends curl ca-certificates git vim && rm -rf /var/lib/apt/lists/*
curl \
ca-certificates \
git \
From d8d0bec17e5f674edf3803cce45b865c3e0dabdc Mon Sep 17 00:00:00 2001
From: popiemon <85440427+popiemon@users.noreply.github.com>
Date: Wed, 5 Mar 2025 23:04:16 +0900
Subject: [PATCH 21/32] Update training.py
Co-authored-by: codacy-production[bot] <61871480+codacy-production[bot]@users.noreply.github.com>
---
training.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/training.py b/training.py
index 8a607d1..9c106aa 100644
--- a/training.py
+++ b/training.py
@@ -88,7 +88,7 @@ def main():
# Load and process the dataset
dataset_name = config["model"]["dataset_name"]
dataset_dict = load_dataset(dataset_name)
- dataset = dataset_dict["train"]
+ dataset = dataset_dict["train"]
dataset = dataset.shuffle(seed=42)
dataset = dataset.map(
lambda sample: template_dataset(sample, tokenizer),
From 856ac3ed09e71ae26da543b501c81511a61867db Mon Sep 17 00:00:00 2001
From: popiemon
Date: Wed, 5 Mar 2025 14:08:23 +0000
Subject: [PATCH 22/32] fix: devcontainer.json
---
.devcontainer/devcontainer.json | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index e5bf5bb..b829302 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -19,6 +19,6 @@
"ms-python.black-formatter"
]
}
- },
- }
+ }
+}
From 3e837e4845cf7c9832a3dfca398852f6ea534c81 Mon Sep 17 00:00:00 2001
From: popiemon
Date: Wed, 5 Mar 2025 14:14:07 +0000
Subject: [PATCH 23/32] fix: Dockerfile
---
Dockerfile | 6 +-----
1 file changed, 1 insertion(+), 5 deletions(-)
diff --git a/Dockerfile b/Dockerfile
index c546467..c0fe9ca 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,10 +2,6 @@ FROM pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime
# The installer requires curl (and certificates) to download the release archive
RUN apt-get update && apt-get install -y --no-install-recommends curl ca-certificates git vim && rm -rf /var/lib/apt/lists/*
- curl \
- ca-certificates \
- git \
- vim
# Download the latest installer
ADD https://astral.sh/uv/install.sh /uv-installer.sh
@@ -18,4 +14,4 @@ ENV PATH="/opt/conda/bin:$PATH"
ENV UV_PROJECT_ENVIRONMENT="/opt/conda"
ENV UV_SYSTEM_PYTHON=true
-WORKDIR /workspace
\ No newline at end of file
+WORKDIR /workspace
From 8b8e1bbbbee6cbd08be2b332c93b00c6ec264194 Mon Sep 17 00:00:00 2001
From: popiemon
Date: Wed, 5 Mar 2025 14:21:24 +0000
Subject: [PATCH 24/32] feat: package version
---
Dockerfile | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/Dockerfile b/Dockerfile
index c0fe9ca..f3fc16d 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,7 +1,7 @@
FROM pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime
# The installer requires curl (and certificates) to download the release archive
-RUN apt-get update && apt-get install -y --no-install-recommends curl ca-certificates git vim && rm -rf /var/lib/apt/lists/*
+RUN apt-get update && apt-get install -y --no-install-recommends curl=7.81.0-1ubuntu1.20 ca-certificates=20240203~22.04.1 git=1:2.34.1-1ubuntu1.12 vim=2:8.2.3995-1ubuntu2.23 && rm -rf /var/lib/apt/lists/*
# Download the latest installer
ADD https://astral.sh/uv/install.sh /uv-installer.sh
From be7ee11f6b1912be03a7eeaf41189675f128bebb Mon Sep 17 00:00:00 2001
From: popiemon
Date: Wed, 5 Mar 2025 14:56:50 +0000
Subject: [PATCH 25/32] revert training-config
---
configs/training-config.yaml | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/configs/training-config.yaml b/configs/training-config.yaml
index 6184d13..b1d3b56 100644
--- a/configs/training-config.yaml
+++ b/configs/training-config.yaml
@@ -1,6 +1,6 @@
model:
model_name: "model_path_folder_or_model_name_hf"
- new_model: "semikong-1.1b"
+ new_model: "semikong-8b"
use_4bit: True
use_nested_quant: False
bnb_4bit_compute_dtype: "float16"
@@ -11,8 +11,8 @@ model:
training:
num_train_epochs: 2
- per_device_train_batch_size: 3
- per_device_eval_batch_size: 3
+ per_device_train_batch_size: 4
+ per_device_eval_batch_size: 4
gradient_accumulation_steps: 1
learning_rate: 2e-4
max_grad_norm: 0.3
From d03094328f7706e3001f778d5363c69628cc4943 Mon Sep 17 00:00:00 2001
From: popiemon
Date: Fri, 7 Mar 2025 15:02:02 +0000
Subject: [PATCH 26/32] fix: dataset.map
---
training.py | 13 ++++++-------
1 file changed, 6 insertions(+), 7 deletions(-)
diff --git a/training.py b/training.py
index 9c106aa..e8a4856 100644
--- a/training.py
+++ b/training.py
@@ -87,14 +87,13 @@ def main():
# Load and process the dataset
dataset_name = config["model"]["dataset_name"]
- dataset_dict = load_dataset(dataset_name)
- dataset = dataset_dict["train"]
+ extension = dataset_name.split(".")[-1]
+ dataset = load_dataset(extension, data_files=dataset_name, split="train")
dataset = dataset.shuffle(seed=42)
- dataset = dataset.map(
- lambda sample: template_dataset(sample, tokenizer),
- remove_columns=["input"]
- )
-
+ if config["demo"]:
+ dataset = dataset.select(range(50)) # Optional: select first 50 rows for demo
+ dataset = dataset.map(lambda sample: template_dataset(sample, tokenizer), remove_columns=config["model"]["remove_columns"])
+
# Set up training arguments
training_arguments = TrainingArguments(
output_dir=config["model"]["output_dir"],
From 5bfcc5fb80e7f838331ed7305b715674b549e6b2 Mon Sep 17 00:00:00 2001
From: popiemon
Date: Fri, 7 Mar 2025 15:02:21 +0000
Subject: [PATCH 27/32] feat: ignore data
---
.gitignore | 1 +
1 file changed, 1 insertion(+)
diff --git a/.gitignore b/.gitignore
index ea74cc5..018577f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -163,3 +163,4 @@ cython_debug/
*.lock
results*
+data/*
From be908e32e59396fefb69adda1d6aaf3d38842a52 Mon Sep 17 00:00:00 2001
From: popiemon
Date: Fri, 7 Mar 2025 15:07:42 +0000
Subject: [PATCH 28/32] feat: training config
---
configs/training-config.yaml | 5 ++++-
training.py | 2 +-
2 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/configs/training-config.yaml b/configs/training-config.yaml
index b1d3b56..478a3db 100644
--- a/configs/training-config.yaml
+++ b/configs/training-config.yaml
@@ -1,3 +1,5 @@
+demo: True
+
model:
model_name: "model_path_folder_or_model_name_hf"
new_model: "semikong-8b"
@@ -7,7 +9,8 @@ model:
bnb_4bit_quant_type: "nf4"
device_map: {"": 0}
output_dir: "./results"
- dataset_name: "pentagoniac/SemiKong_Training_Datset"
+ dataset_name: /workspace/data/SemiKong_Training_Dataset/train-00000-of-00001.parquet
+ remove_columns: ["input"]
training:
num_train_epochs: 2
diff --git a/training.py b/training.py
index e8a4856..ffa97ee 100644
--- a/training.py
+++ b/training.py
@@ -73,7 +73,7 @@ def template_dataset(sample, tokenizer):
def main():
# Parse command-line arguments
parser = argparse.ArgumentParser(description="Fine-tune a model with LoRA and 4-bit precision.")
- parser.add_argument("--config", type=str, default="config.yaml", help="Path to the YAML config file.")
+ parser.add_argument("--config", type=str, default="configs/training-config.yaml", help="Path to the YAML config file.")
args = parser.parse_args()
# Load configuration
From 667d9558ffbc6b6db71c3adbe0ca69468263c5cf Mon Sep 17 00:00:00 2001
From: popiemon <85440427+popiemon@users.noreply.github.com>
Date: Sat, 8 Mar 2025 00:08:58 +0900
Subject: [PATCH 29/32] Update training.py
Co-authored-by: codacy-production[bot] <61871480+codacy-production[bot]@users.noreply.github.com>
---
training.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/training.py b/training.py
index ffa97ee..842f028 100644
--- a/training.py
+++ b/training.py
@@ -93,7 +93,7 @@ def main():
if config["demo"]:
dataset = dataset.select(range(50)) # Optional: select first 50 rows for demo
dataset = dataset.map(lambda sample: template_dataset(sample, tokenizer), remove_columns=config["model"]["remove_columns"])
-
+ fp16=config["training"]["fp16"],
# Set up training arguments
training_arguments = TrainingArguments(
output_dir=config["model"]["output_dir"],
From 7b0db2f5d52e60aa78d1ba15cae0925ed72b943a Mon Sep 17 00:00:00 2001
From: popiemon
Date: Fri, 7 Mar 2025 15:11:55 +0000
Subject: [PATCH 30/32] fix: training.py
---
training.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/training.py b/training.py
index 842f028..ffa97ee 100644
--- a/training.py
+++ b/training.py
@@ -93,7 +93,7 @@ def main():
if config["demo"]:
dataset = dataset.select(range(50)) # Optional: select first 50 rows for demo
dataset = dataset.map(lambda sample: template_dataset(sample, tokenizer), remove_columns=config["model"]["remove_columns"])
- fp16=config["training"]["fp16"],
+
# Set up training arguments
training_arguments = TrainingArguments(
output_dir=config["model"]["output_dir"],
From 9a90c035fc0fab1a836e4fe7056a39a00a43ce1e Mon Sep 17 00:00:00 2001
From: popiemon
Date: Fri, 7 Mar 2025 15:22:33 +0000
Subject: [PATCH 31/32] fix: training.py
---
training.py | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/training.py b/training.py
index ffa97ee..4dde15b 100644
--- a/training.py
+++ b/training.py
@@ -92,8 +92,7 @@ def main():
dataset = dataset.shuffle(seed=42)
if config["demo"]:
dataset = dataset.select(range(50)) # Optional: select first 50 rows for demo
- dataset = dataset.map(lambda sample: template_dataset(sample, tokenizer), remove_columns=config["model"]["remove_columns"])
-
+ dataset = dataset.map(lambda sample: template_dataset(sample, tokenizer), remove_columns=config["model"]["remove_columns"])
# Set up training arguments
training_arguments = TrainingArguments(
output_dir=config["model"]["output_dir"],
From f3c2dc81093002280972fd24d215a52214a72378 Mon Sep 17 00:00:00 2001
From: popiemon <85440427+popiemon@users.noreply.github.com>
Date: Sat, 8 Mar 2025 00:27:48 +0900
Subject: [PATCH 32/32] Update training.py
Co-authored-by: codacy-production[bot] <61871480+codacy-production[bot]@users.noreply.github.com>
---
training.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/training.py b/training.py
index 4dde15b..5d3ed30 100644
--- a/training.py
+++ b/training.py
@@ -92,7 +92,7 @@ def main():
dataset = dataset.shuffle(seed=42)
if config["demo"]:
dataset = dataset.select(range(50)) # Optional: select first 50 rows for demo
- dataset = dataset.map(lambda sample: template_dataset(sample, tokenizer), remove_columns=config["model"]["remove_columns"])
+ dataset = dataset.map(lambda sample: template_dataset(sample, tokenizer), remove_columns=config["model"]["remove_columns"])
# Set up training arguments
training_arguments = TrainingArguments(
output_dir=config["model"]["output_dir"],