diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
new file mode 100644
index 0000000..b829302
--- /dev/null
+++ b/.devcontainer/devcontainer.json
@@ -0,0 +1,24 @@
+{
+    "name": "semikong",
+    "dockerComposeFile": "../compose.yaml",
+    "service": "semikong",
+    "workspaceFolder": "/workspace",
+    "customizations": {
+        "vscode": {
+            "extensions": [
+                "GitHub.copilot-chat",
+                "codezombiech.gitignore",
+                "GitHub.copilot",
+                "saoudrizwan.claude-dev",
+                "njpwerner.autodocstring",
+                "ms-python.mypy-type-checker",
+                "ms-python.vscode-pylance",
+                "ms-python.python",
+                "ms-python.debugpy",
+                "ms-python.isort",
+                "ms-python.black-formatter"
+            ]
+        }
+    }
+}
+
diff --git a/.gitignore b/.gitignore
index 82f9275..018577f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -160,3 +160,7 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+
+*.lock
+results*
+data/*
diff --git a/.python-version b/.python-version
new file mode 100644
index 0000000..c8cfe39
--- /dev/null
+++ b/.python-version
@@ -0,0 +1 @@
+3.10
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..f3fc16d
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,17 @@
+FROM pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime
+
+# The installer requires curl (and certificates) to download the release archive
+RUN apt-get update && apt-get install -y --no-install-recommends curl=7.81.0-1ubuntu1.20 ca-certificates=20240203~22.04.1 git=1:2.34.1-1ubuntu1.12 vim=2:8.2.3995-1ubuntu2.23 && rm -rf /var/lib/apt/lists/*
+
+# Download the latest installer
+ADD https://astral.sh/uv/install.sh /uv-installer.sh
+
+# Run the installer then remove it
+RUN sh /uv-installer.sh && rm /uv-installer.sh
+
+# Ensure the installed uv binary (~/.local/bin) and conda are on the `PATH`
+ENV PATH="/root/.local/bin:/opt/conda/bin:$PATH"
+ENV UV_PROJECT_ENVIRONMENT="/opt/conda"
+ENV UV_SYSTEM_PYTHON=true
+
+WORKDIR /workspace
diff --git a/README.md b/README.md
index 98bce2f..d709685 100644
--- a/README.md
+++ b/README.md
@@ -323,7 +323,17 @@ You can perform inference with SEMIKONG chat or base models as below.
 
 ### Quick start - Docker
 
-TBA
+You should install Docker and VSCode.
+
+Install the Dev Containers extension.
+When prompted, select "Reopen in Container".
+
+You should now be able to work on the SEMIKONG project inside the Docker container using VSCode.
+
+Run the following command to install all packages:
+```sh
+uv sync --all-extras
+```
 
 [ Back to top ⬆️ ]
 
diff --git a/compose.yaml b/compose.yaml
new file mode 100644
index 0000000..e443f56
--- /dev/null
+++ b/compose.yaml
@@ -0,0 +1,18 @@
+services:
+  semikong:
+    build:
+      context: .
+      dockerfile: Dockerfile
+    image: semikong
+    container_name: semikong
+    tty: true
+    stdin_open: true
+    volumes:
+      - ./:/workspace
+    working_dir: /workspace
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              capabilities: [gpu]
\ No newline at end of file
diff --git a/configs/training-config.yaml b/configs/training-config.yaml
index b6db6fe..478a3db 100644
--- a/configs/training-config.yaml
+++ b/configs/training-config.yaml
@@ -1,3 +1,5 @@
+demo: True
+
 model:
   model_name: "model_path_folder_or_model_name_hf"
   new_model: "semikong-8b"
@@ -7,7 +9,8 @@ model:
   bnb_4bit_quant_type: "nf4"
   device_map: {"": 0}
   output_dir: "./results"
-  dataset_name: "dataset_path_folder_or_dataset_name_hf"
+  dataset_name: /workspace/data/SemiKong_Training_Dataset/train-00000-of-00001.parquet
+  remove_columns: ["input"]
 
 training:
   num_train_epochs: 2
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..98aa265
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,30 @@
+[project]
+name = "semikong"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+authors = [
+    { name = "", email = "" }
+]
+requires-python = ">=3.10"
+dependencies = [
+    "bitsandbytes>=0.45.3",
+    "datasets>=3.3.2",
+    "huggingface-hub>=0.29.1",
+    "loguru>=0.7.3",
+    "numpy>=2.2.3",
+    "peft>=0.14.0",
+    "transformers>=4.49.0",
+    "trl>=0.15.2",
+    "vllm>=0.5.0.post1",
+]
+
+[dependency-groups]
+dev = [
+    "black>=25.1.0",
+    "isort>=6.0.1",
+    "pre-commit>=4.1.0",
+    "pylint>=3.3.4",
+    "tqdm>=4.67.1",
+]
+
diff --git a/raw_inference.py b/raw_inference.py
index 144bbff..ab36fde 100644
--- a/raw_inference.py
+++ b/raw_inference.py
@@ -1,10 +1,11 @@
 import argparse
-import logging
 
 import torch
+import transformers
 import yaml
 from peft import LoraConfig, PeftModel
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
+from transformers import (AutoModelForCausalLM, AutoTokenizer,
+                          BitsAndBytesConfig, pipeline)
 
 
 def load_config(config_file="./configs/inference-config.yaml"):
@@ -70,7 +71,7 @@ def text_gen_eval_wrapper(model, tokenizer, prompt, max_length=200, temperature=
     A wrapper function for inferencing, generating text based on a prompt.
""" # Suppress logging - logging.set_verbosity(logging.CRITICAL) + transformers.logging.set_verbosity(transformers.logging.CRITICAL) # Initialize text generation pipeline pipe = pipeline( diff --git a/training.py b/training.py index 077572f..5d3ed30 100644 --- a/training.py +++ b/training.py @@ -4,7 +4,8 @@ import yaml from datasets import load_dataset from peft import LoraConfig -from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig, TrainingArguments) from trl import SFTTrainer @@ -51,6 +52,7 @@ def configure_lora(config): r=lora_params["lora_r"], bias="none", task_type="CAUSAL_LM", + target_modules=["q_proj", "v_proj"], ) @@ -71,7 +73,7 @@ def template_dataset(sample, tokenizer): def main(): # Parse command-line arguments parser = argparse.ArgumentParser(description="Fine-tune a model with LoRA and 4-bit precision.") - parser.add_argument("--config", type=str, default="config.yaml", help="Path to the YAML config file.") + parser.add_argument("--config", type=str, default="configs/training-config.yaml", help="Path to the YAML config file.") args = parser.parse_args() # Load configuration @@ -85,11 +87,12 @@ def main(): # Load and process the dataset dataset_name = config["model"]["dataset_name"] - dataset = load_dataset("json", data_files=dataset_name, split="train") + extension = dataset_name.split(".")[-1] + dataset = load_dataset(extension, data_files=dataset_name, split="train") dataset = dataset.shuffle(seed=42) - dataset = dataset.select(range(50)) # Optional: select first 50 rows for demo - dataset = dataset.map(lambda sample: template_dataset(sample, tokenizer), remove_columns=list(dataset.features)) - + if config["demo"]: + dataset = dataset.select(range(50)) # Optional: select first 50 rows for demo + dataset = dataset.map(lambda sample: template_dataset(sample, tokenizer), remove_columns=config["model"]["remove_columns"]) # Set up training arguments training_arguments = TrainingArguments( output_dir=config["model"]["output_dir"], @@ -98,7 +101,7 @@ def main(): optim=config["training"]["optim"], save_steps=config["training"]["save_steps"], logging_steps=config["training"]["logging_steps"], - learning_rate=config["training"]["learning_rate"], + learning_rate=float(config["training"]["learning_rate"]), fp16=config["training"]["fp16"], bf16=config["training"]["bf16"], max_grad_norm=config["training"]["max_grad_norm"], @@ -113,11 +116,8 @@ def main(): model=model, train_dataset=dataset, peft_config=peft_config, - dataset_text_field="text", - max_seq_length=config["training"]["max_seq_length"], tokenizer=tokenizer, args=training_arguments, - packing=config["training"]["packing"], ) trainer.train()