diff --git a/HuggingFaceTB-SmolLM2-135M-Instruct/CPU/README.md b/HuggingFaceTB-SmolLM2-135M-Instruct/CPU/README.md new file mode 100644 index 00000000..a40a3a5e --- /dev/null +++ b/HuggingFaceTB-SmolLM2-135M-Instruct/CPU/README.md @@ -0,0 +1,59 @@ +# SmolLM2-135M-Instruct Optimization Recipe for CPU + +This folder contains the optimization recipe for the [SmolLM2-135M-Instruct](https://huggingface.co/HuggingFaceTB/SmolLM2-135M-Instruct) model targeting CPU execution. The model is optimized to **INT4** precision using Microsoft Olive and ONNX Runtime GenAI's ModelBuilder. + +## 📊 Recipe Details + +| Property | Details | +| :--- | :--- | +| **Model Name** | `HuggingFaceTB/SmolLM2-135M-Instruct` | +| **Architecture** | SmolLM2 (Llama-based) | +| **Target Device** | CPU | +| **Precision** | INT4 | +| **Execution Provider** | `CPUExecutionProvider` | +| **Optimization Tool** | Microsoft Olive (ModelBuilder) | + +## 🛠️ Prerequisites + +Before running the optimization, ensure you have the required dependencies installed. + +```bash +pip install -r requirements.txt +``` + +## 🚀 How to Run Optimization + +Navigate to this directory and run the following command to optimize the model: + +```bash +python -m olive run --config olive_config.json +``` + +This will download the model, apply INT4 quantization, and save the optimized ONNX model in the `models/smollm_manual` directory. + +## 🤖 How to Run Inference + +Once the model is optimized, you can use `onnxruntime-genai` to run inference locally. 
+ +**Example Python Snippet:** + +```python +import onnxruntime_genai as og + +model = og.Model("models/smollm_manual") +tokenizer = og.Tokenizer(model) + +prompt = "<|im_start|>user\nExplain quantum physics in one sentence.<|im_end|>\n<|im_start|>assistant\n" +tokens = tokenizer.encode(prompt) + +params = og.GeneratorParams(model) +params.set_search_options(max_length=100) +params.input_ids = tokens + +generator = og.Generator(model, params) + +while not generator.is_done(): + generator.compute_logits() + generator.generate_next_token() + print(tokenizer.decode(generator.get_next_tokens()), end='', flush=True) +``` \ No newline at end of file diff --git a/HuggingFaceTB-SmolLM2-135M-Instruct/CPU/info.yml b/HuggingFaceTB-SmolLM2-135M-Instruct/CPU/info.yml new file mode 100644 index 00000000..d4ad42bb --- /dev/null +++ b/HuggingFaceTB-SmolLM2-135M-Instruct/CPU/info.yml @@ -0,0 +1,6 @@ +arch: SmolLM2 +recipes: + - name: SmolLM2-135M-Instruct CPU INT4 + file: olive_config.json + devices: cpu + eps: CPUExecutionProvider \ No newline at end of file diff --git a/HuggingFaceTB-SmolLM2-135M-Instruct/CPU/olive_ci.json b/HuggingFaceTB-SmolLM2-135M-Instruct/CPU/olive_ci.json new file mode 100644 index 00000000..2dafb7c5 --- /dev/null +++ b/HuggingFaceTB-SmolLM2-135M-Instruct/CPU/olive_ci.json @@ -0,0 +1,9 @@ +[ + { + "name": "smollm2_cpu_int4", + "os": "windows", + "device": "cpu", + "requirements_file": "requirements.txt", + "command": "python -m olive run --config olive_config.json" + } +] \ No newline at end of file diff --git a/HuggingFaceTB-SmolLM2-135M-Instruct/CPU/olive_config.json b/HuggingFaceTB-SmolLM2-135M-Instruct/CPU/olive_config.json new file mode 100644 index 00000000..e1d11161 --- /dev/null +++ b/HuggingFaceTB-SmolLM2-135M-Instruct/CPU/olive_config.json @@ -0,0 +1,36 @@ +{ + "input_model": { + "type": "HfModel", + "config": { + "model_path": "HuggingFaceTB/SmolLM2-135M-Instruct" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "config": { 
+ "accelerators": [ + { + "device": "cpu", + "execution_providers": ["CPUExecutionProvider"] + } + ] + } + } + }, + "passes": { + "builder": { + "type": "ModelBuilder", + "config": { + "precision": "int4" + } + } + }, + "engine": { + "log_severity_level": 1, + "host": "local_system", + "target": "local_system", + "cache_dir": ".olive-cache", + "output_dir": "models/smollm_manual" + } + } \ No newline at end of file diff --git a/HuggingFaceTB-SmolLM2-135M-Instruct/CPU/requirements.txt b/HuggingFaceTB-SmolLM2-135M-Instruct/CPU/requirements.txt new file mode 100644 index 00000000..50c07af4 --- /dev/null +++ b/HuggingFaceTB-SmolLM2-135M-Instruct/CPU/requirements.txt @@ -0,0 +1,3 @@ +olive-ai[auto-opt] +onnxruntime>=1.20.1 +onnxruntime-genai>=0.5.0,<0.6.0 \ No newline at end of file