diff --git a/Qwen-Qwen2.5-1.5B-Instruct/NvTensorRtRtx/Qwen2.5-1.5B-Instruct_nvmo_ptq_mixed_precision_awq_lite.json b/Qwen-Qwen2.5-1.5B-Instruct/NvTensorRtRtx/Qwen2.5-1.5B-Instruct_nvmo_ptq_mixed_precision_awq_lite.json new file mode 100644 index 00000000..de733864 --- /dev/null +++ b/Qwen-Qwen2.5-1.5B-Instruct/NvTensorRtRtx/Qwen2.5-1.5B-Instruct_nvmo_ptq_mixed_precision_awq_lite.json @@ -0,0 +1,30 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "Qwen/Qwen2.5-1.5B-Instruct", + "task": "text-classification" + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ { "device": "gpu", "execution_providers": [ "NvTensorRTRTXExecutionProvider" ] } ] + } + }, + "engine": { "target": "local_system" }, + "passes": { + "builder": { "type": "ModelBuilder", "precision": "fp16" }, + "quantization": { + "type": "NVModelOptQuantization", + "algorithm": "awq", + "int4_block_size": 32, + "tokenizer_dir": "Qwen/Qwen2.5-1.5B-Instruct", + "calibration_method": "awq_lite", + "enable_mixed_quant": true, + "calibration_providers": ["NvTensorRtRtx"], + "calibration_params": { + "add_position_ids": false + } + } + }, + "log_severity_level": 0 +} diff --git a/Qwen-Qwen2.5-1.5B-Instruct/NvTensorRtRtx/README.md b/Qwen-Qwen2.5-1.5B-Instruct/NvTensorRtRtx/README.md index d4faeb44..8a8328e8 100644 --- a/Qwen-Qwen2.5-1.5B-Instruct/NvTensorRtRtx/README.md +++ b/Qwen-Qwen2.5-1.5B-Instruct/NvTensorRtRtx/README.md @@ -19,3 +19,60 @@ Use the following command to export the model using Olive with NvTensorRTRTXExec ```bash olive run --config Qwen2.5-1.5B-Instruct_model_builder_fp16.json ``` + +## NVMO PTQ Mixed Precision Quantization + +The olive recipe `Qwen2.5-1.5B-Instruct_nvmo_ptq_mixed_precision_awq_lite.json` produces INT4 + INT8 mixed precision quantized model using NVIDIA's TensorRT Model Optimizer toolkit with AWQ algorithm. + +### Setup + +1. 
Install Olive with NVIDIA TensorRT Model Optimizer toolkit + + - Run the following command to install Olive with TensorRT Model Optimizer. + ```bash + pip install olive-ai[nvmo] + ``` + + - If TensorRT Model Optimizer needs to be installed from a local wheel, then follow the steps below. + + ```bash + pip install olive-ai + pip install <path-to-modelopt-wheel>[onnx] + ``` + + - Make sure that TensorRT Model Optimizer is installed correctly. + ```bash + python -c "from modelopt.onnx.quantization.int4 import quantize as quantize_int4" + ``` + + - Refer to the TensorRT Model Optimizer [documentation](https://nvidia.github.io/TensorRT-Model-Optimizer/getting_started/windows/_installation_with_olive.html) for its detailed installation instructions and setup dependencies. + +2. Install suitable onnxruntime and onnxruntime-genai packages + + - Install the onnxruntime and onnxruntime-genai packages that have NvTensorRTRTXExecutionProvider support. Refer to the documentation for the [NvTensorRtRtx execution provider](https://onnxruntime.ai/docs/execution-providers/TensorRTRTX-ExecutionProvider) to set up its dependencies/requirements. + - Note that by default, TensorRT Model Optimizer comes with onnxruntime-directml. And the onnxruntime-genai-cuda package comes with onnxruntime-gpu. So, in order to use an onnxruntime package with NvTensorRTRTXExecutionProvider support, one might need to uninstall other existing onnxruntime packages. + - Make sure that at the end, there is only one onnxruntime package installed. Use a command like the following for validating the onnxruntime package installation. + ```bash + python -c "import onnxruntime as ort; print(ort.get_available_providers())" + ``` + +3. Install additional requirements. + + - Install the packages provided in the requirements text file. 
+ ```bash + pip install -r requirements-nvmo.txt + ``` + +### Steps to run + +```bash +olive run --config Qwen2.5-1.5B-Instruct_nvmo_ptq_mixed_precision_awq_lite.json +``` + +### Recipe details + +The olive recipe `Qwen2.5-1.5B-Instruct_nvmo_ptq_mixed_precision_awq_lite.json` has 2 passes: (a) `ModelBuilder` and (b) `NVModelOptQuantization`. The `ModelBuilder` pass is used to generate the FP16 model for `NvTensorRTRTXExecutionProvider` (aka `NvTensorRtRtx` EP). Subsequently, the `NVModelOptQuantization` pass performs INT4 + INT8 mixed precision quantization using AWQ algorithm with AWQ Lite calibration method to produce the optimized model. + +### Troubleshoot + +In case of any issue related to quantization using TensorRT Model Optimizer toolkit, refer its [FAQs](https://nvidia.github.io/TensorRT-Model-Optimizer/support/2_faqs.html) for potential help or suggestions. \ No newline at end of file diff --git a/Qwen-Qwen2.5-1.5B-Instruct/NvTensorRtRtx/info.yml b/Qwen-Qwen2.5-1.5B-Instruct/NvTensorRtRtx/info.yml index 2671dfee..b46e88ef 100644 --- a/Qwen-Qwen2.5-1.5B-Instruct/NvTensorRtRtx/info.yml +++ b/Qwen-Qwen2.5-1.5B-Instruct/NvTensorRtRtx/info.yml @@ -4,3 +4,7 @@ recipes: file: Qwen2.5-1.5B-Instruct_model_builder_fp16.json devices: gpu eps: NvTensorRTRTXExecutionProvider + - name: Qwen2.5_1.5B_Instruct_NVMO_PTQ_Mixed_Precision_AWQ_Lite + file: Qwen2.5-1.5B-Instruct_nvmo_ptq_mixed_precision_awq_lite.json + devices: gpu + eps: NvTensorRTRTXExecutionProvider diff --git a/Qwen-Qwen2.5-1.5B-Instruct/NvTensorRtRtx/requirements-nvmo.txt b/Qwen-Qwen2.5-1.5B-Instruct/NvTensorRtRtx/requirements-nvmo.txt new file mode 100644 index 00000000..d98c750c --- /dev/null +++ b/Qwen-Qwen2.5-1.5B-Instruct/NvTensorRtRtx/requirements-nvmo.txt @@ -0,0 +1,4 @@ +datasets>=2.14.4 +torch +transformers + diff --git a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/NvTensorRtRtx/DeepSeek-R1-Distill-Qwen_1.5B_nvmo_ptq_mixed_precision_awq_lite.json 
b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/NvTensorRtRtx/DeepSeek-R1-Distill-Qwen_1.5B_nvmo_ptq_mixed_precision_awq_lite.json new file mode 100644 index 00000000..8c6e4b91 --- /dev/null +++ b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/NvTensorRtRtx/DeepSeek-R1-Distill-Qwen_1.5B_nvmo_ptq_mixed_precision_awq_lite.json @@ -0,0 +1,30 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "task": "text-classification" + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ { "device": "gpu", "execution_providers": [ "NvTensorRTRTXExecutionProvider" ] } ] + } + }, + "engine": { "target": "local_system" }, + "passes": { + "builder": { "type": "ModelBuilder", "precision": "fp16" }, + "quantization": { + "type": "NVModelOptQuantization", + "algorithm": "awq", + "int4_block_size": 32, + "tokenizer_dir": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "calibration_method": "awq_lite", + "enable_mixed_quant": true, + "calibration_providers": ["NvTensorRtRtx"], + "calibration_params": { + "add_position_ids": false + } + } + }, + "log_severity_level": 0 +} diff --git a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/NvTensorRtRtx/README.md b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/NvTensorRtRtx/README.md index 0b261348..c6ee0e55 100644 --- a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/NvTensorRtRtx/README.md +++ b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/NvTensorRtRtx/README.md @@ -19,3 +19,60 @@ Use the following command to export the model using Olive with NvTensorRTRTXExec ```bash olive run --config DeepSeek-R1-Distill-Qwen-1.5B_model_builder_fp16.json ``` + +## NVMO PTQ Mixed Precision Quantization + +The olive recipe `DeepSeek-R1-Distill-Qwen_1.5B_nvmo_ptq_mixed_precision_awq_lite.json` produces INT4 + INT8 mixed precision quantized model using NVIDIA's TensorRT Model Optimizer toolkit with AWQ algorithm. + +### Setup + +1. 
Install Olive with NVIDIA TensorRT Model Optimizer toolkit + + - Run the following command to install Olive with TensorRT Model Optimizer. + ```bash + pip install olive-ai[nvmo] + ``` + + - If TensorRT Model Optimizer needs to be installed from a local wheel, then follow the steps below. + + ```bash + pip install olive-ai + pip install <path-to-modelopt-wheel>[onnx] + ``` + + - Make sure that TensorRT Model Optimizer is installed correctly. + ```bash + python -c "from modelopt.onnx.quantization.int4 import quantize as quantize_int4" + ``` + + - Refer to the TensorRT Model Optimizer [documentation](https://nvidia.github.io/TensorRT-Model-Optimizer/getting_started/windows/_installation_with_olive.html) for its detailed installation instructions and setup dependencies. + +2. Install suitable onnxruntime and onnxruntime-genai packages + + - Install the onnxruntime and onnxruntime-genai packages that have NvTensorRTRTXExecutionProvider support. Refer to the documentation for the [NvTensorRtRtx execution provider](https://onnxruntime.ai/docs/execution-providers/TensorRTRTX-ExecutionProvider) to set up its dependencies/requirements. + - Note that by default, TensorRT Model Optimizer comes with onnxruntime-directml. And the onnxruntime-genai-cuda package comes with onnxruntime-gpu. So, in order to use an onnxruntime package with NvTensorRTRTXExecutionProvider support, one might need to uninstall other existing onnxruntime packages. + - Make sure that at the end, there is only one onnxruntime package installed. Use a command like the following for validating the onnxruntime package installation. + ```bash + python -c "import onnxruntime as ort; print(ort.get_available_providers())" + ``` + +3. Install additional requirements. + + - Install the packages provided in the requirements text file. 
+ ```bash + pip install -r requirements-nvmo.txt + ``` + +### Steps to run + +```bash +olive run --config DeepSeek-R1-Distill-Qwen_1.5B_nvmo_ptq_mixed_precision_awq_lite.json +``` + +### Recipe details + +The olive recipe `DeepSeek-R1-Distill-Qwen_1.5B_nvmo_ptq_mixed_precision_awq_lite.json` has 2 passes: (a) `ModelBuilder` and (b) `NVModelOptQuantization`. The `ModelBuilder` pass is used to generate the FP16 model for `NvTensorRTRTXExecutionProvider` (aka `NvTensorRtRtx` EP). Subsequently, the `NVModelOptQuantization` pass performs INT4 + INT8 mixed precision quantization using AWQ algorithm with AWQ Lite calibration method to produce the optimized model. + +### Troubleshoot + +In case of any issue related to quantization using TensorRT Model Optimizer toolkit, refer its [FAQs](https://nvidia.github.io/TensorRT-Model-Optimizer/support/2_faqs.html) for potential help or suggestions. \ No newline at end of file diff --git a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/NvTensorRtRtx/info.yml b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/NvTensorRtRtx/info.yml index e7309971..7be873e6 100644 --- a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/NvTensorRtRtx/info.yml +++ b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/NvTensorRtRtx/info.yml @@ -4,3 +4,7 @@ recipes: file: DeepSeek-R1-Distill-Qwen-1.5B_model_builder_fp16.json devices: gpu eps: NvTensorRTRTXExecutionProvider + - name: DeepSeek-R1-Distill-Qwen-1.5B_NVMO_PTQ_Mixed_Precision_AWQ_Lite + file: DeepSeek-R1-Distill-Qwen_1.5B_nvmo_ptq_mixed_precision_awq_lite.json + devices: gpu + eps: NvTensorRTRTXExecutionProvider diff --git a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/NvTensorRtRtx/requirements-nvmo.txt b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/NvTensorRtRtx/requirements-nvmo.txt new file mode 100644 index 00000000..d98c750c --- /dev/null +++ b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/NvTensorRtRtx/requirements-nvmo.txt @@ -0,0 +1,4 @@ +datasets>=2.14.4 +torch +transformers + diff --git 
a/meta-llama-Llama-3.2-1B-Instruct/NvTensorRtRtx/Llama-3.2-1B-Instruct_nvmo_ptq_mixed_precision_awq_lite.json b/meta-llama-Llama-3.2-1B-Instruct/NvTensorRtRtx/Llama-3.2-1B-Instruct_nvmo_ptq_mixed_precision_awq_lite.json new file mode 100644 index 00000000..4d88d32d --- /dev/null +++ b/meta-llama-Llama-3.2-1B-Instruct/NvTensorRtRtx/Llama-3.2-1B-Instruct_nvmo_ptq_mixed_precision_awq_lite.json @@ -0,0 +1,30 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "meta-llama/Llama-3.2-1B-Instruct", + "task": "text-classification" + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ { "device": "gpu", "execution_providers": [ "NvTensorRTRTXExecutionProvider" ] } ] + } + }, + "engine": { "target": "local_system" }, + "passes": { + "builder": { "type": "ModelBuilder", "precision": "fp16" }, + "quantization": { + "type": "NVModelOptQuantization", + "algorithm": "awq", + "int4_block_size": 32, + "tokenizer_dir": "meta-llama/Llama-3.2-1B-Instruct", + "calibration_method": "awq_lite", + "enable_mixed_quant": true, + "calibration_providers": ["NvTensorRtRtx"], + "calibration_params": { + "add_position_ids": false + } + } + }, + "log_severity_level": 0 +} diff --git a/meta-llama-Llama-3.2-1B-Instruct/NvTensorRtRtx/README.md b/meta-llama-Llama-3.2-1B-Instruct/NvTensorRtRtx/README.md index 79bf8d3c..225eee19 100644 --- a/meta-llama-Llama-3.2-1B-Instruct/NvTensorRtRtx/README.md +++ b/meta-llama-Llama-3.2-1B-Instruct/NvTensorRtRtx/README.md @@ -19,3 +19,60 @@ Use the following command to export the model using Olive with NvTensorRTRTXExec ```bash olive run --config Llama-3.2-1B-Instruct_model_builder_fp16.json ``` + +## NVMO PTQ Mixed Precision Quantization + +The olive recipe `Llama-3.2-1B-Instruct_nvmo_ptq_mixed_precision_awq_lite.json` produces INT4 + INT8 mixed precision quantized model using NVIDIA's TensorRT Model Optimizer toolkit with AWQ algorithm. + +### Setup + +1. 
Install Olive with NVIDIA TensorRT Model Optimizer toolkit + + - Run the following command to install Olive with TensorRT Model Optimizer. + ```bash + pip install olive-ai[nvmo] + ``` + + - If TensorRT Model Optimizer needs to be installed from a local wheel, then follow the steps below. + + ```bash + pip install olive-ai + pip install <path-to-modelopt-wheel>[onnx] + ``` + + - Make sure that TensorRT Model Optimizer is installed correctly. + ```bash + python -c "from modelopt.onnx.quantization.int4 import quantize as quantize_int4" + ``` + + - Refer to the TensorRT Model Optimizer [documentation](https://nvidia.github.io/TensorRT-Model-Optimizer/getting_started/windows/_installation_with_olive.html) for its detailed installation instructions and setup dependencies. + +2. Install suitable onnxruntime and onnxruntime-genai packages + + - Install the onnxruntime and onnxruntime-genai packages that have NvTensorRTRTXExecutionProvider support. Refer to the documentation for the [NvTensorRtRtx execution provider](https://onnxruntime.ai/docs/execution-providers/TensorRTRTX-ExecutionProvider) to set up its dependencies/requirements. + - Note that by default, TensorRT Model Optimizer comes with onnxruntime-directml. And the onnxruntime-genai-cuda package comes with onnxruntime-gpu. So, in order to use an onnxruntime package with NvTensorRTRTXExecutionProvider support, one might need to uninstall other existing onnxruntime packages. + - Make sure that at the end, there is only one onnxruntime package installed. Use a command like the following for validating the onnxruntime package installation. + ```bash + python -c "import onnxruntime as ort; print(ort.get_available_providers())" + ``` + +3. Install additional requirements. + + - Install the packages provided in the requirements text file. 
+ ```bash + pip install -r requirements-nvmo.txt + ``` + +### Steps to run + +```bash +olive run --config Llama-3.2-1B-Instruct_nvmo_ptq_mixed_precision_awq_lite.json +``` + +### Recipe details + +The olive recipe `Llama-3.2-1B-Instruct_nvmo_ptq_mixed_precision_awq_lite.json` has 2 passes: (a) `ModelBuilder` and (b) `NVModelOptQuantization`. The `ModelBuilder` pass is used to generate the FP16 model for `NvTensorRTRTXExecutionProvider` (aka `NvTensorRtRtx` EP). Subsequently, the `NVModelOptQuantization` pass performs INT4 + INT8 mixed precision quantization using AWQ algorithm with AWQ Lite calibration method to produce the optimized model. + +### Troubleshoot + +In case of any issue related to quantization using TensorRT Model Optimizer toolkit, refer its [FAQs](https://nvidia.github.io/TensorRT-Model-Optimizer/support/2_faqs.html) for potential help or suggestions. \ No newline at end of file diff --git a/meta-llama-Llama-3.2-1B-Instruct/NvTensorRtRtx/info.yml b/meta-llama-Llama-3.2-1B-Instruct/NvTensorRtRtx/info.yml index 7f571a7f..e590de7f 100644 --- a/meta-llama-Llama-3.2-1B-Instruct/NvTensorRtRtx/info.yml +++ b/meta-llama-Llama-3.2-1B-Instruct/NvTensorRtRtx/info.yml @@ -4,3 +4,7 @@ recipes: file: Llama-3.2-1B-Instruct_model_builder_fp16.json devices: gpu eps: NvTensorRTRTXExecutionProvider + - name: Llama-3.2-1B-Instruct_NVMO_PTQ_Mixed_Precision_AWQ_Lite + file: Llama-3.2-1B-Instruct_nvmo_ptq_mixed_precision_awq_lite.json + devices: gpu + eps: NvTensorRTRTXExecutionProvider diff --git a/meta-llama-Llama-3.2-1B-Instruct/NvTensorRtRtx/requirements-nvmo.txt b/meta-llama-Llama-3.2-1B-Instruct/NvTensorRtRtx/requirements-nvmo.txt new file mode 100644 index 00000000..d98c750c --- /dev/null +++ b/meta-llama-Llama-3.2-1B-Instruct/NvTensorRtRtx/requirements-nvmo.txt @@ -0,0 +1,4 @@ +datasets>=2.14.4 +torch +transformers +