diff --git a/.aitk/configs/checks.json b/.aitk/configs/checks.json
index 256887da..8024cd45 100644
--- a/.aitk/configs/checks.json
+++ b/.aitk/configs/checks.json
@@ -1,6 +1,6 @@
 {
   "configCheck": 139,
-  "copyCheck": 178,
+  "copyCheck": 171,
   "extensionCheck": 1,
   "gitignoreCheck": 40,
   "inferenceModelCheck": 25,
@@ -9,8 +9,8 @@
   "modelProjectCheck": 41,
   "oliveCheck": 45,
   "oliveJsonCheck": 139,
-  "pathCheck": 1153,
+  "pathCheck": 1158,
   "requirementsCheck": 37,
   "templateCheck": 3,
-  "venvRequirementsCheck": 13
+  "venvRequirementsCheck": 14
 }
diff --git a/.aitk/configs/model_list.json b/.aitk/configs/model_list.json
index d0d96651..7cfa3610 100644
--- a/.aitk/configs/model_list.json
+++ b/.aitk/configs/model_list.json
@@ -18,7 +18,7 @@
       "architecture": "Transformer",
       "status": "Ready",
       "relativePath": "microsoft-Phi-3.5-mini-instruct/aitk",
-      "version": 5,
+      "version": 6,
       "p0": true
     },
     {
@@ -63,7 +63,7 @@
       "architecture": "Transformer",
       "status": "Ready",
       "relativePath": "deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk",
-      "version": 5,
+      "version": 6,
       "p0": true
     },
     {
@@ -172,7 +172,7 @@
       "architecture": "Transformer",
       "status": "Ready",
       "relativePath": "meta-llama-Llama-3.2-1B-Instruct/aitk",
-      "version": 5,
+      "version": 6,
       "p0": true
     },
     {
@@ -239,7 +239,7 @@
       "architecture": "Transformer",
       "status": "Ready",
       "relativePath": "Qwen-Qwen2.5-1.5B-Instruct/aitk",
-      "version": 5,
+      "version": 6,
       "p0": true
     },
     {
@@ -407,7 +407,7 @@
       "architecture": "Transformer",
       "status": "Ready",
       "relativePath": "meta-llama-Llama-3.1-8B-Instruct/aitk",
-      "version": 3,
+      "version": 4,
       "p0": false
     },
     {
diff --git a/.aitk/configs/parameter_template.json b/.aitk/configs/parameter_template.json
index b4aad6ca..c5ee93e1 100644
--- a/.aitk/configs/parameter_template.json
+++ b/.aitk/configs/parameter_template.json
@@ -102,6 +102,10 @@
     "name": "Quantization Dataset Size",
     "type": "int"
   },
+  "QuantizationDatasetLength": {
+    "name": "Quantization Dataset Sequence Length",
+    "type": "int"
+  },
   "QuantizationDatasetSplit": {
     "name": "Quantization Dataset Split",
     "tags": [
diff --git a/.aitk/requirements/requirements-NvidiaGPU-GptqModel.txt b/.aitk/requirements/requirements-NvidiaGPU-GptqModel.txt
new file mode 100644
index 00000000..fab66b44
--- /dev/null
+++ b/.aitk/requirements/requirements-NvidiaGPU-GptqModel.txt
@@ -0,0 +1,20 @@
+# follow https://github.com/CodeLinaro/GPTQModel/blob/rel_4.2.5/requirements.txt except for torch
+# uvpip:install git+https://github.com/CodeLinaro/GPTQModel.git@64231a266cc70c5597fe97f26e7ec5ccda660c37 --no-build-isolation;post;{"BUILD_CUDA_EXT":"0"}
+# download:fast_hadamard_transform-1.0.4.post1-cp312-cp312-win_amd64.whl
+./fast_hadamard_transform-1.0.4.post1-cp312-cp312-win_amd64.whl
+accelerate==1.10.1
+device-smi==0.4.1
+hf_transfer==0.1.9
+huggingface_hub==0.34.4
+logbar==0.0.4
+maturin==1.9.3
+numpy==2.2.6
+packaging==24.2
+pillow==11.3.0
+protobuf==6.32.0
+random_word==1.0.13
+safetensors==0.6.2
+threadpoolctl==3.6.0
+tokenicer==0.0.5
+transformers==4.56.0
+wheel==0.45.1
diff --git a/.aitk/requirements/requirements-NvidiaGPU.txt b/.aitk/requirements/requirements-NvidiaGPU.txt
index b8c7b9fe..0bbedc9f 100644
--- a/.aitk/requirements/requirements-NvidiaGPU.txt
+++ b/.aitk/requirements/requirements-NvidiaGPU.txt
@@ -34,15 +34,15 @@ multidict==6.6.4
 multiprocess==0.70.16
 networkx==3.4.2
 numpy==2.2.4
-# olive-ai==0.10.1
-olive-ai==0.10.1
+# olive-ai==0.11.0
+olive-ai==0.11.0
 # onnx==1.17.0
 onnx==1.17.0
 onnx-ir==0.1.10
-# onnxruntime-genai-cuda==0.7.0
-onnxruntime-genai-cuda==0.7.0
-# onnxruntime-gpu==1.21.0
-onnxruntime-gpu==1.21.0
+# onnxruntime-genai-cuda==0.11.2
+onnxruntime-genai-cuda==0.11.2
+# onnxruntime-gpu==1.24.1
+onnxruntime-gpu==1.24.1
 onnxscript==0.5.3
 # optimum==1.26.1
 optimum==1.26.1
@@ -69,11 +69,11 @@ sympy==1.13.3
 # tabulate==0.9.0
 tabulate==0.9.0
 tokenizers==0.21.4
-# torch==2.7.0+cu128
-torch==2.7.0+cu128
+# torch==2.8.0+cu128
+torch==2.8.0+cu128
 torchmetrics==1.7.1
-# torchvision==0.22.0+cu128
-torchvision==0.22.0+cu128
+# torchvision==0.23.0+cu128
+torchvision==0.23.0+cu128
 tqdm==4.67.1
 transformers==4.51.3
 typing-extensions==4.15.0
diff --git a/.aitk/requirements/requirements-QNN.txt b/.aitk/requirements/requirements-QNN.txt
index 22c9e9f9..7a3da679 100644
--- a/.aitk/requirements/requirements-QNN.txt
+++ b/.aitk/requirements/requirements-QNN.txt
@@ -34,11 +34,11 @@ multidict==6.6.4
 multiprocess==0.70.16
 networkx==3.5
 numpy==2.2.4
-# olive-ai==0.10.1
-olive-ai==0.10.1
+# olive-ai==0.11.0
+olive-ai==0.11.0
 onnx==1.17.0
 onnx-ir==0.1.10
-# uvpip:install onnxruntime-qnn==1.22.0.dev20250402004 --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple --no-deps;post
+# uvpip:install onnxruntime-qnn==1.23.2 --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple --no-deps;post
 onnxscript==0.5.3
 optuna==4.2.1
 packaging==24.2
diff --git a/.aitk/scripts/install_freeze.py b/.aitk/scripts/install_freeze.py
index adc5dac3..2b5736be 100644
--- a/.aitk/scripts/install_freeze.py
+++ b/.aitk/scripts/install_freeze.py
@@ -31,7 +31,6 @@
 # if from git: "git+https://github.com/microsoft/Olive.git@COMMIT_ID#egg=olive_ai
 oliveAi = "olive-ai==0.10.1"
 torchVision = "torchvision==0.22.0"
-# TODO it is an example
 amdQuark = "AMD__Quark_py3.10.17"
@@ -283,4 +282,5 @@ def write_requires_recursively(name: str):
 
 
 if __name__ == "__main__":
+    raise RuntimeError("deprecated, needs revision")
     main()
diff --git a/.aitk/scripts/sanitize/constants.py b/.aitk/scripts/sanitize/constants.py
index 66043348..a94d9ae4 100644
--- a/.aitk/scripts/sanitize/constants.py
+++ b/.aitk/scripts/sanitize/constants.py
@@ -108,6 +108,8 @@ class OliveDeviceTypes(Enum):
 # Should sort by value
 class OlivePassNames:
     AitkPython = "aitkpython"
+    GptqModel = "gptqmodel"
+    GptqQuantizer = "gptqquantizer"
     ModelBuilder = "modelbuilder"
     NVModelOptQuantization = "nvmodeloptquantization"
     OnnxFloatToFloat16 = "onnxfloattofloat16"
@@ -145,6 +147,7 @@ class OlivePropertyNames:
     Host = "host"
     LoadDatasetConfig = "load_dataset_config"
     MaxSamples = "max_samples"
+    MaxSeqLen = "max_seq_len"
     Metrics = "metrics"
     Name = "name"
     NumCalibData = "num_calib_data"
diff --git a/.aitk/scripts/sanitize/generator_amd.py b/.aitk/scripts/sanitize/generator_amd.py
index 7a1d7b34..665c2c84 100644
--- a/.aitk/scripts/sanitize/generator_amd.py
+++ b/.aitk/scripts/sanitize/generator_amd.py
@@ -11,13 +11,16 @@
 def generate_quantization_config(
-    configFile: Path, modelList: ModelList, parameter: ModelParameter
+    configFile: Path | dict, modelList: ModelList, parameter: ModelParameter
 ) -> Optional[Section]:
     """
     Generates a quantization configuration section for the given file.
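     Accepts either a Path to the Olive config JSON or an already-loaded dict.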
""" - with open_ex(configFile, "r") as f: - content = json.load(f) + if isinstance(configFile, Path): + with open_ex(configFile, "r") as f: + content = json.load(f) + else: + content = configFile parameters = [] data_configs = content.get(OlivePropertyNames.DataConfigs, []) for k, v in content[OlivePropertyNames.Passes].items(): @@ -110,6 +113,19 @@ def generate_quantization_config( ) pre_process_data_config = data_configs[i].get(OlivePropertyNames.PreProcessDataConfig) + + max_seq_len = pre_process_data_config.get(OlivePropertyNames.MaxSeqLen) + if max_seq_len: + parameters.append( + Parameter( + autoGenerated=True, + template=Parameter( + template="QuantizationDatasetLength", + path=f"{OlivePropertyNames.DataConfigs}[{i}].{OlivePropertyNames.PreProcessDataConfig}.{OlivePropertyNames.MaxSeqLen}", + ), + ) + ) + max_samples = pre_process_data_config.get(OlivePropertyNames.MaxSamples) if max_samples: parameters.append( diff --git a/.aitk/scripts/sanitize/generator_qnn.py b/.aitk/scripts/sanitize/generator_qnn.py index 7848420b..7bfae28f 100644 --- a/.aitk/scripts/sanitize/generator_qnn.py +++ b/.aitk/scripts/sanitize/generator_qnn.py @@ -1,10 +1,26 @@ from pathlib import Path +import json +from .constants import OlivePassNames, OlivePropertyNames, PhaseTypeEnum from .generator_amd import generate_quantization_config from .generator_common import create_model_parameter, set_optimization_path from .model_info import ModelList from .model_parameter import ModelParameter -from .utils import isLLM_by_id +from .utils import isLLM_by_id, open_ex + + +def setup_features(content: dict, parameter: ModelParameter): + def add(feature: str): + if parameter.executeRuntimeFeatures is None: + parameter.executeRuntimeFeatures = [] + if feature not in parameter.executeRuntimeFeatures: + parameter.executeRuntimeFeatures.append(feature) + + for k, v in content[OlivePropertyNames.Passes].items(): + if v[OlivePropertyNames.Type].lower() == OlivePassNames.GptqQuantizer: + add("AutoGptq") + elif v[OlivePropertyNames.Type].lower() == OlivePassNames.GptqModel: + add("GptqModel") def generator_qnn(id: str, recipe, folder: Path, modelList: ModelList): @@ -30,9 +46,13 @@ def generator_qnn(id: str, recipe, folder: Path, modelList: ModelList): if "npu" in runtime_values: parameter.isQNNLLM = True - quantize = generate_quantization_config(configFile, modelList, parameter) + with open_ex(configFile, "r") as f: + content = json.load(f) + quantize = generate_quantization_config(content, modelList, parameter) if quantize: parameter.sections.append(quantize) + setup_features(content, parameter) + parameter.writeIfChanged() print(f"\tGenerated QNN configuration for {file}") diff --git a/.aitk/scripts/sanitize/model_parameter.py b/.aitk/scripts/sanitize/model_parameter.py index 4b52a76d..4aac6b77 100644 --- a/.aitk/scripts/sanitize/model_parameter.py +++ b/.aitk/scripts/sanitize/model_parameter.py @@ -237,8 +237,6 @@ class ModelParameter(BaseModelClass): # This kind of config will # - setup runtimeOverwrite for CUDA EP and others # + the previous EP is used for EPContextBinaryGeneator by PythonEnvironment - # - do not support cpu evaluation - # - setup executeRuntimeFeatures, pyEnvRuntimeFeatures isQNNLLM: Optional[bool] = None # SET AUTOMATICALLY TO TRUE WHEN CUDAExecutionProvider # When true, it means some passes need CUDA so user could not run it without @@ -300,9 +298,6 @@ def Check(self, templates: Dict[str, Parameter], oliveJson: Any, modelList: Mode ), ) - if self.isQNNLLM: - self.addCpu = False - # Add runtime syskey, 
system = get_target_system(oliveJson) currentEp: str = system[OlivePropertyNames.Accelerators][0][OlivePropertyNames.ExecutionProviders][0] @@ -320,7 +315,6 @@ def Check(self, templates: Dict[str, Parameter], oliveJson: Any, modelList: Mode executeEp=EPNames.CUDAExecutionProvider, evaluateUsedInExecute=True, ) - self.executeRuntimeFeatures = ["AutoGptq"] if self.runtimeOverwrite and not self.runtimeOverwrite.Check(oliveJson): printError(f"{self._file} runtime overwrite has error") diff --git a/Qwen-Qwen2.5-1.5B-Instruct/aitk/README.md b/Qwen-Qwen2.5-1.5B-Instruct/aitk/README.md index c2b78901..85416245 100644 --- a/Qwen-Qwen2.5-1.5B-Instruct/aitk/README.md +++ b/Qwen-Qwen2.5-1.5B-Instruct/aitk/README.md @@ -52,6 +52,8 @@ To support both efficiently, we create **two model instances**: ## **PTQ + AOT Compilation for Qualcomm NPUs using QNN EP** +**When Quantization Dataset Sequence Length is 1024, it needs about 20GB GPU Memory. So adjust according to your hardware.** + This process extends the [**QDQ Model with 4-bit Weights & 16-bit Activations**](#qdq-model-with-4-bit-weights--16-bit-activations) by compiling it specifically for **Qualcomm NPUs** using the **QNN Execution Provider**. ### **Resource Optimization Strategy** diff --git a/Qwen-Qwen2.5-1.5B-Instruct/aitk/_copy.json.config b/Qwen-Qwen2.5-1.5B-Instruct/aitk/_copy.json.config index fbcb485b..6a6e71a4 100644 --- a/Qwen-Qwen2.5-1.5B-Instruct/aitk/_copy.json.config +++ b/Qwen-Qwen2.5-1.5B-Instruct/aitk/_copy.json.config @@ -1,19 +1,5 @@ { "copies": [ - { - "src": "../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_qnn_config.json", - "dst": "qwen2_5_qnn_config.json", - "replacements": [ - { - "find": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", - "replace": "Qwen/Qwen2.5-1.5B-Instruct" - }, - { - "find": "model/deepseek", - "replace": "model/qwen2_5" - } - ] - }, { "src": "../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_trtrtx_config.json", "dst": "qwen2_5_trtrtx_config.json", @@ -42,11 +28,6 @@ } ] }, - { - "src": "../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_dml_config.json.config", - "dst": "qwen2_5_dml_config.json.config", - "replacements": [] - }, { "src": "../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/README.md", "dst": "README.md", diff --git a/Qwen-Qwen2.5-1.5B-Instruct/aitk/info.yml b/Qwen-Qwen2.5-1.5B-Instruct/aitk/info.yml index ab959901..ef8f7233 100644 --- a/Qwen-Qwen2.5-1.5B-Instruct/aitk/info.yml +++ b/Qwen-Qwen2.5-1.5B-Instruct/aitk/info.yml @@ -37,15 +37,13 @@ recipes: ep: QNNExecutionProvider aitk: oliveFile: "QNN/config_gpu.json" - requirementsPatches: - - AutoGptq isGPURequired: true runtimeOverwrite: executeEp: NvTensorRTRTXExecutionProvider aitk: modelInfo: id: "huggingface/Qwen/Qwen2.5-1.5B-Instruct" - version: 5 + version: 6 groupId: "huggingface/Qwen/Qwen2.5-1.5B-Instruct" groupItemName: "1.5B" p0: true diff --git a/Qwen-Qwen2.5-1.5B-Instruct/aitk/model_project.config b/Qwen-Qwen2.5-1.5B-Instruct/aitk/model_project.config index 8d84fc40..8b192b6f 100644 --- a/Qwen-Qwen2.5-1.5B-Instruct/aitk/model_project.config +++ b/Qwen-Qwen2.5-1.5B-Instruct/aitk/model_project.config @@ -31,6 +31,6 @@ ], "modelInfo": { "id": "huggingface/Qwen/Qwen2.5-1.5B-Instruct", - "version": 5 + "version": 6 } } diff --git a/Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_qnn_config.json b/Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_qnn_config.json index 54a0c6e9..91b7cada 100644 --- a/Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_qnn_config.json +++ 
+++ b/Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_qnn_config.json
@@ -44,26 +44,34 @@
       "pre_process_data_config": {
         "strategy": "line-by-line",
         "add_special_tokens": true,
-        "max_samples": 128,
-        "max_seq_len": 512
+        "max_samples": 256,
+        "max_seq_len": 1024
       }
     }
   ],
   "passes": {
-    "q": {
-      "type": "QuaRot"
+    "cs": {
+      "type": "CaptureSplitInfo",
+      "num_splits": 1,
+      "unique_embeds_lm_head_splits": true
     },
     "g": {
-      "type": "GptqQuantizer",
+      "type": "GptqModel",
+      "bits": 4,
       "sym": true,
       "group_size": -1,
-      "desc_act": true,
-      "data_config": "wikitext2_train_joined"
-    },
-    "cs": {
-      "type": "CaptureSplitInfo",
-      "num_splits": 4,
-      "unique_embeds_lm_head_splits": true
+      "lm_head": true,
+      "rotation": "hadamard",
+      "device": "cuda",
+      "data_config": "wikitext2_train_joined",
+      "dynamic": {
+        "+:.*lm_head*": {
+          "bits": 8,
+          "sym": true,
+          "group_size": 32,
+          "desc_act": false
+        }
+      }
     },
     "mb": {
       "type": "ModelBuilder",
@@ -71,7 +79,6 @@
       "int4_block_size": 32,
       "int4_accuracy_level": 4,
       "int4_op_types_to_quantize": [
-        "MatMul",
         "Gather"
       ]
     },
@@ -80,7 +87,7 @@
       "use_int4": true,
       "add_zero_point": true,
       "nodes_to_exclude": [
-        "/lm_head/MatMul_Q4"
+        "/lm_head/MatMulNBits"
       ],
       "save_as_external_data": true
     },
@@ -93,6 +100,9 @@
         {
           "surgeon": "AttentionMaskToSequenceLengths"
         },
+        {
+          "surgeon": "RemoveGidxFromMatMulNBits"
+        },
         {
           "surgeon": "SimplifiedLayerNormToL2Norm"
         }
diff --git a/Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_qnn_config.json.config b/Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_qnn_config.json.config
index 2e22af4c..d36ab661 100644
--- a/Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_qnn_config.json.config
+++ b/Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_qnn_config.json.config
@@ -15,9 +15,8 @@
     "evaluateUsedInExecute": true
   },
   "executeRuntimeFeatures": [
-    "AutoGptq"
+    "GptqModel"
   ],
-  "addCpu": false,
   "runtime": {
     "autoGenerated": true,
     "name": "Evaluate on",
@@ -185,6 +184,16 @@
         "template": "QuantizationDatasetSplit"
       }
     },
+    {
+      "autoGenerated": true,
+      "name": "Quantization Dataset Sequence Length",
+      "type": "int",
+      "path": "data_configs[1].pre_process_data_config.max_seq_len",
+      "template": {
+        "path": "data_configs[1].pre_process_data_config.max_seq_len",
+        "template": "QuantizationDatasetLength"
+      }
+    },
     {
       "autoGenerated": true,
       "name": "Quantization Dataset Size",
diff --git a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/README.md b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/README.md
index 177708c0..f652df9c 100644
--- a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/README.md
+++ b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/README.md
@@ -52,6 +52,8 @@ To support both efficiently, we create **two model instances**:
 
 ## **PTQ + AOT Compilation for Qualcomm NPUs using QNN EP**
 
+**When the Quantization Dataset Sequence Length is 1024, quantization needs about 20 GB of GPU memory; adjust it to fit your hardware.**
+
 This process extends the [**QDQ Model with 4-bit Weights & 16-bit Activations**](#qdq-model-with-4-bit-weights--16-bit-activations) by compiling it specifically for **Qualcomm NPUs** using the **QNN Execution Provider**.
 
 ### **Resource Optimization Strategy**
diff --git a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_qnn_config.json b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_qnn_config.json
index fc2c4e96..64678a55 100644
--- a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_qnn_config.json
+++ b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_qnn_config.json
@@ -44,8 +44,8 @@
       "pre_process_data_config": {
         "strategy": "line-by-line",
         "add_special_tokens": true,
-        "max_samples": 128,
-        "max_seq_len": 512
+        "max_samples": 256,
+        "max_seq_len": 1024
       }
     }
   ],
diff --git a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_qnn_config.json.config b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_qnn_config.json.config
index 633b2573..414c60dc 100644
--- a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_qnn_config.json.config
+++ b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_qnn_config.json.config
@@ -17,7 +17,6 @@
   "executeRuntimeFeatures": [
     "AutoGptq"
   ],
-  "addCpu": false,
   "runtime": {
     "autoGenerated": true,
     "name": "Evaluate on",
@@ -185,6 +184,16 @@
         "template": "QuantizationDatasetSplit"
       }
     },
+    {
+      "autoGenerated": true,
+      "name": "Quantization Dataset Sequence Length",
+      "type": "int",
+      "path": "data_configs[1].pre_process_data_config.max_seq_len",
+      "template": {
+        "path": "data_configs[1].pre_process_data_config.max_seq_len",
+        "template": "QuantizationDatasetLength"
+      }
+    },
     {
       "autoGenerated": true,
       "name": "Quantization Dataset Size",
diff --git a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/info.yml b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/info.yml
index 309dba9a..a343476e 100644
--- a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/info.yml
+++ b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/info.yml
@@ -37,15 +37,13 @@ recipes:
     ep: QNNExecutionProvider
     aitk:
       oliveFile: "QNN/config_gpu.json"
-      requirementsPatches:
-        - AutoGptq
       isGPURequired: true
       runtimeOverwrite:
         executeEp: NvTensorRTRTXExecutionProvider
 aitk:
   modelInfo:
     id: "huggingface/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
-    version: 5
+    version: 6
     groupId: "huggingface/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
     groupItemName: "1.5B"
   p0: true
diff --git a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/model_project.config b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/model_project.config
index 5969076f..c997fb66 100644
--- a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/model_project.config
+++ b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/model_project.config
@@ -31,6 +31,6 @@
   ],
   "modelInfo": {
     "id": "huggingface/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
-    "version": 5
+    "version": 6
   }
 }
diff --git a/meta-llama-Llama-3.1-8B-Instruct/aitk/_copy.json.config b/meta-llama-Llama-3.1-8B-Instruct/aitk/_copy.json.config
index a9fc1749..816fa047 100644
--- a/meta-llama-Llama-3.1-8B-Instruct/aitk/_copy.json.config
+++ b/meta-llama-Llama-3.1-8B-Instruct/aitk/_copy.json.config
@@ -1,19 +1,5 @@
 {
   "copies": [
-    {
-      "src": "../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_qnn_config.json",
-      "dst": "llama3_1_qnn_config.json",
-      "replacements": [
-        {
-          "find": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
-          "replace": "meta-llama/Llama-3.1-8B-Instruct"
-        },
-        {
-          "find": "model/deepseek",
-          "replace": "model/llama3_1"
-        }
-      ]
-    },
     {
       "src": "../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_vitis_ai_config.json",
       "dst": "llama3_1_vitis_ai_config.json",
@@ -42,11 +28,6 @@
         }
       ]
     },
-    {
"../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_dml_config.json.config", - "dst": "llama3_1_dml_config.json.config", - "replacements": [] - }, { "src": "../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/requirements.txt", "dst": "requirements.txt", diff --git a/meta-llama-Llama-3.1-8B-Instruct/aitk/info.yml b/meta-llama-Llama-3.1-8B-Instruct/aitk/info.yml index a0049c32..9c928c5a 100644 --- a/meta-llama-Llama-3.1-8B-Instruct/aitk/info.yml +++ b/meta-llama-Llama-3.1-8B-Instruct/aitk/info.yml @@ -35,5 +35,5 @@ recipes: aitk: modelInfo: id: "huggingface/meta-llama/Llama-3.1-8B-Instruct" - version: 3 + version: 4 p0: false diff --git a/meta-llama-Llama-3.1-8B-Instruct/aitk/llama3_1_qnn_config.json b/meta-llama-Llama-3.1-8B-Instruct/aitk/llama3_1_qnn_config.json index ec8ff1b8..e7b629fb 100644 --- a/meta-llama-Llama-3.1-8B-Instruct/aitk/llama3_1_qnn_config.json +++ b/meta-llama-Llama-3.1-8B-Instruct/aitk/llama3_1_qnn_config.json @@ -44,8 +44,8 @@ "pre_process_data_config": { "strategy": "line-by-line", "add_special_tokens": true, - "max_samples": 128, - "max_seq_len": 512 + "max_samples": 256, + "max_seq_len": 1024 } } ], @@ -53,25 +53,40 @@ "q": { "type": "QuaRot" }, - "g": { - "type": "GptqQuantizer", - "sym": true, - "group_size": -1, - "desc_act": true, - "data_config": "wikitext2_train_joined" - }, "cs": { "type": "CaptureSplitInfo", "num_splits": 4, "unique_embeds_lm_head_splits": true }, + "g": { + "type": "GptqModel", + "bits": 4, + "sym": true, + "group_size": -1, + "lm_head": true, + "device": "cuda", + "data_config": "wikitext2_train_joined", + "dynamic": { + "+:.*v_proj*": { + "bits": 8, + "sym": true, + "group_size": -1, + "desc_act": true + }, + "+:.*lm_head*": { + "bits": 4, + "sym": true, + "group_size": 32, + "desc_act": false + } + } + }, "mb": { "type": "ModelBuilder", "precision": "int4", "int4_block_size": 32, "int4_accuracy_level": 4, "int4_op_types_to_quantize": [ - "MatMul", "Gather" ] }, @@ -80,7 +95,7 @@ "use_int4": true, "add_zero_point": true, "nodes_to_exclude": [ - "/lm_head/MatMul_Q4" + "/lm_head/MatMulNBits" ], "save_as_external_data": true }, @@ -93,12 +108,25 @@ { "surgeon": "AttentionMaskToSequenceLengths" }, + { + "surgeon": "RemoveGidxFromMatMulNBits" + }, { "surgeon": "SimplifiedLayerNormToL2Norm" } ], "save_as_external_data": true }, + "f16": { + "type": "OnnxFloatToFloat16", + "op_include_list": [ + "GroupQueryAttention" + ], + "keep_io_types": [ + "logits" + ], + "save_as_external_data": true + }, "sq": { "type": "OnnxStaticQuantization", "data_config": "wikitext2_train_act", @@ -113,7 +141,10 @@ "GroupQueryAttention", "MatMulNBits" ], - "save_as_external_data": true + "save_as_external_data": true, + "extra_options": { + "CalibStridedMinMax": 1 + } }, "sp": { "type": "SplitModel" @@ -130,7 +161,7 @@ "htp_graph_finalization_optimization_mode": "3", "soc_model": "60" }, - "weight_sharing": true + "weight_sharing": false }, "cp": { "type": "ComposeOnnxModels" diff --git a/meta-llama-Llama-3.1-8B-Instruct/aitk/llama3_1_qnn_config.json.config b/meta-llama-Llama-3.1-8B-Instruct/aitk/llama3_1_qnn_config.json.config index 52df65a0..a4d3dca9 100644 --- a/meta-llama-Llama-3.1-8B-Instruct/aitk/llama3_1_qnn_config.json.config +++ b/meta-llama-Llama-3.1-8B-Instruct/aitk/llama3_1_qnn_config.json.config @@ -15,9 +15,8 @@ "evaluateUsedInExecute": true }, "executeRuntimeFeatures": [ - "AutoGptq" + "GptqModel" ], - "addCpu": false, "runtime": { "autoGenerated": true, "name": "Evaluate on", @@ -185,6 +184,16 @@ "template": "QuantizationDatasetSplit" } }, 
+ { + "autoGenerated": true, + "name": "Quantization Dataset Sequence Length", + "type": "int", + "path": "data_configs[1].pre_process_data_config.max_seq_len", + "template": { + "path": "data_configs[1].pre_process_data_config.max_seq_len", + "template": "QuantizationDatasetLength" + } + }, { "autoGenerated": true, "name": "Quantization Dataset Size", diff --git a/meta-llama-Llama-3.1-8B-Instruct/aitk/model_project.config b/meta-llama-Llama-3.1-8B-Instruct/aitk/model_project.config index 9f893120..8a2b9d15 100644 --- a/meta-llama-Llama-3.1-8B-Instruct/aitk/model_project.config +++ b/meta-llama-Llama-3.1-8B-Instruct/aitk/model_project.config @@ -27,6 +27,6 @@ ], "modelInfo": { "id": "huggingface/meta-llama/Llama-3.1-8B-Instruct", - "version": 3 + "version": 4 } } diff --git a/meta-llama-Llama-3.2-1B-Instruct/aitk/README.md b/meta-llama-Llama-3.2-1B-Instruct/aitk/README.md index 0dd6ffa4..e092f04e 100644 --- a/meta-llama-Llama-3.2-1B-Instruct/aitk/README.md +++ b/meta-llama-Llama-3.2-1B-Instruct/aitk/README.md @@ -52,6 +52,8 @@ To support both efficiently, we create **two model instances**: ## **PTQ + AOT Compilation for Qualcomm NPUs using QNN EP** +**When Quantization Dataset Sequence Length is 1024, it needs about 20GB GPU Memory. So adjust according to your hardware.** + This process extends the [**QDQ Model with 4-bit Weights & 16-bit Activations**](#qdq-model-with-4-bit-weights--16-bit-activations) by compiling it specifically for **Qualcomm NPUs** using the **QNN Execution Provider**. ### **Resource Optimization Strategy** diff --git a/meta-llama-Llama-3.2-1B-Instruct/aitk/_copy.json.config b/meta-llama-Llama-3.2-1B-Instruct/aitk/_copy.json.config index 5713565e..d539528b 100644 --- a/meta-llama-Llama-3.2-1B-Instruct/aitk/_copy.json.config +++ b/meta-llama-Llama-3.2-1B-Instruct/aitk/_copy.json.config @@ -42,11 +42,6 @@ } ] }, - { - "src": "../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_dml_config.json.config", - "dst": "llama3_2_dml_config.json.config", - "replacements": [] - }, { "src": "../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/README.md", "dst": "README.md", diff --git a/meta-llama-Llama-3.2-1B-Instruct/aitk/info.yml b/meta-llama-Llama-3.2-1B-Instruct/aitk/info.yml index b5ab37f3..02c1dcfb 100644 --- a/meta-llama-Llama-3.2-1B-Instruct/aitk/info.yml +++ b/meta-llama-Llama-3.2-1B-Instruct/aitk/info.yml @@ -37,13 +37,11 @@ recipes: ep: QNNExecutionProvider aitk: oliveFile: "QNN/config_gpu.json" - requirementsPatches: - - AutoGptq isGPURequired: true runtimeOverwrite: executeEp: NvTensorRTRTXExecutionProvider aitk: modelInfo: id: "huggingface/meta-llama/Llama-3.2-1B-Instruct" - version: 5 + version: 6 p0: true diff --git a/meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_qnn_config.json b/meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_qnn_config.json index 37f5444e..f0586873 100644 --- a/meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_qnn_config.json +++ b/meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_qnn_config.json @@ -44,8 +44,8 @@ "pre_process_data_config": { "strategy": "line-by-line", "add_special_tokens": true, - "max_samples": 128, - "max_seq_len": 512 + "max_samples": 256, + "max_seq_len": 1024 } } ], diff --git a/meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_qnn_config.json.config b/meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_qnn_config.json.config index 9dd31149..54c287a3 100644 --- a/meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_qnn_config.json.config +++ b/meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_qnn_config.json.config @@ -17,7 +17,6 @@ 
"executeRuntimeFeatures": [ "AutoGptq" ], - "addCpu": false, "runtime": { "autoGenerated": true, "name": "Evaluate on", @@ -185,6 +184,16 @@ "template": "QuantizationDatasetSplit" } }, + { + "autoGenerated": true, + "name": "Quantization Dataset Sequence Length", + "type": "int", + "path": "data_configs[1].pre_process_data_config.max_seq_len", + "template": { + "path": "data_configs[1].pre_process_data_config.max_seq_len", + "template": "QuantizationDatasetLength" + } + }, { "autoGenerated": true, "name": "Quantization Dataset Size", diff --git a/meta-llama-Llama-3.2-1B-Instruct/aitk/model_project.config b/meta-llama-Llama-3.2-1B-Instruct/aitk/model_project.config index b63789af..3887e2f4 100644 --- a/meta-llama-Llama-3.2-1B-Instruct/aitk/model_project.config +++ b/meta-llama-Llama-3.2-1B-Instruct/aitk/model_project.config @@ -31,6 +31,6 @@ ], "modelInfo": { "id": "huggingface/meta-llama/Llama-3.2-1B-Instruct", - "version": 5 + "version": 6 } } diff --git a/microsoft-Phi-3.5-mini-instruct/aitk/README.md b/microsoft-Phi-3.5-mini-instruct/aitk/README.md index ada2f2dc..a0603bc5 100644 --- a/microsoft-Phi-3.5-mini-instruct/aitk/README.md +++ b/microsoft-Phi-3.5-mini-instruct/aitk/README.md @@ -52,6 +52,8 @@ To support both efficiently, we create **two model instances**: ## **PTQ + AOT Compilation for Qualcomm NPUs using QNN EP** +**When Quantization Dataset Sequence Length is 1024, it needs about 20GB GPU Memory. So adjust according to your hardware.** + This process extends the [**QDQ Model with 4-bit Weights & 16-bit Activations**](#qdq-model-with-4-bit-weights--16-bit-activations) by compiling it specifically for **Qualcomm NPUs** using the **QNN Execution Provider**. ### **Resource Optimization Strategy** diff --git a/microsoft-Phi-3.5-mini-instruct/aitk/_copy.json.config b/microsoft-Phi-3.5-mini-instruct/aitk/_copy.json.config index 5d9b59ff..97e833c1 100644 --- a/microsoft-Phi-3.5-mini-instruct/aitk/_copy.json.config +++ b/microsoft-Phi-3.5-mini-instruct/aitk/_copy.json.config @@ -1,19 +1,5 @@ { "copies": [ - { - "src": "../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_qnn_config.json", - "dst": "phi3_5_qnn_config.json", - "replacements": [ - { - "find": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", - "replace": "microsoft/Phi-3.5-mini-instruct" - }, - { - "find": "model/deepseek", - "replace": "model/phi3_5" - } - ] - }, { "src": "../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_dml_config.json", "dst": "phi3_5_dml_config.json", @@ -28,11 +14,6 @@ } ] }, - { - "src": "../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_dml_config.json.config", - "dst": "phi3_5_dml_config.json.config", - "replacements": [] - }, { "src": "../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/README.md", "dst": "README.md", diff --git a/microsoft-Phi-3.5-mini-instruct/aitk/info.yml b/microsoft-Phi-3.5-mini-instruct/aitk/info.yml index eeb4fe32..c45baad8 100644 --- a/microsoft-Phi-3.5-mini-instruct/aitk/info.yml +++ b/microsoft-Phi-3.5-mini-instruct/aitk/info.yml @@ -37,13 +37,11 @@ recipes: ep: QNNExecutionProvider aitk: oliveFile: "QNN/config_gpu.json" - requirementsPatches: - - AutoGptq isGPURequired: true runtimeOverwrite: executeEp: NvTensorRTRTXExecutionProvider aitk: modelInfo: id: "huggingface/microsoft/Phi-3.5-mini-instruct" - version: 5 + version: 6 p0: true diff --git a/microsoft-Phi-3.5-mini-instruct/aitk/model_project.config b/microsoft-Phi-3.5-mini-instruct/aitk/model_project.config index a1ac1eb2..d5d2fe50 100644 --- 
a/microsoft-Phi-3.5-mini-instruct/aitk/model_project.config +++ b/microsoft-Phi-3.5-mini-instruct/aitk/model_project.config @@ -31,6 +31,6 @@ ], "modelInfo": { "id": "huggingface/microsoft/Phi-3.5-mini-instruct", - "version": 5 + "version": 6 } } diff --git a/microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_qnn_config.json b/microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_qnn_config.json index 5ef81f64..2bd35dca 100644 --- a/microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_qnn_config.json +++ b/microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_qnn_config.json @@ -44,8 +44,8 @@ "pre_process_data_config": { "strategy": "line-by-line", "add_special_tokens": true, - "max_samples": 128, - "max_seq_len": 512 + "max_samples": 256, + "max_seq_len": 1024 } } ], @@ -53,25 +53,40 @@ "q": { "type": "QuaRot" }, - "g": { - "type": "GptqQuantizer", - "sym": true, - "group_size": -1, - "desc_act": true, - "data_config": "wikitext2_train_joined" - }, "cs": { "type": "CaptureSplitInfo", "num_splits": 4, "unique_embeds_lm_head_splits": true }, + "g": { + "type": "GptqModel", + "bits": 4, + "sym": true, + "group_size": -1, + "lm_head": true, + "device": "cuda", + "data_config": "wikitext2_train_joined", + "dynamic": { + "+:.*layers\\.[0-9](?!\\d).*v_proj.*": { + "bits": 8, + "sym": true, + "group_size": -1, + "desc_act": true + }, + "+:.*lm_head.*": { + "bits": 8, + "sym": true, + "group_size": 32, + "desc_act": false + } + } + }, "mb": { "type": "ModelBuilder", "precision": "int4", "int4_block_size": 32, "int4_accuracy_level": 4, "int4_op_types_to_quantize": [ - "MatMul", "Gather" ] }, @@ -80,7 +95,7 @@ "use_int4": true, "add_zero_point": true, "nodes_to_exclude": [ - "/lm_head/MatMul_Q4" + "/lm_head/MatMulNBits" ], "save_as_external_data": true }, @@ -93,12 +108,25 @@ { "surgeon": "AttentionMaskToSequenceLengths" }, + { + "surgeon": "RemoveGidxFromMatMulNBits" + }, { "surgeon": "SimplifiedLayerNormToL2Norm" } ], "save_as_external_data": true }, + "f16": { + "type": "OnnxFloatToFloat16", + "op_include_list": [ + "GroupQueryAttention" + ], + "keep_io_types": [ + "logits" + ], + "save_as_external_data": true + }, "sq": { "type": "OnnxStaticQuantization", "data_config": "wikitext2_train_act", @@ -130,7 +158,7 @@ "htp_graph_finalization_optimization_mode": "3", "soc_model": "60" }, - "weight_sharing": true + "weight_sharing": false }, "cp": { "type": "ComposeOnnxModels" diff --git a/microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_qnn_config.json.config b/microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_qnn_config.json.config index 52df65a0..a4d3dca9 100644 --- a/microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_qnn_config.json.config +++ b/microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_qnn_config.json.config @@ -15,9 +15,8 @@ "evaluateUsedInExecute": true }, "executeRuntimeFeatures": [ - "AutoGptq" + "GptqModel" ], - "addCpu": false, "runtime": { "autoGenerated": true, "name": "Evaluate on", @@ -185,6 +184,16 @@ "template": "QuantizationDatasetSplit" } }, + { + "autoGenerated": true, + "name": "Quantization Dataset Sequence Length", + "type": "int", + "path": "data_configs[1].pre_process_data_config.max_seq_len", + "template": { + "path": "data_configs[1].pre_process_data_config.max_seq_len", + "template": "QuantizationDatasetLength" + } + }, { "autoGenerated": true, "name": "Quantization Dataset Size",
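
Reviewer note on the two mechanisms this change leans on. First, the `dynamic` blocks in the new `GptqModel` passes appear to follow GPTQModel's per-module override convention: each key is a regex, a `+:` prefix applies the override to matching modules, and a `-:` prefix excludes them from quantization. Second, the `.json.config` parameters address values inside the Olive JSON through dotted paths such as `data_configs[1].pre_process_data_config.max_seq_len`. The sketch below illustrates both in plain Python; `resolve_quant_config` and `resolve_path` are hypothetical helpers written for illustration, not part of GPTQModel or the AITK tooling.

```python
import re

# Defaults mirroring the "g" pass above: 4-bit, symmetric, per-channel (group_size -1).
BASE = {"bits": 4, "sym": True, "group_size": -1}

# The "dynamic" mapping from the Qwen2.5 QNN config: lm_head is kept at 8 bits
# with group size 32 instead of the 4-bit default.
DYNAMIC = {
    "+:.*lm_head*": {"bits": 8, "sym": True, "group_size": 32, "desc_act": False},
}


def resolve_quant_config(module_name: str) -> dict | None:
    """Hypothetical illustration of per-module override resolution."""
    cfg = dict(BASE)
    for key, override in DYNAMIC.items():
        prefix, pattern = key[:2], key[2:]
        if re.search(pattern, module_name):
            if prefix == "-:":
                return None  # negative match: module excluded from quantization
            cfg.update(override)  # "+:": apply the override on top of the base
    return cfg


def resolve_path(doc, path: str):
    """Follow a dotted template path like 'data_configs[1].pre_process_data_config.max_seq_len'."""
    for part in path.split("."):
        m = re.fullmatch(r"(\w+)\[(\d+)\]", part)
        doc = doc[m.group(1)][int(m.group(2))] if m else doc[part]
    return doc


if __name__ == "__main__":
    print(resolve_quant_config("model.layers.0.self_attn.v_proj"))  # base 4-bit settings
    print(resolve_quant_config("lm_head"))                          # 8-bit override applied
    olive = {"data_configs": [{}, {"pre_process_data_config": {"max_seq_len": 1024}}]}
    print(resolve_path(olive, "data_configs[1].pre_process_data_config.max_seq_len"))  # 1024
```

Run as-is, this prints the base 4-bit settings for an attention projection, the 8-bit `lm_head` override, and the `max_seq_len` value read back through the same path string the `QuantizationDatasetLength` template entries use.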