From 7d92a4554e9d8bb81ebb4443177be85055d0016c Mon Sep 17 00:00:00 2001
From: hualxie <hualxie@microsoft.com>
Date: Thu, 12 Feb 2026 14:29:08 +0800
Subject: [PATCH 1/2] update readme and profiling req

---
 .aitk/configs/checks.json                      |  2 +-
 .aitk/requirements/requirements-Profiling.txt  |  1 +
 Qwen-Qwen2.5-1.5B-Instruct/aitk/README.md      |  7 +++++--
 .../aitk/README.md                             |  7 +++++--
 .../aitk/_copy.json.config                     | 18 ------------------
 .../aitk/README.md                             |  7 +++++--
 microsoft-Phi-3.5-mini-instruct/aitk/README.md |  7 +++++--
 7 files changed, 22 insertions(+), 27 deletions(-)

diff --git a/.aitk/configs/checks.json b/.aitk/configs/checks.json
index f7dec5a7..256887da 100644
--- a/.aitk/configs/checks.json
+++ b/.aitk/configs/checks.json
@@ -1,6 +1,6 @@
 {
     "configCheck": 139,
-    "copyCheck": 179,
+    "copyCheck": 178,
     "extensionCheck": 1,
     "gitignoreCheck": 40,
     "inferenceModelCheck": 25,
diff --git a/.aitk/requirements/requirements-Profiling.txt b/.aitk/requirements/requirements-Profiling.txt
index 4ff32c6e..9091f54a 100644
--- a/.aitk/requirements/requirements-Profiling.txt
+++ b/.aitk/requirements/requirements-Profiling.txt
@@ -5,6 +5,7 @@ mpmath==1.3.0
 numpy==2.2.4
 # onnx==1.17.0
 onnx==1.17.0
+onnxruntime-genai-winml==0.11.2
 # uvpip:uninstall onnxruntime-winml;pre
 # We also need to uninstall in case user tries new version, uses previous version and then updates again
 # because uninstalling winml will remove onnxruntime folder but we will not install windowsml to add it back
diff --git a/Qwen-Qwen2.5-1.5B-Instruct/aitk/README.md b/Qwen-Qwen2.5-1.5B-Instruct/aitk/README.md
index 13a61646..1c3ef30a 100644
--- a/Qwen-Qwen2.5-1.5B-Instruct/aitk/README.md
+++ b/Qwen-Qwen2.5-1.5B-Instruct/aitk/README.md
@@ -2,14 +2,17 @@
 
 This repository demonstrates the optimization of the [Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) model using **post-training quantization (PTQ)** techniques. The optimization process is divided into these workflows:
 
-- QDQ for AMD NPU
+- Quark Quantization for AMD NPU
 - PTQ + AOT for QNN NPU
    + This process extends the QDQ flow and compiling specifically for **Qualcomm NPUs**
+- int4 Quantization for QNN GPU
 - OpenVINO for Intel® CPU/GPU/NPU
    + This process uses OpenVINO specific passes like `OpenVINOOptimumConversion`, `OpenVINOIoUpdate` and `OpenVINOEncapsulation`
 - Float downcasting for NVIDIA TRT for RTX GPU
 - DML for general GPU
-   + This process uses AutoAWQ and ModelBuilder
+   + This process uses ModelBuilder
+
+**Some Python packages require Visual Studio 2022 or the Visual Studio 2022 Build Tools with the C++ development tools installed.**
 
 ## **QDQ Model with 4-bit Weights & 16-bit Activations**
 
diff --git a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/README.md b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/README.md
index 50d6122d..60502e1a 100644
--- a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/README.md
+++ b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/README.md
@@ -2,14 +2,17 @@
 
 This repository demonstrates the optimization of the [DeepSeek-R1-Distill-Qwen-1.5B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) model using **post-training quantization (PTQ)** techniques. The optimization process is divided into these workflows:
 
-- QDQ for AMD NPU
+- Quark Quantization for AMD NPU
 - PTQ + AOT for QNN NPU
    + This process extends the QDQ flow and compiling specifically for **Qualcomm NPUs**
+- int4 Quantization for QNN GPU
 - OpenVINO for Intel® CPU/GPU/NPU
    + This process uses OpenVINO specific passes like `OpenVINOOptimumConversion`, `OpenVINOIoUpdate` and `OpenVINOEncapsulation`
 - Float downcasting for NVIDIA TRT for RTX GPU
 - DML for general GPU
-   + This process uses AutoAWQ and ModelBuilder
+   + This process uses ModelBuilder
+
+**Some Python packages require Visual Studio 2022 or the Visual Studio 2022 Build Tools with the C++ development tools installed.**
 
 ## **QDQ Model with 4-bit Weights & 16-bit Activations**
 
diff --git a/meta-llama-Llama-3.1-8B-Instruct/aitk/_copy.json.config b/meta-llama-Llama-3.1-8B-Instruct/aitk/_copy.json.config
index db7d8f65..a9fc1749 100644
--- a/meta-llama-Llama-3.1-8B-Instruct/aitk/_copy.json.config
+++ b/meta-llama-Llama-3.1-8B-Instruct/aitk/_copy.json.config
@@ -47,24 +47,6 @@
             "dst": "llama3_1_dml_config.json.config",
             "replacements": []
         },
-        {
-            "src": "../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/README.md",
-            "dst": "README.md",
-            "replacements": [
-                {
-                    "find": "# DeepSeek-R1-Distill-Qwen-1.5B Model Optimization",
-                    "replace": "# Llama-3.1-8B-Instruct Model Optimization"
-                },
-                {
-                    "find": "[DeepSeek-R1-Distill-Qwen-1.5B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B)",
-                    "replace": "[Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct)"
-                },
-                {
-                    "find": "> ⚠️ If got 6033 error, replace `genai_config.json` in `./model` folder",
-                    "replace": ""
-                }
-            ]
-        },
         {
             "src": "../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/requirements.txt",
             "dst": "requirements.txt",
diff --git a/meta-llama-Llama-3.2-1B-Instruct/aitk/README.md b/meta-llama-Llama-3.2-1B-Instruct/aitk/README.md
index 573bf132..285c1ee4 100644
--- a/meta-llama-Llama-3.2-1B-Instruct/aitk/README.md
+++ b/meta-llama-Llama-3.2-1B-Instruct/aitk/README.md
@@ -2,14 +2,17 @@
 
 This repository demonstrates the optimization of the [Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) model using **post-training quantization (PTQ)** techniques. The optimization process is divided into these workflows:
 
-- QDQ for AMD NPU
+- Quark Quantization for AMD NPU
 - PTQ + AOT for QNN NPU
    + This process extends the QDQ flow and compiling specifically for **Qualcomm NPUs**
+- int4 Quantization for QNN GPU
 - OpenVINO for Intel® CPU/GPU/NPU
    + This process uses OpenVINO specific passes like `OpenVINOOptimumConversion`, `OpenVINOIoUpdate` and `OpenVINOEncapsulation`
 - Float downcasting for NVIDIA TRT for RTX GPU
 - DML for general GPU
-   + This process uses AutoAWQ and ModelBuilder
+   + This process uses ModelBuilder
+
+**Some Python packages require Visual Studio 2022 or the Visual Studio 2022 Build Tools with the C++ development tools installed.**
 
 ## **QDQ Model with 4-bit Weights & 16-bit Activations**
 
diff --git a/microsoft-Phi-3.5-mini-instruct/aitk/README.md b/microsoft-Phi-3.5-mini-instruct/aitk/README.md
index cd635e33..be4d058e 100644
--- a/microsoft-Phi-3.5-mini-instruct/aitk/README.md
+++ b/microsoft-Phi-3.5-mini-instruct/aitk/README.md
@@ -2,14 +2,17 @@
 
 This repository demonstrates the optimization of the [Microsoft Phi-3.5 Mini Instruct](https://huggingface.co/microsoft/Phi-3.5-mini-instruct) model using **post-training quantization (PTQ)** techniques. The optimization process is divided into these workflows:
 
-- QDQ for AMD NPU
+- Quark Quantization for AMD NPU
 - PTQ + AOT for QNN NPU
    + This process extends the QDQ flow and compiling specifically for **Qualcomm NPUs**
+- int4 Quantization for QNN GPU
 - OpenVINO for Intel® CPU/GPU/NPU
    + This process uses OpenVINO specific passes like `OpenVINOOptimumConversion`, `OpenVINOIoUpdate` and `OpenVINOEncapsulation`
 - Float downcasting for NVIDIA TRT for RTX GPU
 - DML for general GPU
-   + This process uses AutoAWQ and ModelBuilder
+   + This process uses ModelBuilder
+
+**Some Python packages require Visual Studio 2022 or the Visual Studio 2022 Build Tools with the C++ development tools installed.**
 
 ## **QDQ Model with 4-bit Weights & 16-bit Activations**
 

From fbaa57e000e6ed1cbe66f48c54d0c10acd64f0dc Mon Sep 17 00:00:00 2001
From: hualxie <hualxie@microsoft.com>
Date: Thu, 12 Feb 2026 14:33:42 +0800
Subject: [PATCH 2/2] capitalize "Int4" in README workflow lists

---
 Qwen-Qwen2.5-1.5B-Instruct/aitk/README.md                | 2 +-
 deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/README.md | 2 +-
 meta-llama-Llama-3.2-1B-Instruct/aitk/README.md          | 2 +-
 microsoft-Phi-3.5-mini-instruct/aitk/README.md           | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/Qwen-Qwen2.5-1.5B-Instruct/aitk/README.md b/Qwen-Qwen2.5-1.5B-Instruct/aitk/README.md
index 1c3ef30a..c2b78901 100644
--- a/Qwen-Qwen2.5-1.5B-Instruct/aitk/README.md
+++ b/Qwen-Qwen2.5-1.5B-Instruct/aitk/README.md
@@ -5,7 +5,7 @@ This repository demonstrates the optimization of the [Qwen2.5-1.5B-Instruct](htt
 - Quark Quantization for AMD NPU
 - PTQ + AOT for QNN NPU
    + This process extends the QDQ flow and compiling specifically for **Qualcomm NPUs**
-- int4 Quantization for QNN GPU
+- Int4 Quantization for QNN GPU
 - OpenVINO for Intel® CPU/GPU/NPU
    + This process uses OpenVINO specific passes like `OpenVINOOptimumConversion`, `OpenVINOIoUpdate` and `OpenVINOEncapsulation`
 - Float downcasting for NVIDIA TRT for RTX GPU
diff --git a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/README.md b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/README.md
index 60502e1a..177708c0 100644
--- a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/README.md
+++ b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/README.md
@@ -5,7 +5,7 @@ This repository demonstrates the optimization of the [DeepSeek-R1-Distill-Qwen-1
 - Quark Quantization for AMD NPU
 - PTQ + AOT for QNN NPU
    + This process extends the QDQ flow and compiling specifically for **Qualcomm NPUs**
-- int4 Quantization for QNN GPU
+- Int4 Quantization for QNN GPU
 - OpenVINO for Intel® CPU/GPU/NPU
    + This process uses OpenVINO specific passes like `OpenVINOOptimumConversion`, `OpenVINOIoUpdate` and `OpenVINOEncapsulation`
 - Float downcasting for NVIDIA TRT for RTX GPU
diff --git a/meta-llama-Llama-3.2-1B-Instruct/aitk/README.md b/meta-llama-Llama-3.2-1B-Instruct/aitk/README.md
index 285c1ee4..0dd6ffa4 100644
--- a/meta-llama-Llama-3.2-1B-Instruct/aitk/README.md
+++ b/meta-llama-Llama-3.2-1B-Instruct/aitk/README.md
@@ -5,7 +5,7 @@ This repository demonstrates the optimization of the [Llama-3.2-1B-Instruct](htt
 - Quark Quantization for AMD NPU
 - PTQ + AOT for QNN NPU
    + This process extends the QDQ flow and compiling specifically for **Qualcomm NPUs**
-- int4 Quantization for QNN GPU
+- Int4 Quantization for QNN GPU
 - OpenVINO for Intel® CPU/GPU/NPU
    + This process uses OpenVINO specific passes like `OpenVINOOptimumConversion`, `OpenVINOIoUpdate` and `OpenVINOEncapsulation`
 - Float downcasting for NVIDIA TRT for RTX GPU
diff --git a/microsoft-Phi-3.5-mini-instruct/aitk/README.md b/microsoft-Phi-3.5-mini-instruct/aitk/README.md
index be4d058e..ada2f2dc 100644
--- a/microsoft-Phi-3.5-mini-instruct/aitk/README.md
+++ b/microsoft-Phi-3.5-mini-instruct/aitk/README.md
@@ -5,7 +5,7 @@ This repository demonstrates the optimization of the [Microsoft Phi-3.5 Mini Ins
 - Quark Quantization for AMD NPU
 - PTQ + AOT for QNN NPU
    + This process extends the QDQ flow and compiling specifically for **Qualcomm NPUs**
-- int4 Quantization for QNN GPU
+- Int4 Quantization for QNN GPU
 - OpenVINO for Intel® CPU/GPU/NPU
    + This process uses OpenVINO specific passes like `OpenVINOOptimumConversion`, `OpenVINOIoUpdate` and `OpenVINOEncapsulation`
 - Float downcasting for NVIDIA TRT for RTX GPU