From 52f51202e9a6249dc38a105216181fad7798a730 Mon Sep 17 00:00:00 2001
From: yiliu30
Date: Mon, 8 Dec 2025 18:29:14 -0800
Subject: [PATCH 1/5] Add ds nvfp4

Signed-off-by: yiliu30
---
 .../auto_round/deepseek/README.md         | 20 ++++++++++++++++---
 .../auto_round/deepseek/quantize.py       |  8 +++++++-
 .../auto_round/deepseek/run_evaluation.sh | 11 ++++++++--
 .../auto_round/deepseek/run_generate.sh   | 12 +++++++++---
 4 files changed, 42 insertions(+), 9 deletions(-)

diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/README.md
index fee88c56a89..2e6b97f0649 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/README.md
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/README.md
@@ -1,4 +1,4 @@
-This example provides an end-to-end workflow to quantize DeepSeek models to MXFP4/MXFP8 and evaluate them using a custom vLLM fork.
+This example provides an end-to-end workflow to quantize DeepSeek models to MXFP4/MXFP8/NVFP4 and evaluate them using a custom vLLM fork.
 
 ## Requirement
 ```bash
@@ -29,13 +29,18 @@ bash run_quant.sh --model $MODEL -t mxfp8 --output_dir ./qmodels
 bash run_quant.sh --model $MODEL -t mxfp4 --output_dir ./qmodels
 ```
 
+- NVFP4
+```bash
+bash run_quant.sh --model $MODEL -t nvfp4 --output_dir ./qmodels
+```
+
 ## Evaluation
 
 ### Prompt Tests
 
 Usage:
 ```bash
-bash ./run_generate.sh -s [mxfp4|mxfp8] -tp [tensor_parallel_size] -m [model_path]
+bash ./run_generate.sh -s [mxfp4|mxfp8|nvfp4] -tp [tensor_parallel_size] -m [model_path]
 ```
 
 - MXFP8
@@ -46,12 +51,16 @@ bash ./run_generate.sh -s mxfp8 -tp 8 -m /path/to/ds_mxfp8
 ```bash
 bash ./run_generate.sh -s mxfp4 -tp 8 -m /path/to/ds_mxfp4
 ```
+- NVFP4
+```bash
+bash ./run_generate.sh -s nvfp4 -tp 8 -m /path/to/ds_nvfp4
+```
 
 ### Evaluation
 
 Usage:
 ```bash
-bash run_evaluation.sh -m [model_path] -s [mxfp4|mxfp8] -t [task_name] -tp [tensor_parallel_size] -b [batch_size]
+bash run_evaluation.sh -m [model_path] -s [mxfp4|mxfp8|nvfp4] -t [task_name] -tp [tensor_parallel_size] -b [batch_size]
 ```
 ```bash
 bash run_evaluation.sh -s mxfp8 -t piqa,hellaswag,mmlu -tp 8 -b 512 -m /path/to/ds_mxfp8
@@ -62,4 +71,9 @@ bash run_evaluation.sh -s mxfp8 -t gsm8k -tp 8 -b 256 -m /path/to/ds_mxfp8
 ```bash
 bash run_evaluation.sh -s mxfp4 -t piqa,hellaswag,mmlu -tp 8 -b 512 -m /path/to/ds_mxfp4
 bash run_evaluation.sh -s mxfp4 -t gsm8k -tp 8 -b 256 -m /path/to/ds_mxfp4
+```
+- NVFP4
+```bash
+bash run_evaluation.sh -s nvfp4 -t piqa,hellaswag,mmlu -tp 8 -b 512 -m /path/to/ds_nvfp4
+bash run_evaluation.sh -s nvfp4 -t gsm8k -tp 8 -b 256 -m /path/to/ds_nvfp4
 ```
\ No newline at end of file
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py
index 9becc2cecf9..f2b59e84277 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py
@@ -32,6 +32,11 @@
         "fp_layers": "lm_head,self_attn",
         "iters": 0,
     },
+    "nvfp4": {
+        "scheme": "NVFP4",
+        "fp_layers": "lm_head,self_attn",
+        "iters": 0,
+    },
 }
 
 
@@ -63,11 +68,12 @@ def quant_model(args):
     quant_config = AutoRoundConfig(
         tokenizer=tokenizer,
         scheme=config["scheme"],
-        enable_torch_compile=args.enable_torch_compile,
+        enable_torch_compile=True,
         iters=config["iters"],
         fp_layers=config["fp_layers"],
         export_format=export_format,
         output_dir=output_dir,
+        low_gpu_mem_usage=True,
     )
 
     # quantizer execute
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh
index 1d805c7872b..8206bacd977 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh
@@ -11,7 +11,7 @@ BATCH_SIZE=512
 
 # Function to display usage
 usage() {
-    echo "Usage: $0 -m [model_path] -s [mxfp4|mxfp8] -t [task_name] -tp [tensor_parallel_size] -b [batch_size]"
+    echo "Usage: $0 -m [model_path] -s [mxfp4|mxfp8|nvfp4] -t [task_name] -tp [tensor_parallel_size] -b [batch_size]"
     echo "  -m: Path to the quantized model (required)"
     echo "  -s: Quantization scheme (mxfp4 or mxfp8, default: mxfp8)"
     echo "  -t: Task name(s) to evaluate (default: piqa,hellaswag,mmlu)"
@@ -80,6 +80,13 @@ if [[ "$SCHEME" == "mxfp4" ]]; then
     VLLM_ENABLE_STATIC_MOE=0
     VLLM_USE_DEEP_GEMM=0
     VLLM_ENABLE_AR_EXT=1
+elif [[ "$SCHEME" == "nvfp4" ]]; then
+    VLLM_AR_MXFP4_MODULAR_MOE=0
+    VLLM_MXFP4_PRE_UNPACK_TO_FP8=0
+    VLLM_MXFP4_PRE_UNPACK_WEIGHTS=0
+    VLLM_ENABLE_STATIC_MOE=0
+    VLLM_USE_DEEP_GEMM=0
+    VLLM_ENABLE_AR_EXT=0
 elif [[ "$SCHEME" == "mxfp8" ]]; then
     VLLM_AR_MXFP4_MODULAR_MOE=0
     VLLM_MXFP4_PRE_UNPACK_TO_FP8=0
@@ -88,7 +95,7 @@ elif [[ "$SCHEME" == "mxfp8" ]]; then
     VLLM_USE_DEEP_GEMM=0
     VLLM_ENABLE_AR_EXT=1
 else
-    echo "Error: Invalid quantization scheme (-s). Must be 'mxfp4' or 'mxfp8'."
+    echo "Error: Invalid quantization scheme (-s). Must be 'mxfp4', 'nvfp4' or 'mxfp8'."
     usage
     exit 1
 fi
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_generate.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_generate.sh
index c9ee73ce182..7453481cba4 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_generate.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_generate.sh
@@ -52,8 +52,8 @@ done
 
 # Validate quantization type
 QUANT_TYPE_UPPER=$(echo "$QUANT_TYPE" | tr '[:lower:]' '[:upper:]')
-if [[ "$QUANT_TYPE_UPPER" != "MXFP4" && "$QUANT_TYPE_UPPER" != "MXFP8" ]]; then
-    echo "Error: Quantization type must be mxfp4 or mxfp8"
+if [[ "$QUANT_TYPE_UPPER" != "MXFP4" && "$QUANT_TYPE_UPPER" != "MXFP8" && "$QUANT_TYPE_UPPER" != "NVFP4" ]]; then
+    echo "Error: Quantization type must be mxfp4, mxfp8 or nvfp4"
     usage
     exit 1
 fi
@@ -81,19 +81,25 @@ echo "  Model: $MODEL_PATH"
 echo "  Tensor Parallelism: $TP_SIZE"
 echo ""
 
 # Set environment variables based on quantization type
 if [[ "$QUANT_TYPE_UPPER" == "MXFP4" ]]; then
+    export VLLM_ENABLE_AR_EXT=1
     export VLLM_AR_MXFP4_MODULAR_MOE=1
     export VLLM_MXFP4_PRE_UNPACK_TO_FP8=1
     echo "Using MXFP4 configuration"
+elif [[ "$QUANT_TYPE_UPPER" == "NVFP4" ]]; then
+    export VLLM_ENABLE_AR_EXT=0
+    export VLLM_AR_MXFP4_MODULAR_MOE=0
+    export VLLM_MXFP4_PRE_UNPACK_TO_FP8=0
+    echo "Using NVFP4 configuration"
 else
+    export VLLM_ENABLE_AR_EXT=1
     export VLLM_AR_MXFP4_MODULAR_MOE=0
     export VLLM_MXFP4_PRE_UNPACK_TO_FP8=0
     echo "Using MXFP8 configuration"
 fi
 
 # Common environment variables
-export VLLM_ENABLE_AR_EXT=1
 export VLLM_ENABLE_STATIC_MOE=0
 export VLLM_MXFP4_PRE_UNPACK_WEIGHTS=0
 export VLLM_USE_DEEP_GEMM=0

From 22fb6fc3233fc462feef0dc2f44c09d5877933bd Mon Sep 17 00:00:00 2001
From: yiliu30
Date: Mon, 8 Dec 2025 22:47:31 -0800
Subject: [PATCH 2/5] update example

Signed-off-by: yiliu30
---
 .../quantization/auto_round/deepseek/quantize.py | 10 ++++++----
 .../quantization/auto_round/qwen/quantize.py     |  1 +
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py
index f2b59e84277..fc6d32cf5ac 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py
@@ -36,6 +36,7 @@
         "scheme": "NVFP4",
         "fp_layers": "lm_head,self_attn",
         "iters": 0,
+        "export_format": "llm_compressor"
     },
 }
 
@@ -45,11 +46,12 @@ def get_model_and_tokenizer(model_name):
     fp32_model = AutoModelForCausalLM.from_pretrained(
         model_name,
         device_map="cpu",
-        trust_remote_code=True,
+        trust_remote_code=False,
+        dtype="auto",
     )
     tokenizer = AutoTokenizer.from_pretrained(
         model_name,
-        trust_remote_code=True,
+        trust_remote_code=False,
     )
     return fp32_model, tokenizer
 
@@ -62,13 +64,13 @@ def quant_model(args):
 
     config = topologies_config[args.t]
-    export_format = "auto_round" if args.use_autoround_format else "llm_compressor"
+    export_format = config.get("export_format", "auto_round")
     output_dir = f"{args.output_dir}/quantized_model_{args.t}"
 
     fp32_model, tokenizer = get_model_and_tokenizer(args.model)
     quant_config = AutoRoundConfig(
         tokenizer=tokenizer,
         scheme=config["scheme"],
-        enable_torch_compile=True,
+        # enable_torch_compile=True,
         iters=config["iters"],
         fp_layers=config["fp_layers"],
         export_format=export_format,
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/quantize.py
index 28b7e59b75d..2844eb6dde8 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/quantize.py
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/quantize.py
@@ -40,6 +40,7 @@ def get_model_and_tokenizer(model_name):
         model_name,
         device_map="cpu",
         trust_remote_code=True,
+        dtype="auto",
     )
     tokenizer = AutoTokenizer.from_pretrained(
         model_name,

From 2d88b4b1c2646739fce6ce71c424397f1dcb545b Mon Sep 17 00:00:00 2001
From: yiliu30
Date: Tue, 9 Dec 2025 20:41:02 -0800
Subject: [PATCH 3/5] update

Signed-off-by: yiliu30
---
 .../quantization/auto_round/deepseek/quantize.py     | 5 ++++-
 .../quantization/auto_round/deepseek/run_generate.sh | 4 ++--
 .../quantization/auto_round/deepseek/run_quant.sh    | 1 +
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py
index fc6d32cf5ac..d816ccdbdd1 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py
@@ -36,7 +36,10 @@
         "scheme": "NVFP4",
         "fp_layers": "lm_head,self_attn",
         "iters": 0,
-        "export_format": "llm_compressor"
+        "export_format": "llm_compressor",
+        "low_cpu_mem_usage": True,
+        "low_gpu_mem_usage": True,
+        "reloading": False,
     },
 }
 
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_generate.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_generate.sh
index 7453481cba4..17833651ef4 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_generate.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_generate.sh
@@ -120,6 +120,6 @@ python generate.py \
     --tensor_parallel_size $TP_SIZE \
     --max-tokens 16 \
     --max-num-seqs 4 \
+    --max-model-len 2048 \
     --gpu_memory_utilization 0.75 \
-    --no-enable-prefix-caching \
-    --enable_expert_parallel
\ No newline at end of file
+    --no-enable-prefix-caching
\ No newline at end of file
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_quant.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_quant.sh
index 435132a97f2..e1063815120 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_quant.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_quant.sh
@@ -41,6 +41,7 @@ done
 [ -z "$TARGET" ] && echo "Error: -t is required" && usage
 [ -z "$OUTPUT_DIR" ] && echo "Error: --output_dir is required" && usage
 
+AR_LOG_LEVEL=TRACE \
 python quantize.py \
     --model "$MODEL" \
     -t "$TARGET" \

From eee8763929ff736c0c0f4c4463409f3bddeace49 Mon Sep 17 00:00:00 2001
From: yiliu30
Date: Fri, 12 Dec 2025 04:28:53 -0800
Subject: [PATCH 4/5] fix ct version

Signed-off-by: yiliu30
---
 .../quantization/auto_round/deepseek/quantize.py      | 1 +
 .../quantization/auto_round/deepseek/requirements.txt | 3 ++-
 .../quantization/auto_round/qwen/quantize.py          | 1 +
 3 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py
index d816ccdbdd1..bde0574084b 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py
@@ -79,6 +79,7 @@ def quant_model(args):
         export_format=export_format,
         output_dir=output_dir,
         low_gpu_mem_usage=True,
+        reloading=False,
     )
 
     # quantizer execute
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/requirements.txt b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/requirements.txt
index 80392549f26..2bf47dc6d7e 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/requirements.txt
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/requirements.txt
@@ -1,2 +1,3 @@
 lm-eval==0.4.9.1
-loguru
\ No newline at end of file
+loguru
+compressed-tensors==0.12.2
\ No newline at end of file
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/quantize.py
index 2844eb6dde8..d7606468571 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/quantize.py
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/quantize.py
@@ -68,6 +68,7 @@ def quant_model(args):
         fp_layers=config["fp_layers"],
         export_format=export_format,
         output_dir=output_dir,
+        reloading=False,
     )
 
     # quantizer execute

From 66edd91d93916e34429aee72d35bec450f94cd70 Mon Sep 17 00:00:00 2001
From: yiliu30
Date: Fri, 12 Dec 2025 04:30:41 -0800
Subject: [PATCH 5/5] update

Signed-off-by: yiliu30
---
 .../quantization/auto_round/deepseek/quantize.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py
index bde0574084b..3f5f8e857d2 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py
@@ -73,7 +73,7 @@ def quant_model(args):
     quant_config = AutoRoundConfig(
         tokenizer=tokenizer,
         scheme=config["scheme"],
-        # enable_torch_compile=True,
+        enable_torch_compile=args.enable_torch_compile,
         iters=config["iters"],
         fp_layers=config["fp_layers"],
         export_format=export_format,
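
Taken together, the five patches leave the NVFP4 path in deepseek/quantize.py equivalent to the standalone sketch below. The `AutoRoundConfig` import path, the model-name placeholder, and the final execution step are assumptions for illustration — quantize.py's imports and its "# quantizer execute" body never appear in the hunks above; every argument value mirrors the diffs.

```python
# Minimal sketch of the NVFP4 flow this series converges on.
# Assumed (not shown in the diffs): the AutoRoundConfig import path and the
# final call that executes the quantizer.
from transformers import AutoModelForCausalLM, AutoTokenizer
from neural_compressor.transformers import AutoRoundConfig  # assumed import path

model_name = "/path/to/DeepSeek-model"  # placeholder, as in the README
output_dir = "./qmodels/quantized_model_nvfp4"

# Patch 2: load on CPU with dtype="auto" and trust_remote_code=False.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="cpu",
    trust_remote_code=False,
    dtype="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=False)

# Patches 1-5: the "nvfp4" entry of topologies_config plus the flags passed
# directly to AutoRoundConfig(...) in quant_model().
quant_config = AutoRoundConfig(
    tokenizer=tokenizer,
    scheme="NVFP4",
    enable_torch_compile=False,      # quantize.py takes this from its CLI args
    iters=0,                         # iters=0 -> round-to-nearest, no tuning loop
    fp_layers="lm_head,self_attn",   # keep lm_head and attention in high precision
    export_format="llm_compressor",  # NVFP4 is exported in llm_compressor format
    output_dir=output_dir,
    low_gpu_mem_usage=True,
    reloading=False,
)
# quantize.py's "# quantizer execute" step (elided in the hunks) then applies
# quant_config to the model and writes the checkpoint under output_dir.
```

On the serving side, run_generate.sh and run_evaluation.sh switch the auto-round vLLM extension off for NVFP4 (VLLM_ENABLE_AR_EXT=0, with the MXFP4 unpack/MoE flags zeroed), which is consistent with the llm_compressor export: the checkpoint is consumed through vLLM's compressed-tensors support, pinned by patch 4 to compressed-tensors==0.12.2.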