From 52f51202e9a6249dc38a105216181fad7798a730 Mon Sep 17 00:00:00 2001
From: yiliu30
Date: Mon, 8 Dec 2025 18:29:14 -0800
Subject: [PATCH 1/5] Add ds nvfp4

Signed-off-by: yiliu30
---
 .../auto_round/deepseek/README.md         | 20 ++++++++++++++++---
 .../auto_round/deepseek/quantize.py       |  8 +++++++-
 .../auto_round/deepseek/run_evaluation.sh | 11 ++++++++--
 .../auto_round/deepseek/run_generate.sh   | 12 +++++++++---
 4 files changed, 42 insertions(+), 9 deletions(-)

diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/README.md
index fee88c56a89..2e6b97f0649 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/README.md
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/README.md
@@ -1,4 +1,4 @@
-This example provides an end-to-end workflow to quantize DeepSeek models to MXFP4/MXFP8 and evaluate them using a custom vLLM fork.
+This example provides an end-to-end workflow to quantize DeepSeek models to MXFP4/MXFP8/NVFP4 and evaluate them using a custom vLLM fork.
 
 ## Requirement
 ```bash
@@ -29,13 +29,18 @@ bash run_quant.sh --model $MODEL -t mxfp8 --output_dir ./qmodels
 bash run_quant.sh --model $MODEL -t mxfp4 --output_dir ./qmodels
 ```
 
+- NVFP4
+```bash
+bash run_quant.sh --model $MODEL -t nvfp4 --output_dir ./qmodels
+```
+
 ## Evaluation
 
 ### Prompt Tests
 
 Usage:
 ```bash
-bash ./run_generate.sh -s [mxfp4|mxfp8] -tp [tensor_parallel_size] -m [model_path]
+bash ./run_generate.sh -s [mxfp4|mxfp8|nvfp4] -tp [tensor_parallel_size] -m [model_path]
 ```
 
 - MXFP8
@@ -46,12 +51,16 @@ bash ./run_generate.sh -s mxfp8 -tp 8 -m /path/to/ds_mxfp8
 ```bash
 bash ./run_generate.sh -s mxfp4 -tp 8 -m /path/to/ds_mxfp4
 ```
+- NVFP4
+```bash
+bash ./run_generate.sh -s nvfp4 -tp 8 -m /path/to/ds_nvfp4
+```
 
 ### Evaluation
 
 Usage:
 ```bash
-bash run_evaluation.sh -m [model_path] -s [mxfp4|mxfp8] -t [task_name] -tp [tensor_parallel_size] -b [batch_size]
+bash run_evaluation.sh -m [model_path] -s [mxfp4|mxfp8|nvfp4] -t [task_name] -tp [tensor_parallel_size] -b [batch_size]
 ```
 ```bash
 bash run_evaluation.sh -s mxfp8 -t piqa,hellaswag,mmlu -tp 8 -b 512 -m /path/to/ds_mxfp8
@@ -62,4 +71,9 @@ bash run_evaluation.sh -s mxfp8 -t gsm8k -tp 8 -b 256 -m /path/to/ds_mxfp8
 ```bash
 bash run_evaluation.sh -s mxfp4 -t piqa,hellaswag,mmlu -tp 8 -b 512 -m /path/to/ds_mxfp4
 bash run_evaluation.sh -s mxfp4 -t gsm8k -tp 8 -b 256 -m /path/to/ds_mxfp4
+```
+- NVFP4
+```bash
+bash run_evaluation.sh -s nvfp4 -t piqa,hellaswag,mmlu -tp 8 -b 512 -m /path/to/ds_nvfp4
+bash run_evaluation.sh -s nvfp4 -t gsm8k -tp 8 -b 256 -m /path/to/ds_nvfp4
 ```
\ No newline at end of file
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py
index 9becc2cecf9..f2b59e84277 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py
@@ -32,6 +32,11 @@
         "fp_layers": "lm_head,self_attn",
         "iters": 0,
     },
+    "nvfp4": {
+        "scheme": "NVFP4",
+        "fp_layers": "lm_head,self_attn",
+        "iters": 0,
+    },
 }
 
 
@@ -63,11 +68,12 @@ def quant_model(args):
     quant_config = AutoRoundConfig(
         tokenizer=tokenizer,
         scheme=config["scheme"],
-        enable_torch_compile=args.enable_torch_compile,
+        enable_torch_compile=True,
         iters=config["iters"],
         fp_layers=config["fp_layers"],
         export_format=export_format,
         output_dir=output_dir,
+        low_gpu_mem_usage=True,
     )
 
     # quantizer execute
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh
index 1d805c7872b..8206bacd977 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh
@@ -11,7 +11,7 @@ BATCH_SIZE=512
 
 # Function to display usage
 usage() {
-    echo "Usage: $0 -m [model_path] -s [mxfp4|mxfp8] -t [task_name] -tp [tensor_parallel_size] -b [batch_size]"
+    echo "Usage: $0 -m [model_path] -s [mxfp4|mxfp8|nvfp4] -t [task_name] -tp [tensor_parallel_size] -b [batch_size]"
     echo "  -m: Path to the quantized model (required)"
     echo "  -s: Quantization scheme (mxfp4 or mxfp8, default: mxfp8)"
     echo "  -t: Task name(s) to evaluate (default: piqa,hellaswag,mmlu)"
@@ -80,6 +80,13 @@ if [[ "$SCHEME" == "mxfp4" ]]; then
     VLLM_ENABLE_STATIC_MOE=0
     VLLM_USE_DEEP_GEMM=0
     VLLM_ENABLE_AR_EXT=1
+elif [[ "$SCHEME" == "nvfp4" ]]; then
+    VLLM_AR_MXFP4_MODULAR_MOE=0
+    VLLM_MXFP4_PRE_UNPACK_TO_FP8=0
+    VLLM_MXFP4_PRE_UNPACK_WEIGHTS=0
+    VLLM_ENABLE_STATIC_MOE=0
+    VLLM_USE_DEEP_GEMM=0
+    VLLM_ENABLE_AR_EXT=0
 elif [[ "$SCHEME" == "mxfp8" ]]; then
     VLLM_AR_MXFP4_MODULAR_MOE=0
     VLLM_MXFP4_PRE_UNPACK_TO_FP8=0
@@ -88,7 +95,7 @@ elif [[ "$SCHEME" == "mxfp8" ]]; then
     VLLM_USE_DEEP_GEMM=0
     VLLM_ENABLE_AR_EXT=1
 else
-    echo "Error: Invalid quantization scheme (-s). Must be 'mxfp4' or 'mxfp8'."
+    echo "Error: Invalid quantization scheme (-s). Must be 'mxfp4', 'nvfp4' or 'mxfp8'."
     usage
     exit 1
 fi
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_generate.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_generate.sh
index c9ee73ce182..7453481cba4 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_generate.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_generate.sh
@@ -52,8 +52,8 @@ done
 
 # Validate quantization type
 QUANT_TYPE_UPPER=$(echo "$QUANT_TYPE" | tr '[:lower:]' '[:upper:]')
-if [[ "$QUANT_TYPE_UPPER" != "MXFP4" && "$QUANT_TYPE_UPPER" != "MXFP8" ]]; then
-    echo "Error: Quantization type must be mxfp4 or mxfp8"
+if [[ "$QUANT_TYPE_UPPER" != "MXFP4" && "$QUANT_TYPE_UPPER" != "MXFP8" && "$QUANT_TYPE_UPPER" != "NVFP4" ]]; then
+    echo "Error: Quantization type must be mxfp4, mxfp8 or nvfp4"
     usage
     exit 1
 fi
@@ -81,19 +81,25 @@ echo "  Model: $MODEL_PATH"
 echo "  Tensor Parallelism: $TP_SIZE"
 echo ""
 
 # Set environment variables based on quantization type
 if [[ "$QUANT_TYPE_UPPER" == "MXFP4" ]]; then
+    export VLLM_ENABLE_AR_EXT=1
     export VLLM_AR_MXFP4_MODULAR_MOE=1
     export VLLM_MXFP4_PRE_UNPACK_TO_FP8=1
     echo "Using MXFP4 configuration"
+elif [[ "$QUANT_TYPE_UPPER" == "NVFP4" ]]; then
+    export VLLM_ENABLE_AR_EXT=0
+    export VLLM_AR_MXFP4_MODULAR_MOE=0
+    export VLLM_MXFP4_PRE_UNPACK_TO_FP8=0
+    echo "Using NVFP4 configuration"
 else
+    export VLLM_ENABLE_AR_EXT=1
     export VLLM_AR_MXFP4_MODULAR_MOE=0
     export VLLM_MXFP4_PRE_UNPACK_TO_FP8=0
     echo "Using MXFP8 configuration"
 fi
 
 # Common environment variables
-export VLLM_ENABLE_AR_EXT=1
 export VLLM_ENABLE_STATIC_MOE=0
 export VLLM_MXFP4_PRE_UNPACK_WEIGHTS=0
 export VLLM_USE_DEEP_GEMM=0

From 22fb6fc3233fc462feef0dc2f44c09d5877933bd Mon Sep 17 00:00:00 2001
From: yiliu30
Date: Mon, 8 Dec 2025 22:47:31 -0800
Subject: [PATCH 2/5] update example

Signed-off-by: yiliu30
---
 .../quantization/auto_round/deepseek/quantize.py | 10 ++++++----
 .../quantization/auto_round/qwen/quantize.py     |  1 +
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py
index f2b59e84277..fc6d32cf5ac 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py
@@ -36,6 +36,7 @@
         "scheme": "NVFP4",
         "fp_layers": "lm_head,self_attn",
         "iters": 0,
+        "export_format": "llm_compressor"
     },
 }
 
@@ -45,11 +46,12 @@ def get_model_and_tokenizer(model_name):
     fp32_model = AutoModelForCausalLM.from_pretrained(
         model_name,
         device_map="cpu",
-        trust_remote_code=True,
+        trust_remote_code=False,
+        dtype="auto",
     )
     tokenizer = AutoTokenizer.from_pretrained(
         model_name,
-        trust_remote_code=True,
+        trust_remote_code=False,
     )
     return fp32_model, tokenizer
 
@@ -62,13 +64,13 @@ def quant_model(args):
 
     config = topologies_config[args.t]
-    export_format = "auto_round" if args.use_autoround_format else "llm_compressor"
+    export_format = config.get("export_format", "auto_round")
     output_dir = f"{args.output_dir}/quantized_model_{args.t}"
 
     fp32_model, tokenizer = get_model_and_tokenizer(args.model)
     quant_config = AutoRoundConfig(
         tokenizer=tokenizer,
         scheme=config["scheme"],
-        enable_torch_compile=True,
+        # enable_torch_compile=True,
         iters=config["iters"],
         fp_layers=config["fp_layers"],
         export_format=export_format,
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/quantize.py
index 28b7e59b75d..2844eb6dde8 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/quantize.py
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/quantize.py
@@ -40,6 +40,7 @@ def get_model_and_tokenizer(model_name):
         model_name,
         device_map="cpu",
         trust_remote_code=True,
+        dtype="auto",
     )
     tokenizer = AutoTokenizer.from_pretrained(
         model_name,

From 2d88b4b1c2646739fce6ce71c424397f1dcb545b Mon Sep 17 00:00:00 2001
From: yiliu30
Date: Tue, 9 Dec 2025 20:41:02 -0800
Subject: [PATCH 3/5] update

Signed-off-by: yiliu30
---
 .../quantization/auto_round/deepseek/quantize.py     | 5 ++++-
 .../quantization/auto_round/deepseek/run_generate.sh | 4 ++--
 .../quantization/auto_round/deepseek/run_quant.sh    | 1 +
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py
index fc6d32cf5ac..d816ccdbdd1 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py
@@ -36,7 +36,10 @@
         "scheme": "NVFP4",
         "fp_layers": "lm_head,self_attn",
         "iters": 0,
-        "export_format": "llm_compressor"
+        "export_format": "llm_compressor",
+        "low_cpu_mem_usage": True,
+        "low_gpu_mem_usage": True,
+        "reloading": False,
     },
 }
 
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_generate.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_generate.sh
index 7453481cba4..17833651ef4 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_generate.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_generate.sh
@@ -120,6 +120,6 @@ python generate.py \
     --tensor_parallel_size $TP_SIZE \
     --max-tokens 16 \
     --max-num-seqs 4 \
+    --max-model-len 2048 \
     --gpu_memory_utilization 0.75 \
-    --no-enable-prefix-caching \
-    --enable_expert_parallel
\ No newline at end of file
+    --no-enable-prefix-caching
\ No newline at end of file
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_quant.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_quant.sh
index 435132a97f2..e1063815120 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_quant.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_quant.sh
@@ -41,6 +41,7 @@ done
 [ -z "$TARGET" ] && echo "Error: -t is required" && usage
 [ -z "$OUTPUT_DIR" ] && echo "Error: --output_dir is required" && usage
 
+AR_LOG_LEVEL=TRACE \
 python quantize.py \
     --model "$MODEL" \
     -t "$TARGET" \

From eee8763929ff736c0c0f4c4463409f3bddeace49 Mon Sep 17 00:00:00 2001
From: yiliu30
Date: Fri, 12 Dec 2025 04:28:53 -0800
Subject: [PATCH 4/5] fix ct version

Signed-off-by: yiliu30
---
 .../quantization/auto_round/deepseek/quantize.py      | 1 +
 .../quantization/auto_round/deepseek/requirements.txt | 3 ++-
 .../quantization/auto_round/qwen/quantize.py          | 1 +
 3 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py
index d816ccdbdd1..bde0574084b 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py
@@ -79,6 +79,7 @@ def quant_model(args):
         export_format=export_format,
         output_dir=output_dir,
         low_gpu_mem_usage=True,
+        reloading=False,
     )
 
     # quantizer execute
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/requirements.txt b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/requirements.txt
index 80392549f26..2bf47dc6d7e 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/requirements.txt
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/requirements.txt
@@ -1,2 +1,3 @@
 lm-eval==0.4.9.1
-loguru
\ No newline at end of file
+loguru
+compressed-tensors==0.12.2
\ No newline at end of file
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/quantize.py
index 2844eb6dde8..d7606468571 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/quantize.py
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/quantize.py
@@ -68,6 +68,7 @@ def quant_model(args):
         fp_layers=config["fp_layers"],
         export_format=export_format,
         output_dir=output_dir,
+        reloading=False,
     )
 
     # quantizer execute

From 66edd91d93916e34429aee72d35bec450f94cd70 Mon Sep 17 00:00:00 2001
From: yiliu30
Date: Fri, 12 Dec 2025 04:30:41 -0800
Subject: [PATCH 5/5] update

Signed-off-by: yiliu30
---
 .../quantization/auto_round/deepseek/quantize.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py
index bde0574084b..3f5f8e857d2 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py
@@ -73,7 +73,7 @@ def quant_model(args):
     quant_config = AutoRoundConfig(
         tokenizer=tokenizer,
         scheme=config["scheme"],
-        # enable_torch_compile=True,
+        enable_torch_compile=args.enable_torch_compile,
         iters=config["iters"],
         fp_layers=config["fp_layers"],
         export_format=export_format,
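
Taken together, the five patches leave the NVFP4 path in deepseek/quantize.py equivalent to the standalone sketch below. The `AutoRoundConfig` import path, the model-name placeholder, and the final execution step are assumptions for illustration — quantize.py's imports and its "# quantizer execute" body never appear in the hunks above; every argument value mirrors the diffs.

```python
# Minimal sketch of the NVFP4 flow this series converges on.
# Assumed (not shown in the diffs): the AutoRoundConfig import path and the
# final call that executes the quantizer.
from transformers import AutoModelForCausalLM, AutoTokenizer
from neural_compressor.transformers import AutoRoundConfig  # assumed import path

model_name = "/path/to/DeepSeek-model"  # placeholder, as in the README
output_dir = "./qmodels/quantized_model_nvfp4"

# Patch 2: load on CPU with dtype="auto" and trust_remote_code=False.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="cpu",
    trust_remote_code=False,
    dtype="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=False)

# Patches 1-5: the "nvfp4" entry of topologies_config plus the flags passed
# directly to AutoRoundConfig(...) in quant_model().
quant_config = AutoRoundConfig(
    tokenizer=tokenizer,
    scheme="NVFP4",
    enable_torch_compile=False,      # quantize.py takes this from its CLI args
    iters=0,                         # iters=0 -> round-to-nearest, no tuning loop
    fp_layers="lm_head,self_attn",   # keep lm_head and attention in high precision
    export_format="llm_compressor",  # NVFP4 is exported in llm_compressor format
    output_dir=output_dir,
    low_gpu_mem_usage=True,
    reloading=False,
)
# quantize.py's "# quantizer execute" step (elided in the hunks) then applies
# quant_config to the model and writes the checkpoint under output_dir.
```

On the serving side, run_generate.sh and run_evaluation.sh switch the auto-round vLLM extension off for NVFP4 (VLLM_ENABLE_AR_EXT=0, with the MXFP4 unpack/MoE flags zeroed), which is consistent with the llm_compressor export: the checkpoint is consumed through vLLM's compressed-tensors support, pinned by patch 4 to compressed-tensors==0.12.2.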