diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/README.md
index 85ad84a17f4..3ea7158f9f6 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/README.md
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/README.md
@@ -1,4 +1,4 @@
-This example provides an end-to-end workflow to quantize DeepSeek models to MXFP4/MXFP8 and evaluate them using a custom vLLM fork.
+This example provides an end-to-end workflow to quantize DeepSeek models to MXFP4/MXFP8/NVFP4 and evaluate them using a custom vLLM fork.
 
 ## Requirement
 ```bash
@@ -29,13 +29,18 @@ bash run_quant.sh --model $MODEL -t mxfp8 --output_dir ./qmodels
 bash run_quant.sh --model $MODEL -t mxfp4 --output_dir ./qmodels
 ```
 
+- NVFP4
+```bash
+bash run_quant.sh --model $MODEL -t nvfp4 --output_dir ./qmodels
+```
+
 ## Evaluation
 
 ### Prompt Tests
 
 Usage:
 ```bash
-bash ./run_generate.sh -s [mxfp4|mxfp8] -tp [tensor_parallel_size] -m [model_path]
+bash ./run_generate.sh -s [mxfp4|mxfp8|nvfp4] -tp [tensor_parallel_size] -m [model_path]
 ```
 
 - MXFP8
@@ -46,12 +51,16 @@ bash ./run_generate.sh -s mxfp8 -tp 8 -m /path/to/ds_mxfp8
 ```bash
 bash ./run_generate.sh -s mxfp4 -tp 8 -m /path/to/ds_mxfp4
 ```
+- NVFP4
+```bash
+bash ./run_generate.sh -s nvfp4 -tp 8 -m /path/to/ds_nvfp4
+```
 
 ### Evaluation
 
 Usage:
 ```bash
-bash run_evaluation.sh -m [model_path] -s [mxfp4|mxfp8] -t [task_name] -tp [tensor_parallel_size] -b [batch_size]
+bash run_evaluation.sh -m [model_path] -s [mxfp4|mxfp8|nvfp4] -t [task_name] -tp [tensor_parallel_size] -b [batch_size]
 ```
 ```bash
 bash run_evaluation.sh -s mxfp8 -t piqa,hellaswag,mmlu -tp 8 -b 512 -m /path/to/ds_mxfp8
@@ -62,4 +71,9 @@ bash run_evaluation.sh -s mxfp8 -t gsm8k -tp 8 -b 256 -m /path/to/ds_mxfp8
 ```bash
 bash run_evaluation.sh -s mxfp4 -t piqa,hellaswag,mmlu -tp 8 -b 512 -m /path/to/ds_mxfp4
 bash run_evaluation.sh -s mxfp4 -t gsm8k -tp 8 -b 256 -m /path/to/ds_mxfp4
+```
+- NVFP4
+```bash
+bash run_evaluation.sh -s nvfp4 -t piqa,hellaswag,mmlu -tp 8 -b 512 -m /path/to/ds_nvfp4
+bash run_evaluation.sh -s nvfp4 -t gsm8k -tp 8 -b 256 -m /path/to/ds_nvfp4
 ```
\ No newline at end of file
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py
index 496a9f26e68..3f5f8e857d2 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py
@@ -32,6 +32,15 @@
         "fp_layers": "lm_head,self_attn",
         "iters": 0,
     },
+    "nvfp4": {
+        "scheme": "NVFP4",
+        "fp_layers": "lm_head,self_attn",
+        "iters": 0,
+        "export_format": "llm_compressor",
+        "low_cpu_mem_usage": True,
+        "low_gpu_mem_usage": True,
+        "reloading": False,
+    },
 }
 
 
@@ -58,7 +67,7 @@ def quant_model(args):
     )
 
     config = topologies_config[args.t]
-    export_format = "auto_round" if args.use_autoround_format else "llm_compressor"
+    export_format = config.get("export_format", "auto_round")
     output_dir = f"{args.output_dir}/quantized_model_{args.t}"
     fp32_model, tokenizer = get_model_and_tokenizer(args.model)
     quant_config = AutoRoundConfig(
@@ -69,6 +78,7 @@ def quant_model(args):
         fp_layers=config["fp_layers"],
         export_format=export_format,
         output_dir=output_dir,
+        low_gpu_mem_usage=True,
         reloading=False,
     )
 
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/requirements.txt b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/requirements.txt
index 80392549f26..2bf47dc6d7e 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/requirements.txt
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/requirements.txt
@@ -1,2 +1,3 @@
 lm-eval==0.4.9.1
-loguru
\ No newline at end of file
+loguru
+compressed-tensors==0.12.2
\ No newline at end of file
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh
index d0039e5ecff..412d0546391 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh
@@ -11,7 +11,7 @@ BATCH_SIZE=512
 
 # Function to display usage
 usage() {
-    echo "Usage: $0 -m [model_path] -s [mxfp4|mxfp8] -t [task_name] -tp [tensor_parallel_size] -b [batch_size]"
+    echo "Usage: $0 -m [model_path] -s [mxfp4|mxfp8|nvfp4] -t [task_name] -tp [tensor_parallel_size] -b [batch_size]"
     echo "  -m: Path to the quantized model (required)"
     echo "  -s: Quantization scheme (mxfp4 or mxfp8, default: mxfp8)"
     echo "  -t: Task name(s) to evaluate (default: piqa,hellaswag,mmlu)"
@@ -80,6 +80,13 @@ if [[ "$SCHEME" == "mxfp4" ]]; then
     VLLM_ENABLE_STATIC_MOE=0
     VLLM_USE_DEEP_GEMM=0
     VLLM_ENABLE_AR_EXT=1
+elif [[ "$SCHEME" == "nvfp4" ]]; then
+    VLLM_AR_MXFP4_MODULAR_MOE=0
+    VLLM_MXFP4_PRE_UNPACK_TO_FP8=0
+    VLLM_MXFP4_PRE_UNPACK_WEIGHTS=0
+    VLLM_ENABLE_STATIC_MOE=0
+    VLLM_USE_DEEP_GEMM=0
+    VLLM_ENABLE_AR_EXT=0
 elif [[ "$SCHEME" == "mxfp8" ]]; then
     VLLM_AR_MXFP4_MODULAR_MOE=0
     VLLM_MXFP4_PRE_UNPACK_TO_FP8=0
@@ -88,7 +95,7 @@ elif [[ "$SCHEME" == "mxfp8" ]]; then
     VLLM_USE_DEEP_GEMM=0
     VLLM_ENABLE_AR_EXT=1
 else
-    echo "Error: Invalid quantization scheme (-s). Must be 'mxfp4' or 'mxfp8'."
+    echo "Error: Invalid quantization scheme (-s). Must be 'mxfp4', 'nvfp4' or 'mxfp8'."
     usage
     exit 1
 fi
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_generate.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_generate.sh
index c9ee73ce182..17833651ef4 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_generate.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_generate.sh
@@ -52,8 +52,8 @@ done
 
 # Validate quantization type
 QUANT_TYPE_UPPER=$(echo "$QUANT_TYPE" | tr '[:lower:]' '[:upper:]')
-if [[ "$QUANT_TYPE_UPPER" != "MXFP4" && "$QUANT_TYPE_UPPER" != "MXFP8" ]]; then
-    echo "Error: Quantization type must be mxfp4 or mxfp8"
+if [[ "$QUANT_TYPE_UPPER" != "MXFP4" && "$QUANT_TYPE_UPPER" != "MXFP8" && "$QUANT_TYPE_UPPER" != "NVFP4" ]]; then
+    echo "Error: Quantization type must be mxfp4, mxfp8 or nvfp4"
     usage
     exit 1
 fi
@@ -81,19 +81,26 @@ echo "  Model: $MODEL_PATH"
 echo "  Tensor Parallelism: $TP_SIZE"
 echo ""
 
+# Set environment variables based on quantization type
 # Set environment variables based on quantization type
 if [[ "$QUANT_TYPE_UPPER" == "MXFP4" ]]; then
+    export VLLM_ENABLE_AR_EXT=1
     export VLLM_AR_MXFP4_MODULAR_MOE=1
     export VLLM_MXFP4_PRE_UNPACK_TO_FP8=1
     echo "Using MXFP4 configuration"
+elif [[ "$QUANT_TYPE_UPPER" == "NVFP4" ]]; then
+    export VLLM_ENABLE_AR_EXT=0
+    export VLLM_AR_MXFP4_MODULAR_MOE=0
+    export VLLM_MXFP4_PRE_UNPACK_TO_FP8=0
+    echo "Using NVFP4 configuration"
 else
+    export VLLM_ENABLE_AR_EXT=1
     export VLLM_AR_MXFP4_MODULAR_MOE=0
     export VLLM_MXFP4_PRE_UNPACK_TO_FP8=0
     echo "Using MXFP8 configuration"
 fi
 
 # Common environment variables
-export VLLM_ENABLE_AR_EXT=1
 export VLLM_ENABLE_STATIC_MOE=0
 export VLLM_MXFP4_PRE_UNPACK_WEIGHTS=0
 export VLLM_USE_DEEP_GEMM=0
@@ -113,6 +120,6 @@ python generate.py \
     --tensor_parallel_size $TP_SIZE \
     --max-tokens 16 \
     --max-num-seqs 4 \
+    --max-model-len 2048 \
     --gpu_memory_utilization 0.75 \
-    --no-enable-prefix-caching \
-    --enable_expert_parallel
\ No newline at end of file
+    --no-enable-prefix-caching
\ No newline at end of file
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_quant.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_quant.sh
index 435132a97f2..e1063815120 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_quant.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_quant.sh
@@ -41,6 +41,7 @@ done
 [ -z "$TARGET" ] && echo "Error: -t is required" && usage
 [ -z "$OUTPUT_DIR" ] && echo "Error: --output_dir is required" && usage
 
+AR_LOG_LEVEL=TRACE \
 python quantize.py \
     --model "$MODEL" \
     -t "$TARGET" \
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/quantize.py
index 24b6a762ff2..cacf14fde5b 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/quantize.py
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/quantize.py
@@ -40,6 +40,7 @@ def get_model_and_tokenizer(model_name):
         model_name,
         device_map="cpu",
         trust_remote_code=True,
+        dtype="auto",
     )
     tokenizer = AutoTokenizer.from_pretrained(
         model_name,
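
Below is a minimal, self-contained sketch (not part of the patch) of the scheme-selection logic that the `deepseek/quantize.py` hunks above introduce, showing the `config.get("export_format", "auto_round")` fallback in isolation. Only the `nvfp4` entry mirrors the diff; the `mxfp4`/`mxfp8` entries here are illustrative placeholders, and `pick_export_format` is a hypothetical helper, not a function in the repository.

```python
# Sketch only: mirrors the new per-scheme config lookup in quantize.py.
# Schemes that declare "export_format" (here, nvfp4) use it; all others
# fall back to the default "auto_round" export path.
topologies_config = {
    "mxfp8": {"fp_layers": "lm_head,self_attn", "iters": 0},  # placeholder entry
    "mxfp4": {"fp_layers": "lm_head,self_attn", "iters": 0},  # placeholder entry
    "nvfp4": {
        "scheme": "NVFP4",
        "fp_layers": "lm_head,self_attn",
        "iters": 0,
        "export_format": "llm_compressor",
        "low_cpu_mem_usage": True,
        "low_gpu_mem_usage": True,
        "reloading": False,
    },
}


def pick_export_format(target: str) -> str:
    """Return the export format for a quantization target (-t)."""
    config = topologies_config[target]
    # Same expression as the replaced line in quant_model():
    #     export_format = config.get("export_format", "auto_round")
    return config.get("export_format", "auto_round")


if __name__ == "__main__":
    print(pick_export_format("nvfp4"))  # llm_compressor
    print(pick_export_format("mxfp8"))  # auto_round
```

The `llm_compressor` export writes a compressed-tensors style checkpoint, which is presumably why `compressed-tensors==0.12.2` is pinned in `requirements.txt`; correspondingly, the NVFP4 branches in `run_generate.sh` and `run_evaluation.sh` disable the AutoRound vLLM extension (`VLLM_ENABLE_AR_EXT=0`) instead of enabling it as the MXFP4/MXFP8 paths do.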