@@ -1,4 +1,4 @@
This example provides an end-to-end workflow to quantize DeepSeek models to MXFP4/MXFP8 and evaluate them using a custom vLLM fork.
This example provides an end-to-end workflow to quantize DeepSeek models to MXFP4/MXFP8/NVFP4 and evaluate them using a custom vLLM fork.

## Requirement
```bash
@@ -29,13 +29,18 @@ bash run_quant.sh --model $MODEL -t mxfp8 --output_dir ./qmodels
bash run_quant.sh --model $MODEL -t mxfp4 --output_dir ./qmodels
```

- NVFP4
```bash
bash run_quant.sh --model $MODEL -t nvfp4 --output_dir ./qmodels
```
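
The `-t nvfp4` switch maps to the new `nvfp4` entry in quantize.py's topology table (see the quantize.py diff further down). Below is a rough Python sketch of the configuration that command ends up building; the `AutoRoundConfig` import path is an assumption, since quantize.py's imports are not visible in this diff.

```python
# Sketch only: mirrors the nvfp4 entry added to quantize.py in this PR.
from neural_compressor.torch.quantization import AutoRoundConfig  # assumed import path

quant_config = AutoRoundConfig(
    scheme="NVFP4",                   # target NVFP4 weight format
    iters=0,                          # no AutoRound tuning iterations
    fp_layers="lm_head,self_attn",    # layers kept in higher precision
    export_format="llm_compressor",   # NVFP4 checkpoints use the llm-compressor format
    low_gpu_mem_usage=True,
    reloading=False,
    output_dir="./qmodels/quantized_model_nvfp4",
)
```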

## Evaluation

### Prompt Tests

Usage:
```bash
bash ./run_generate.sh -s [mxfp4|mxfp8] -tp [tensor_parallel_size] -m [model_path]
bash ./run_generate.sh -s [mxfp4|mxfp8|nvfp4] -tp [tensor_parallel_size] -m [model_path]
```

- MXFP8
@@ -46,12 +51,16 @@ bash ./run_generate.sh -s mxfp8 -tp 8 -m /path/to/ds_mxfp8
```bash
bash ./run_generate.sh -s mxfp4 -tp 8 -m /path/to/ds_mxfp4
```
- NVFP4
```bash
bash ./run_generate.sh -s nvfp4 -tp 8 -m /path/to/ds_nvfp4
```

### Evaluation


Usage:
```bash
bash run_evaluation.sh -m [model_path] -s [mxfp4|mxfp8] -t [task_name] -tp [tensor_parallel_size] -b [batch_size]
bash run_evaluation.sh -m [model_path] -s [mxfp4|mxfp8|nvfp4] -t [task_name] -tp [tensor_parallel_size] -b [batch_size]
```
```bash
bash run_evaluation.sh -s mxfp8 -t piqa,hellaswag,mmlu -tp 8 -b 512 -m /path/to/ds_mxfp8
@@ -62,4 +71,9 @@ bash run_evaluation.sh -s mxfp8 -t gsm8k -tp 8 -b 256 -m /path/to/ds_mxfp8
```bash
bash run_evaluation.sh -s mxfp4 -t piqa,hellaswag,mmlu -tp 8 -b 512 -m /path/to/ds_mxfp4
bash run_evaluation.sh -s mxfp4 -t gsm8k -tp 8 -b 256 -m /path/to/ds_mxfp4
```
- NVFP4
```bash
bash run_evaluation.sh -s nvfp4 -t piqa,hellaswag,mmlu -tp 8 -b 512 -m /path/to/ds_nvfp4
bash run_evaluation.sh -s nvfp4 -t gsm8k -tp 8 -b 256 -m /path/to/ds_nvfp4
```
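
For a quick sanity check outside the provided scripts, the quantized checkpoint can also be loaded through vLLM's offline API. This is a minimal sketch, assuming the custom vLLM fork from the Requirement section: the `VLLM_*` flags are fork-specific and copied from the NVFP4 branch of run_generate.sh (use the MXFP4/MXFP8 branches for those schemes), and the model path is a placeholder.

```python
import os

# Fork-specific flags, taken from the NVFP4 branch of run_generate.sh.
# They must be set before vLLM is imported.
for name, value in {
    "VLLM_ENABLE_AR_EXT": "0",
    "VLLM_AR_MXFP4_MODULAR_MOE": "0",
    "VLLM_MXFP4_PRE_UNPACK_TO_FP8": "0",
    "VLLM_ENABLE_STATIC_MOE": "0",
    "VLLM_MXFP4_PRE_UNPACK_WEIGHTS": "0",
    "VLLM_USE_DEEP_GEMM": "0",
}.items():
    os.environ[name] = value

from vllm import LLM, SamplingParams

llm = LLM(
    model="/path/to/ds_nvfp4",        # placeholder path to the quantized model
    tensor_parallel_size=8,
    max_model_len=2048,
    gpu_memory_utilization=0.75,
    enable_prefix_caching=False,
)
outputs = llm.generate(["Briefly introduce DeepSeek."], SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)
```
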
@@ -32,6 +32,15 @@
"fp_layers": "lm_head,self_attn",
"iters": 0,
},
"nvfp4": {
"scheme": "NVFP4",
"fp_layers": "lm_head,self_attn",
"iters": 0,
"export_format": "llm_compressor",
"low_cpu_mem_usage": True,
"low_gpu_mem_usage": True,
"reloading":False,
},
}


@@ -58,7 +67,7 @@ def quant_model(args):
)

config = topologies_config[args.t]
export_format = "auto_round" if args.use_autoround_format else "llm_compressor"
export_format = config.get("export_format", "auto_round")
output_dir = f"{args.output_dir}/quantized_model_{args.t}"
fp32_model, tokenizer = get_model_and_tokenizer(args.model)
quant_config = AutoRoundConfig(
@@ -69,6 +78,7 @@
fp_layers=config["fp_layers"],
export_format=export_format,
output_dir=output_dir,
low_gpu_mem_usage=True,
reloading=False,
)

@@ -1,2 +1,3 @@
lm-eval==0.4.9.1
loguru
compressed-tensors==0.12.2
@@ -11,7 +11,7 @@ BATCH_SIZE=512

# Function to display usage
usage() {
echo "Usage: $0 -m [model_path] -s [mxfp4|mxfp8] -t [task_name] -tp [tensor_parallel_size] -b [batch_size]"
echo "Usage: $0 -m [model_path] -s [mxfp4|mxfp8|nvfp4] -t [task_name] -tp [tensor_parallel_size] -b [batch_size]"
echo " -m: Path to the quantized model (required)"
echo " -s: Quantization scheme (mxfp4 or mxfp8, default: mxfp8)"
echo " -t: Task name(s) to evaluate (default: piqa,hellaswag,mmlu)"
@@ -80,6 +80,13 @@ if [[ "$SCHEME" == "mxfp4" ]]; then
VLLM_ENABLE_STATIC_MOE=0
VLLM_USE_DEEP_GEMM=0
VLLM_ENABLE_AR_EXT=1
elif [[ "$SCHEME" == "nvfp4" ]]; then
VLLM_AR_MXFP4_MODULAR_MOE=0
VLLM_MXFP4_PRE_UNPACK_TO_FP8=0
VLLM_MXFP4_PRE_UNPACK_WEIGHTS=0
VLLM_ENABLE_STATIC_MOE=0
VLLM_USE_DEEP_GEMM=0
VLLM_ENABLE_AR_EXT=0
elif [[ "$SCHEME" == "mxfp8" ]]; then
VLLM_AR_MXFP4_MODULAR_MOE=0
VLLM_MXFP4_PRE_UNPACK_TO_FP8=0
@@ -88,7 +95,7 @@ elif [[ "$SCHEME" == "mxfp8" ]]; then
VLLM_USE_DEEP_GEMM=0
VLLM_ENABLE_AR_EXT=1
else
echo "Error: Invalid quantization scheme (-s). Must be 'mxfp4' or 'mxfp8'."
echo "Error: Invalid quantization scheme (-s). Must be 'mxfp4', 'nvfp4' or 'mxfp8'."
usage
exit 1
fi
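
The visible part of run_evaluation.sh only selects the scheme-specific environment; the collapsed remainder presumably hands the model to lm-eval (pinned to 0.4.9.1 in this PR's requirements). Under that assumption, a rough Python equivalent of the gsm8k run from the README, with a placeholder model path, is sketched below; export the scheme-specific `VLLM_*` variables shown above before running it.

```python
# Sketch under the assumption that run_evaluation.sh drives lm-eval's vLLM backend.
import lm_eval

results = lm_eval.simple_evaluate(
    model="vllm",
    model_args="pretrained=/path/to/ds_nvfp4,tensor_parallel_size=8",  # placeholder path
    tasks=["gsm8k"],
    batch_size=256,
)
print(results["results"])
```
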
@@ -52,8 +52,8 @@ done

# Validate quantization type
QUANT_TYPE_UPPER=$(echo "$QUANT_TYPE" | tr '[:lower:]' '[:upper:]')
if [[ "$QUANT_TYPE_UPPER" != "MXFP4" && "$QUANT_TYPE_UPPER" != "MXFP8" ]]; then
echo "Error: Quantization type must be mxfp4 or mxfp8"
if [[ "$QUANT_TYPE_UPPER" != "MXFP4" && "$QUANT_TYPE_UPPER" != "MXFP8" && "$QUANT_TYPE_UPPER" != "NVFP4" ]]; then
echo "Error: Quantization type must be mxfp4, mxfp8 or nvfp4"
usage
exit 1
fi
@@ -81,19 +81,26 @@ echo " Model: $MODEL_PATH"
echo " Tensor Parallelism: $TP_SIZE"
echo ""

# Set environment variables based on quantization type
if [[ "$QUANT_TYPE_UPPER" == "MXFP4" ]]; then
export VLLM_ENABLE_AR_EXT=1
export VLLM_AR_MXFP4_MODULAR_MOE=1
export VLLM_MXFP4_PRE_UNPACK_TO_FP8=1
echo "Using MXFP4 configuration"
elif [[ "$QUANT_TYPE_UPPER" == "NVFP4" ]]; then
export VLLM_ENABLE_AR_EXT=0
export VLLM_AR_MXFP4_MODULAR_MOE=0
export VLLM_MXFP4_PRE_UNPACK_TO_FP8=0
echo "Using NVFP4 configuration"
else
export VLLM_ENABLE_AR_EXT=1
export VLLM_AR_MXFP4_MODULAR_MOE=0
export VLLM_MXFP4_PRE_UNPACK_TO_FP8=0
echo "Using MXFP8 configuration"
fi

# Common environment variables
export VLLM_ENABLE_STATIC_MOE=0
export VLLM_MXFP4_PRE_UNPACK_WEIGHTS=0
export VLLM_USE_DEEP_GEMM=0
@@ -113,6 +120,6 @@ python generate.py \
--tensor_parallel_size $TP_SIZE \
--max-tokens 16 \
--max-num-seqs 4 \
--max-model-len 2048 \
--gpu_memory_utilization 0.75 \
--no-enable-prefix-caching
@@ -41,6 +41,7 @@ done
[ -z "$TARGET" ] && echo "Error: -t is required" && usage
[ -z "$OUTPUT_DIR" ] && echo "Error: --output_dir is required" && usage

AR_LOG_LEVEL=TRACE \
python quantize.py \
--model "$MODEL" \
-t "$TARGET" \
@@ -40,6 +40,7 @@ def get_model_and_tokenizer(model_name):
model_name,
device_map="cpu",
trust_remote_code=True,
dtype="auto",
)
tokenizer = AutoTokenizer.from_pretrained(
model_name,