From b3dd4542a3ec11237573209c7b18ecdeb91933f8 Mon Sep 17 00:00:00 2001
From: Mitch Lewis
Date: Tue, 13 Jan 2026 18:36:04 -0700
Subject: [PATCH] Revert "[Bugfix] Fix incorrect dispatch for
 CutlassBlockScaledGroupedGemm and DeepGEMM (#20933)"

This reverts commit bcdfb2a3308e14fbf46da6d6d41747f289af9300.
---
 vllm/model_executor/layers/quantization/fp8.py | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 824dfe15ae25..59db3e6c4449 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -488,16 +488,11 @@ def __init__(self, quant_config: Fp8Config):
                 logger.warning_once("Failed to import DeepGemm kernels.")
             elif not self.block_quant:
                 logger.warning_once("Model is not block quantized. Not using "
-                                    "DeepGemm kernels")
+                                    " DeepGemm kernels")
             elif (current_platform.is_cuda()
-                  and current_platform.is_device_capability(90)):
+                  and current_platform.has_device_capability(90)):
                 logger.info_once("Using DeepGemm kernels for Fp8MoEMethod.")
                 self.allow_deep_gemm = True
-            elif (current_platform.is_cuda()
-                  and is_blackwell_deep_gemm_used()):
-                logger.info_once("Using DeepGemm SM100 kernels for "
-                                 "Fp8MoEMethod.")
-                self.allow_deep_gemm = True
             else:
                 logger.warning_once(
                     "DeepGemm not supported on the current platform.")
@@ -505,10 +500,10 @@ def __init__(self, quant_config: Fp8Config):
         # Check for CutlassBlockScaledGroupedGemm support.
         self.allow_cutlass_block_scaled_grouped_gemm = False
         if not self.block_quant:
-            logger.debug_once("Model is not block quantized. Not using "
-                              "CutlassBlockScaledGroupedGemm kernels")
+            logger.warning_once("Model is not block quantized. Not using "
+                                "CutlassBlockScaledGroupedGemm kernels")
         elif (current_platform.is_cuda()
-              and current_platform.is_device_capability(100)):
+              and current_platform.has_device_capability(100)):
             logger.info_once(
                 "Using CutlassBlockScaledGroupedGemm kernels for Fp8MoEMethod."
             )