diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 824dfe15ae25..59db3e6c4449 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -488,16 +488,11 @@ def __init__(self, quant_config: Fp8Config): logger.warning_once("Failed to import DeepGemm kernels.") elif not self.block_quant: logger.warning_once("Model is not block quantized. Not using " - "DeepGemm kernels") + "DeepGemm kernels") elif (current_platform.is_cuda() - and current_platform.is_device_capability(90)): + and current_platform.has_device_capability(90)): logger.info_once("Using DeepGemm kernels for Fp8MoEMethod.") self.allow_deep_gemm = True - elif (current_platform.is_cuda() - and is_blackwell_deep_gemm_used()): - logger.info_once("Using DeepGemm SM100 kernels for " - "Fp8MoEMethod.") - self.allow_deep_gemm = True else: logger.warning_once( "DeepGemm not supported on the current platform.") @@ -505,10 +500,10 @@ def __init__(self, quant_config: Fp8Config): # Check for CutlassBlockScaledGroupedGemm support. self.allow_cutlass_block_scaled_grouped_gemm = False if not self.block_quant: - logger.debug_once("Model is not block quantized. Not using " - "CutlassBlockScaledGroupedGemm kernels") + logger.warning_once("Model is not block quantized. Not using " + "CutlassBlockScaledGroupedGemm kernels") elif (current_platform.is_cuda() - and current_platform.is_device_capability(100)): + and current_platform.has_device_capability(100)): logger.info_once( "Using CutlassBlockScaledGroupedGemm kernels for Fp8MoEMethod." )