diff --git a/benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py similarity index 98% rename from benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py rename to benchmarks/kernels/benchmark_cutlass_fp4_moe.py index d6b5820a5b41..7982cbb1422c 100644 --- a/benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py +++ b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py @@ -21,7 +21,6 @@ from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk from vllm.scalar_type import scalar_types from vllm.utils.argparse_utils import FlexibleArgumentParser -from vllm.v1.worker.workspace import init_workspace_manager WEIGHT_SHAPES_MOE = { "nvidia/DeepSeek-R1-FP4": [ @@ -442,10 +441,6 @@ def replay_graph(graph, num_repeats): def main(args): - # Initialize workspace manager (required for CUTLASS MoE kernels) - device = torch.device("cuda:0") - init_workspace_manager(device) - print("Benchmarking models:") for i, model in enumerate(args.models): print(f"[{i}] {model}") diff --git a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py index 626b3b160044..e07d6c776bc0 100644 --- a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py +++ b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py @@ -15,7 +15,6 @@ from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk from vllm.platforms import current_platform from vllm.utils.argparse_utils import FlexibleArgumentParser -from vllm.v1.worker.workspace import init_workspace_manager # Weight shapes for different models: [num_experts, topk, hidden_size, # intermediate_size] @@ -298,10 +297,6 @@ def bench_cuda_graph(graph, num_warmup=5, num_iters=100): def main(args): - # Initialize workspace manager (required for CUTLASS MoE kernels) - device = torch.device("cuda:0") - init_workspace_manager(device) - print("Benchmarking models:") for i, model in enumerate(args.models): print(f"[{i}] {model}") diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py index 4390be8770c1..9b426d8d5f77 100644 --- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py +++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py @@ -14,7 +14,6 @@ fused_topk, ) from vllm.utils.argparse_utils import FlexibleArgumentParser -from vllm.v1.worker.workspace import init_workspace_manager DEFAULT_MODELS = [ "mistralai/Mixtral-8x7B-Instruct-v0.1", @@ -365,10 +364,6 @@ def replay_graph(graph, num_repeats): def main(args): - # Initialize workspace manager (required for CUTLASS MoE kernels) - device = torch.device("cuda:0") - init_workspace_manager(device) - print("Benchmarking models:") for i, model in enumerate(args.models): print(f"[{i}] {model}")