Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
from vllm.scalar_type import scalar_types
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.v1.worker.workspace import init_workspace_manager

WEIGHT_SHAPES_MOE = {
"nvidia/DeepSeek-R1-FP4": [
Expand Down Expand Up @@ -442,10 +441,6 @@ def replay_graph(graph, num_repeats):


def main(args):
# Initialize workspace manager (required for CUTLASS MoE kernels)
device = torch.device("cuda:0")
init_workspace_manager(device)

print("Benchmarking models:")
for i, model in enumerate(args.models):
print(f"[{i}] {model}")
Expand Down
5 changes: 0 additions & 5 deletions benchmarks/kernels/benchmark_cutlass_moe_fp8.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
from vllm.platforms import current_platform
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.v1.worker.workspace import init_workspace_manager

# Weight shapes for different models: [num_experts, topk, hidden_size,
# intermediate_size]
Expand Down Expand Up @@ -298,10 +297,6 @@ def bench_cuda_graph(graph, num_warmup=5, num_iters=100):


def main(args):
# Initialize workspace manager (required for CUTLASS MoE kernels)
device = torch.device("cuda:0")
init_workspace_manager(device)

print("Benchmarking models:")
for i, model in enumerate(args.models):
print(f"[{i}] {model}")
Expand Down
5 changes: 0 additions & 5 deletions benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
fused_topk,
)
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.v1.worker.workspace import init_workspace_manager

DEFAULT_MODELS = [
"mistralai/Mixtral-8x7B-Instruct-v0.1",
Expand Down Expand Up @@ -365,10 +364,6 @@ def replay_graph(graph, num_repeats):


def main(args):
# Initialize workspace manager (required for CUTLASS MoE kernels)
device = torch.device("cuda:0")
init_workspace_manager(device)

print("Benchmarking models:")
for i, model in enumerate(args.models):
print(f"[{i}] {model}")
Expand Down