From d4597f3dc876439ec67626b5c045d52e70f0854e Mon Sep 17 00:00:00 2001 From: Max Hu Date: Wed, 14 Jan 2026 10:32:53 -0800 Subject: [PATCH 01/34] add implementation Signed-off-by: Max Hu --- .../layers/attention/mm_encoder_attention.py | 26 +++++- vllm/model_executor/models/qwen2_5_vl.py | 12 ++- vllm/model_executor/models/qwen3_vl.py | 12 +++ vllm/platforms/cuda.py | 1 + vllm/v1/attention/ops/vit_attn_wrappers.py | 79 +++++++++++++++++++ 5 files changed, 127 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/attention/mm_encoder_attention.py b/vllm/model_executor/layers/attention/mm_encoder_attention.py index 33e120e7660e..97afa10ce57a 100644 --- a/vllm/model_executor/layers/attention/mm_encoder_attention.py +++ b/vllm/model_executor/layers/attention/mm_encoder_attention.py @@ -13,6 +13,7 @@ from vllm.v1.attention.ops.vit_attn_wrappers import ( vit_fa4_flash_attn_wrapper, vit_flash_attn_wrapper, + vit_flashinfer_wrapper, vit_torch_sdpa_wrapper, ) @@ -34,6 +35,7 @@ def __init__( num_kv_heads: int | None = None, prefix: str = "", multimodal_config: MultiModalConfig | None = None, + workspace_buffer: torch.Tensor | None = None, # Only used for FlashInfer ) -> None: """ Args: @@ -49,10 +51,10 @@ def __init__( self.num_heads = num_heads self.head_size = head_size - self.scale = scale + self.scale = 1.0 / (head_size**0.5) if scale is None else scale self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads self.layer_name = prefix - + self.workspace_buffer = workspace_buffer assert self.num_heads % self.num_kv_heads == 0, ( f"num_heads ({self.num_heads}) is not " f"divisible by num_kv_heads ({self.num_kv_heads})" @@ -185,6 +187,24 @@ def _forward_fa( output = output.reshape(bsz, q_len, -1) return output + def _forward_flashinfer( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + cu_seqlens: torch.Tensor | None = None, + max_seqlen: torch.Tensor | None = None, + ) -> torch.Tensor: + return vit_flashinfer_wrapper( + q=query, + k=key, + v=value, + scale=self.scale, + workspace_buffer=self.workspace_buffer, + cu_seqlens=cu_seqlens, + max_seqlen=max_seqlen, + ) + def _forward_fa4( self, query: torch.Tensor, @@ -241,6 +261,8 @@ def forward_cuda( return self._forward_fa4(query, key, value, cu_seqlens, max_seqlen) elif self.is_flash_attn_backend: return self._forward_fa(query, key, value, cu_seqlens, max_seqlen) + elif self.attn_backend == AttentionBackendEnum.FLASHINFER: + return self._forward_flashinfer(query, key, value, cu_seqlens, max_seqlen) elif self.attn_backend == AttentionBackendEnum.TORCH_SDPA: return self._forward_sdpa(query, key, value, cu_seqlens) else: diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 9cfd12a31903..6e6682df7654 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -310,6 +310,7 @@ def __init__( quant_config: QuantizationConfig | None = None, multimodal_config: MultiModalConfig | None = None, prefix: str = "", + workspace_buffer: torch.Tensor | None = None, # Only used for FlashInfer ) -> None: super().__init__() # Per attention head and per partition values. 
@@ -355,6 +356,7 @@ def __init__( head_size=self.hidden_size_per_attention_head, scale=self.hidden_size_per_attention_head**-0.5, multimodal_config=multimodal_config, + workspace_buffer=workspace_buffer, ) self.apply_rotary_emb = ApplyRotaryEmb(enforce_enable=True) @@ -436,6 +438,7 @@ def __init__( quant_config: QuantizationConfig | None = None, multimodal_config: MultiModalConfig | None = None, prefix: str = "", + workspace_buffer: torch.Tensor | None = None, # Only used for FlashInfer ) -> None: super().__init__() if norm_layer is None: @@ -449,6 +452,7 @@ def __init__( quant_config=quant_config, multimodal_config=multimodal_config, prefix=f"{prefix}.attn", + workspace_buffer=workspace_buffer, ) self.mlp = Qwen2_5_VisionMLP( dim, @@ -635,11 +639,16 @@ def __init__( AttentionBackendEnum.FLASH_ATTN_CUTE, AttentionBackendEnum.TORCH_SDPA, AttentionBackendEnum.ROCM_AITER_FA, + AttentionBackendEnum.FLASHINFER, }: raise RuntimeError( f"Qwen2.5-VL does not support {self.attn_backend} backend now." ) - + workspace_buffer = ( + None + if self.attn_backend != AttentionBackendEnum.FLASHINFER + else torch.zeros(128 * 1024 * 1024, dtype=torch.uint8, device="cuda:0") + ) with set_model_tag("Qwen2_5_VisionBlock", is_encoder=True): self.blocks = nn.ModuleList( [ @@ -652,6 +661,7 @@ def __init__( quant_config=quant_config, multimodal_config=multimodal_config, prefix=f"{prefix}.blocks.{layer_idx}", + workspace_buffer=workspace_buffer, ) for layer_idx in range(depth) ] diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 44b45a08dc1e..286dce118153 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -214,6 +214,7 @@ def __init__( multimodal_config: MultiModalConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + workspace_buffer: torch.Tensor | None = None, ) -> None: super().__init__() if norm_layer is None: @@ -227,6 +228,7 @@ def __init__( quant_config=quant_config, multimodal_config=multimodal_config, prefix=f"{prefix}.attn", + workspace_buffer=workspace_buffer, ) self.mlp = Qwen3_VisionMLP( dim, @@ -399,10 +401,18 @@ def __init__( AttentionBackendEnum.FLASH_ATTN_CUTE, AttentionBackendEnum.TORCH_SDPA, AttentionBackendEnum.ROCM_AITER_FA, + AttentionBackendEnum.FLASHINFER, }: raise RuntimeError( f"Qwen3-VL does not support {self.attn_backend} backend now." 
) + + workspace_buffer = ( + None + if self.attn_backend != AttentionBackendEnum.FLASHINFER + else torch.zeros(128 * 1024 * 1024, dtype=torch.uint8, device="cuda:0") + ) + self.blocks = nn.ModuleList( [ Qwen3_VisionBlock( @@ -414,6 +424,7 @@ def __init__( quant_config=quant_config, multimodal_config=multimodal_config, prefix=f"{prefix}.blocks.{layer_idx}", + workspace_buffer=workspace_buffer, ) for layer_idx in range(vision_config.depth) ] @@ -540,6 +551,7 @@ def compute_attn_mask_seqlen( if ( self.attn_backend == AttentionBackendEnum.FLASH_ATTN or self.attn_backend == AttentionBackendEnum.FLASH_ATTN_CUTE + or self.attn_backend == AttentionBackendEnum.FLASHINFER or self.attn_backend == AttentionBackendEnum.ROCM_AITER_FA ): max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max() diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 8f315881df45..020e948a4a40 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -363,6 +363,7 @@ def get_supported_vit_attn_backends(cls) -> list["AttentionBackendEnum"]: AttentionBackendEnum.TORCH_SDPA, AttentionBackendEnum.FLASH_ATTN_CUTE, AttentionBackendEnum.FLASH_ATTN, + AttentionBackendEnum.FLASHINFER, ] @classmethod diff --git a/vllm/v1/attention/ops/vit_attn_wrappers.py b/vllm/v1/attention/ops/vit_attn_wrappers.py index 8fa0a442c3a5..ef2ca7a6cf5c 100644 --- a/vllm/v1/attention/ops/vit_attn_wrappers.py +++ b/vllm/v1/attention/ops/vit_attn_wrappers.py @@ -262,3 +262,82 @@ def vit_torch_sdpa_wrapper( cu_seqlens: torch.Tensor | None = None, ) -> torch.Tensor: return torch.ops.vllm.torch_sdpa_wrapper(q, k, v, scale, cu_seqlens) + + +def flashinfer_wrapper( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + scale: float, + workspace_buffer: torch.Tensor, + cu_seqlens: torch.Tensor | None = None, + max_seqlen: torch.Tensor | None = None, +) -> torch.Tensor: + from vllm.v1.attention.backends.flashinfer import cudnn_batch_prefill_with_kv_cache + + is_reshaped = q.dim() == 4 + batch_size = q.shape[0] + if is_reshaped: + q, k, v = (einops.rearrange(x, "b s ... 
-> (b s) ...") for x in [q, k, v]) + + q_len = q.size(0) + if cu_seqlens is None: + cu_seqlens = torch.full((1, 1, 1, 1), q_len, dtype=torch.int32, device=q.device) + elif cu_seqlens.dim() == 1: + cu_seqlens = cu_seqlens[:, None, None, None] + max_seqlen = q_len if max_seqlen is None else max_seqlen.item() + + output = cudnn_batch_prefill_with_kv_cache( + q, + k, + v, + scale, + workspace_buffer, + max_token_per_sequence=max_seqlen, + max_sequence_kv=max_seqlen, + actual_seq_lens_q=cu_seqlens, + actual_seq_lens_kv=cu_seqlens, + causal=False, + return_lse=False, + ) + if isinstance(output, tuple): + for i in output: + if isinstance(i, torch.Tensor): + output = i + if is_reshaped: + output = einops.rearrange(output, "(b s) h d -> b s h d", b=batch_size) + + return output + + +def vit_flashinfer_wrapper_fake( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + scale: float, + workspace_buffer: torch.Tensor, + cu_seqlens: torch.Tensor | None = None, + max_seqlen: torch.Tensor | None = None, +) -> torch.Tensor: + return torch.empty_like(q) + + +direct_register_custom_op( + op_name="flashinfer_wrapper", + op_func=flashinfer_wrapper, + fake_impl=vit_flashinfer_wrapper_fake, +) + + +def vit_flashinfer_wrapper( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + scale: float, + workspace_buffer: torch.Tensor, + cu_seqlens: torch.Tensor | None = None, + max_seqlen: torch.Tensor | None = None, +) -> torch.Tensor: + return torch.ops.vllm.flashinfer_wrapper( + q, k, v, scale, workspace_buffer, cu_seqlens, max_seqlen + ) From 7cbf291a47d9e47d21cc6c5a4648558f817f4dc0 Mon Sep 17 00:00:00 2001 From: Max Hu Date: Mon, 19 Jan 2026 11:35:32 -0800 Subject: [PATCH 02/34] add impl Signed-off-by: Max Hu --- .../layers/attention/mm_encoder_attention.py | 10 +++++++++- vllm/model_executor/models/qwen2_5_vl.py | 2 ++ vllm/model_executor/models/qwen3_vl.py | 10 +++++++++- vllm/v1/attention/ops/vit_attn_wrappers.py | 19 ++++++++++++------- 4 files changed, 32 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/layers/attention/mm_encoder_attention.py b/vllm/model_executor/layers/attention/mm_encoder_attention.py index 97afa10ce57a..c2df61fa5e4d 100644 --- a/vllm/model_executor/layers/attention/mm_encoder_attention.py +++ b/vllm/model_executor/layers/attention/mm_encoder_attention.py @@ -194,6 +194,7 @@ def _forward_flashinfer( value: torch.Tensor, cu_seqlens: torch.Tensor | None = None, max_seqlen: torch.Tensor | None = None, + act_seq_lens: torch.Tensor | None = None, ) -> torch.Tensor: return vit_flashinfer_wrapper( q=query, @@ -203,6 +204,7 @@ def _forward_flashinfer( workspace_buffer=self.workspace_buffer, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen, + act_seq_lens=act_seq_lens, ) def _forward_fa4( @@ -246,6 +248,7 @@ def forward_native( value: torch.Tensor, cu_seqlens: torch.Tensor | None = None, max_seqlen: torch.Tensor | None = None, # Only used for Flash Attention + act_seq_lens: torch.Tensor | None = None, ) -> torch.Tensor: return self._forward_sdpa(query, key, value, cu_seqlens) @@ -256,13 +259,16 @@ def forward_cuda( value: torch.Tensor, cu_seqlens: torch.Tensor | None = None, max_seqlen: torch.Tensor | None = None, # Only used for Flash Attention + act_seq_lens: torch.Tensor | None = None, ) -> torch.Tensor: if self.is_fa4_backend: return self._forward_fa4(query, key, value, cu_seqlens, max_seqlen) elif self.is_flash_attn_backend: return self._forward_fa(query, key, value, cu_seqlens, max_seqlen) elif self.attn_backend == AttentionBackendEnum.FLASHINFER: - return 
self._forward_flashinfer(query, key, value, cu_seqlens, max_seqlen) + return self._forward_flashinfer( + query, key, value, cu_seqlens, max_seqlen, act_seq_lens + ) elif self.attn_backend == AttentionBackendEnum.TORCH_SDPA: return self._forward_sdpa(query, key, value, cu_seqlens) else: @@ -278,6 +284,7 @@ def forward_cpu( value: torch.Tensor, cu_seqlens: torch.Tensor | None = None, max_seqlen: torch.Tensor | None = None, # Only used for Flash Attention + act_seq_lens: torch.Tensor | None = None, ) -> torch.Tensor: return self._forward_sdpa(query, key, value, cu_seqlens) @@ -288,6 +295,7 @@ def forward_xpu( value: torch.Tensor, cu_seqlens: torch.Tensor | None = None, max_seqlen: torch.Tensor | None = None, # Only used for Flash Attention + act_seq_lens: torch.Tensor | None = None, ) -> torch.Tensor: assert self.is_flash_attn_backend, ( "XPU only supports FLASH_ATTN for vision attention." diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 6e6682df7654..42e30748e86f 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -368,6 +368,7 @@ def forward( rotary_pos_emb_cos: torch.Tensor, rotary_pos_emb_sin: torch.Tensor, max_seqlen: torch.Tensor, # Only used for Flash Attention + act_seq_lens: torch.Tensor | None = None, ) -> torch.Tensor: # [s, b, c] --> [s, b, head * 3 * head_dim] x, _ = self.qkv(x) @@ -408,6 +409,7 @@ def forward( value=v, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen, + act_seq_lens=act_seq_lens, ) context_layer = einops.rearrange( diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 286dce118153..fafb40bdd8fb 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -247,6 +247,7 @@ def forward( rotary_pos_emb_cos: torch.Tensor, rotary_pos_emb_sin: torch.Tensor, max_seqlen: torch.Tensor, # Only used for Flash Attention + act_seq_lens: torch.Tensor | None = None, ) -> torch.Tensor: x = x + self.attn( self.norm1(x), @@ -254,6 +255,7 @@ def forward( rotary_pos_emb_cos=rotary_pos_emb_cos, rotary_pos_emb_sin=rotary_pos_emb_sin, max_seqlen=max_seqlen, + act_seq_lens=act_seq_lens, ) x = x + self.mlp(self.norm2(x)) @@ -410,7 +412,7 @@ def __init__( workspace_buffer = ( None if self.attn_backend != AttentionBackendEnum.FLASHINFER - else torch.zeros(128 * 1024 * 1024, dtype=torch.uint8, device="cuda:0") + else torch.zeros(128 * 1024 * 1024, dtype=torch.uint8, device=self.device) ) self.blocks = nn.ModuleList( @@ -580,10 +582,15 @@ def forward( axis=0, dtype=np.int32 ) cu_seqlens = np.concatenate([np.zeros(1, dtype=np.int32), cu_seqlens]) + act_seq_lens = torch.from_numpy(cu_seqlens[1:] - cu_seqlens[:-1]) + act_seq_lens = act_seq_lens.to(self.device, non_blocking=True) + cu_seqlens = torch.from_numpy(cu_seqlens) hidden_states = hidden_states.unsqueeze(1) max_seqlen = self.compute_attn_mask_seqlen(cu_seqlens) + if self.attn_backend == AttentionBackendEnum.FLASHINFER: + cu_seqlens = cu_seqlens * self.hidden_size cu_seqlens = cu_seqlens.to(self.device, non_blocking=True) deepstack_feature_lists = [] @@ -594,6 +601,7 @@ def forward( rotary_pos_emb_cos=rotary_pos_emb_cos, rotary_pos_emb_sin=rotary_pos_emb_sin, max_seqlen=max_seqlen, + act_seq_lens=act_seq_lens, ) if layer_num in self.deepstack_visual_indexes: deepstack_merger_idx = self.deepstack_visual_indexes.index(layer_num) diff --git a/vllm/v1/attention/ops/vit_attn_wrappers.py b/vllm/v1/attention/ops/vit_attn_wrappers.py index 
ef2ca7a6cf5c..134e8df4e05c 100644 --- a/vllm/v1/attention/ops/vit_attn_wrappers.py +++ b/vllm/v1/attention/ops/vit_attn_wrappers.py @@ -272,6 +272,7 @@ def flashinfer_wrapper( workspace_buffer: torch.Tensor, cu_seqlens: torch.Tensor | None = None, max_seqlen: torch.Tensor | None = None, + act_seq_lens: torch.Tensor | None = None, ) -> torch.Tensor: from vllm.v1.attention.backends.flashinfer import cudnn_batch_prefill_with_kv_cache @@ -281,10 +282,8 @@ def flashinfer_wrapper( q, k, v = (einops.rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]) q_len = q.size(0) - if cu_seqlens is None: - cu_seqlens = torch.full((1, 1, 1, 1), q_len, dtype=torch.int32, device=q.device) - elif cu_seqlens.dim() == 1: - cu_seqlens = cu_seqlens[:, None, None, None] + batch_offsets = cu_seqlens.view(-1, 1, 1, 1) + actual_seq_lens = act_seq_lens.view(-1, 1, 1, 1) max_seqlen = q_len if max_seqlen is None else max_seqlen.item() output = cudnn_batch_prefill_with_kv_cache( @@ -295,10 +294,14 @@ def flashinfer_wrapper( workspace_buffer, max_token_per_sequence=max_seqlen, max_sequence_kv=max_seqlen, - actual_seq_lens_q=cu_seqlens, - actual_seq_lens_kv=cu_seqlens, + actual_seq_lens_q=actual_seq_lens, + actual_seq_lens_kv=actual_seq_lens, causal=False, return_lse=False, + batch_offsets_q=batch_offsets, + batch_offsets_o=batch_offsets, + batch_offsets_k=batch_offsets, + batch_offsets_v=batch_offsets, ) if isinstance(output, tuple): for i in output: @@ -318,6 +321,7 @@ def vit_flashinfer_wrapper_fake( workspace_buffer: torch.Tensor, cu_seqlens: torch.Tensor | None = None, max_seqlen: torch.Tensor | None = None, + act_seq_lens: torch.Tensor | None = None, ) -> torch.Tensor: return torch.empty_like(q) @@ -337,7 +341,8 @@ def vit_flashinfer_wrapper( workspace_buffer: torch.Tensor, cu_seqlens: torch.Tensor | None = None, max_seqlen: torch.Tensor | None = None, + act_seq_lens: torch.Tensor | None = None, ) -> torch.Tensor: return torch.ops.vllm.flashinfer_wrapper( - q, k, v, scale, workspace_buffer, cu_seqlens, max_seqlen + q, k, v, scale, workspace_buffer, cu_seqlens, max_seqlen, act_seq_lens ) From 871329128e206439935b0b223d4fc41814af1834 Mon Sep 17 00:00:00 2001 From: Max Hu Date: Tue, 20 Jan 2026 14:23:40 -0800 Subject: [PATCH 03/34] add flashinfer --- vllm/v1/attention/backends/flashinfer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index a6e4776060f0..f9b594c943ae 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -15,6 +15,7 @@ ) from flashinfer.decode import _get_range_buf, trtllm_batch_decode_with_kv_cache from flashinfer.prefill import trtllm_batch_context_with_kv_cache +from flashinfer.cudnn.prefill import cudnn_batch_prefill_with_kv_cache from flashinfer.utils import FP4Tensor from typing_extensions import override From f9362fb8cbf58f2a2f0d3e5948b6df26e574d892 Mon Sep 17 00:00:00 2001 From: Max Hu Date: Wed, 21 Jan 2026 10:26:33 -0800 Subject: [PATCH 04/34] fix tp Signed-off-by: Max Hu --- vllm/model_executor/models/qwen2_5_vl.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 42e30748e86f..b18b7663529c 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -402,7 +402,8 @@ def forward( q, k = qk_rotated.unbind(dim=0) else: q, k, v = qkv.unbind(dim=2) - + if self.attn.attn_backend == 
AttentionBackendEnum.FLASHINFER: + cu_seqlens = cu_seqlens // self.tp_size context_layer = self.attn( query=q, key=k, From d48087fd1242ba66dadfebeacfc6427e14dbe60b Mon Sep 17 00:00:00 2001 From: Anerudhan Gopal Date: Tue, 20 Jan 2026 13:51:41 -0800 Subject: [PATCH 05/34] Temporary change for ViT --- .../layers/attention/mm_encoder_attention.py | 2 +- vllm/model_executor/models/qwen3_vl.py | 2 +- vllm/v1/attention/ops/vit_attn_wrappers.py | 73 +++++++++++++++++-- 3 files changed, 68 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/layers/attention/mm_encoder_attention.py b/vllm/model_executor/layers/attention/mm_encoder_attention.py index c2df61fa5e4d..7f9773e5a632 100644 --- a/vllm/model_executor/layers/attention/mm_encoder_attention.py +++ b/vllm/model_executor/layers/attention/mm_encoder_attention.py @@ -203,7 +203,7 @@ def _forward_flashinfer( scale=self.scale, workspace_buffer=self.workspace_buffer, cu_seqlens=cu_seqlens, - max_seqlen=max_seqlen, + max_seqlen=torch.tensor(32768), #Hard code for now, This will remove the dependency on max_seqlen. Set it to arbitrarily large (128K) act_seq_lens=act_seq_lens, ) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index fafb40bdd8fb..5d34b386eec1 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -590,7 +590,7 @@ def forward( hidden_states = hidden_states.unsqueeze(1) max_seqlen = self.compute_attn_mask_seqlen(cu_seqlens) if self.attn_backend == AttentionBackendEnum.FLASHINFER: - cu_seqlens = cu_seqlens * self.hidden_size + cu_seqlens = cu_seqlens * self.num_heads * self.hidden_size cu_seqlens = cu_seqlens.to(self.device, non_blocking=True) deepstack_feature_lists = [] diff --git a/vllm/v1/attention/ops/vit_attn_wrappers.py b/vllm/v1/attention/ops/vit_attn_wrappers.py index 134e8df4e05c..7f4ee6311ee9 100644 --- a/vllm/v1/attention/ops/vit_attn_wrappers.py +++ b/vllm/v1/attention/ops/vit_attn_wrappers.py @@ -264,6 +264,59 @@ def vit_torch_sdpa_wrapper( return torch.ops.vllm.torch_sdpa_wrapper(q, k, v, scale, cu_seqlens) +# Batch buckets for cuDNN graph caching - graphs are cached per bucket size +# This avoids creating a new graph for each unique batch size at runtime +BATCH_BUCKETS = [8, 16, 32, 64] + + +def _pad_to_batch_bucket( + batch_size: int, + actual_seq_lens: torch.Tensor, + batch_offsets: torch.Tensor, +) -> tuple[int, torch.Tensor, torch.Tensor]: + """ + Pad actual_seq_lens and batch_offsets to match the nearest batch bucket. 
+ + This follows the same padding strategy as cuDNN frontend's SDPA caching: + https://github.com/NVIDIA/cudnn-frontend/blob/main/test/python/test_sdpa_with_caching.py + + Args: + batch_size: Actual batch size + actual_seq_lens: Tensor of shape (batch_size, 1, 1, 1) with sequence lengths + batch_offsets: Tensor of shape (batch_size + 1, 1, 1, 1) with cumulative offsets + + Returns: + Tuple of (padded_batch_size, padded_actual_seq_lens, padded_batch_offsets) + """ + # Find the nearest bucket size >= actual batch size + batch_size_padded = next( + (b for b in BATCH_BUCKETS if b >= batch_size), BATCH_BUCKETS[-1] + ) + + if batch_size_padded == batch_size: + return batch_size, actual_seq_lens, batch_offsets + + # Pad actual_seq_lens with zeros + zeros_seq_lens = torch.zeros( + (batch_size_padded - batch_size, 1, 1, 1), + dtype=actual_seq_lens.dtype, + device=actual_seq_lens.device, + ) + actual_seq_lens_padded = torch.cat([actual_seq_lens, zeros_seq_lens], dim=0) + + # Pad batch_offsets with zeros + # Note: batch_offsets has shape (batch_size + 1, 1, 1, 1), so we need to pad + # (batch_size_padded + 1) - (batch_size + 1) = batch_size_padded - batch_size + zeros_offsets = torch.zeros( + (batch_size_padded - batch_size, 1, 1, 1), + dtype=batch_offsets.dtype, + device=batch_offsets.device, + ) + batch_offsets_padded = torch.cat([batch_offsets, zeros_offsets], dim=0) + + return batch_size_padded, actual_seq_lens_padded, batch_offsets_padded + + def flashinfer_wrapper( q: torch.Tensor, k: torch.Tensor, @@ -277,7 +330,8 @@ def flashinfer_wrapper( from vllm.v1.attention.backends.flashinfer import cudnn_batch_prefill_with_kv_cache is_reshaped = q.dim() == 4 - batch_size = q.shape[0] + batch_size = q.shape[0] if is_reshaped else (cu_seqlens.shape[0] - 1) + if is_reshaped: q, k, v = (einops.rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]) @@ -286,6 +340,11 @@ def flashinfer_wrapper( actual_seq_lens = act_seq_lens.view(-1, 1, 1, 1) max_seqlen = q_len if max_seqlen is None else max_seqlen.item() + # Pad batch_offsets and actual_seq_lens to nearest batch bucket + # This enables cuDNN graph caching for better performance + padded_batch_size, actual_seq_lens_padded, batch_offsets_padded = \ + _pad_to_batch_bucket(batch_size, actual_seq_lens, batch_offsets) + output = cudnn_batch_prefill_with_kv_cache( q, k, @@ -294,14 +353,14 @@ def flashinfer_wrapper( workspace_buffer, max_token_per_sequence=max_seqlen, max_sequence_kv=max_seqlen, - actual_seq_lens_q=actual_seq_lens, - actual_seq_lens_kv=actual_seq_lens, + actual_seq_lens_q=actual_seq_lens_padded, + actual_seq_lens_kv=actual_seq_lens_padded, causal=False, return_lse=False, - batch_offsets_q=batch_offsets, - batch_offsets_o=batch_offsets, - batch_offsets_k=batch_offsets, - batch_offsets_v=batch_offsets, + batch_offsets_q=batch_offsets_padded, + batch_offsets_o=batch_offsets_padded, + batch_offsets_k=batch_offsets_padded, + batch_offsets_v=batch_offsets_padded, ) if isinstance(output, tuple): for i in output: From 71eeda271dd1e0c8da4f42b6f7994196076c1fa4 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 21 Jan 2026 23:25:53 -0500 Subject: [PATCH 06/34] fix workspace_buffer device. 
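
The cuDNN/FlashInfer workspace for the vision encoder was allocated on a
hard-coded cuda:0, which puts the buffer on the wrong GPU whenever the
encoder itself does not live on device 0. Allocate it on the encoder's own
device instead (one-line diff below). Minimal sketch of the intended
allocation, for illustration only:

    import torch

    def make_vit_workspace(device: torch.device | str,
                           size_bytes: int = 128 * 1024 * 1024) -> torch.Tensor:
        # One workspace per vision encoder, allocated on that encoder's own
        # device rather than on a hard-coded cuda:0.
        return torch.zeros(size_bytes, dtype=torch.uint8, device=device)

    workspace = make_vit_workspace("cuda" if torch.cuda.is_available() else "cpu")
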
--- vllm/model_executor/models/qwen2_5_vl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index b18b7663529c..8e1848c1e43f 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -650,7 +650,7 @@ def __init__( workspace_buffer = ( None if self.attn_backend != AttentionBackendEnum.FLASHINFER - else torch.zeros(128 * 1024 * 1024, dtype=torch.uint8, device="cuda:0") + else torch.zeros(128 * 1024 * 1024, dtype=torch.uint8, device=self.device) ) with set_model_tag("Qwen2_5_VisionBlock", is_encoder=True): self.blocks = nn.ModuleList( From 392b3ac9d7ebc4ede89f70cd005858fced16bd24 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 21 Jan 2026 23:27:22 -0500 Subject: [PATCH 07/34] change max_seqlen to 128k. --- vllm/model_executor/layers/attention/mm_encoder_attention.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/attention/mm_encoder_attention.py b/vllm/model_executor/layers/attention/mm_encoder_attention.py index 7f9773e5a632..278ea130a6d5 100644 --- a/vllm/model_executor/layers/attention/mm_encoder_attention.py +++ b/vllm/model_executor/layers/attention/mm_encoder_attention.py @@ -203,7 +203,7 @@ def _forward_flashinfer( scale=self.scale, workspace_buffer=self.workspace_buffer, cu_seqlens=cu_seqlens, - max_seqlen=torch.tensor(32768), #Hard code for now, This will remove the dependency on max_seqlen. Set it to arbitrarily large (128K) + max_seqlen=torch.tensor(128 * 1024), #Hard code for now, This will remove the dependency on max_seqlen. Set it to arbitrarily large (128K) act_seq_lens=act_seq_lens, ) From 772a17b7c8b026b5677c4be825453c89a1bc8e24 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 22 Jan 2026 13:21:29 -0500 Subject: [PATCH 08/34] remove duplicate multiplier. 
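
cu_seqlens is accumulated in patch tokens and then scaled into offsets for
the packed activation buffer handed to cuDNN. The per-token stride is
hidden_size, which already equals num_heads * head_dim, so the extra
num_heads factor introduced in PATCH 05 over-scaled the offsets by a factor
of num_heads. Worked example with made-up sizes, for illustration only:

    import numpy as np

    # Two images of 16 and 24 patch tokens; hypothetical vision encoder with
    # num_heads = 16, head_dim = 80, i.e. hidden_size = 16 * 80 = 1280.
    seq_lens = np.array([16, 24], dtype=np.int64)
    cu_seqlens = np.concatenate([[0], np.cumsum(seq_lens)])
    num_heads, head_dim = 16, 80
    hidden_size = num_heads * head_dim

    offsets = cu_seqlens * hidden_size  # [0, 20480, 51200]
    # Multiplying by num_heads as well would give [0, 327680, 819200]:
    # hidden_size already folds in num_heads * head_dim, so those offsets are
    # num_heads times too large.
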
--- vllm/model_executor/models/qwen3_vl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 5d34b386eec1..fafb40bdd8fb 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -590,7 +590,7 @@ def forward( hidden_states = hidden_states.unsqueeze(1) max_seqlen = self.compute_attn_mask_seqlen(cu_seqlens) if self.attn_backend == AttentionBackendEnum.FLASHINFER: - cu_seqlens = cu_seqlens * self.num_heads * self.hidden_size + cu_seqlens = cu_seqlens * self.hidden_size cu_seqlens = cu_seqlens.to(self.device, non_blocking=True) deepstack_feature_lists = [] From c38e8c4621777aea7f21b0356daa64ed7cfa3506 Mon Sep 17 00:00:00 2001 From: Max Hu Date: Fri, 23 Jan 2026 09:19:11 -0800 Subject: [PATCH 09/34] fix accuracy and refactor --- .../layers/attention/mm_encoder_attention.py | 16 +-- vllm/model_executor/models/qwen2_5_vl.py | 7 +- vllm/model_executor/models/qwen3_vl.py | 83 ++++++++++++-- vllm/v1/attention/ops/vit_attn_wrappers.py | 103 ++++-------------- 4 files changed, 107 insertions(+), 102 deletions(-) diff --git a/vllm/model_executor/layers/attention/mm_encoder_attention.py b/vllm/model_executor/layers/attention/mm_encoder_attention.py index 278ea130a6d5..60cc9002e3c3 100644 --- a/vllm/model_executor/layers/attention/mm_encoder_attention.py +++ b/vllm/model_executor/layers/attention/mm_encoder_attention.py @@ -194,7 +194,7 @@ def _forward_flashinfer( value: torch.Tensor, cu_seqlens: torch.Tensor | None = None, max_seqlen: torch.Tensor | None = None, - act_seq_lens: torch.Tensor | None = None, + sequence_lengths: torch.Tensor | None = None, # Only used for FlashInfer CuDNN backend ) -> torch.Tensor: return vit_flashinfer_wrapper( q=query, @@ -203,8 +203,8 @@ def _forward_flashinfer( scale=self.scale, workspace_buffer=self.workspace_buffer, cu_seqlens=cu_seqlens, - max_seqlen=torch.tensor(128 * 1024), #Hard code for now, This will remove the dependency on max_seqlen. 
Set it to arbitrarily large (128K) - act_seq_lens=act_seq_lens, + max_seqlen=max_seqlen, + sequence_lengths=sequence_lengths, ) def _forward_fa4( @@ -248,7 +248,7 @@ def forward_native( value: torch.Tensor, cu_seqlens: torch.Tensor | None = None, max_seqlen: torch.Tensor | None = None, # Only used for Flash Attention - act_seq_lens: torch.Tensor | None = None, + sequence_lengths: torch.Tensor | None = None, # Only used for FlashInfer CuDNN backend ) -> torch.Tensor: return self._forward_sdpa(query, key, value, cu_seqlens) @@ -259,7 +259,7 @@ def forward_cuda( value: torch.Tensor, cu_seqlens: torch.Tensor | None = None, max_seqlen: torch.Tensor | None = None, # Only used for Flash Attention - act_seq_lens: torch.Tensor | None = None, + sequence_lengths: torch.Tensor | None = None, # Only used for FlashInfer CuDNN backend ) -> torch.Tensor: if self.is_fa4_backend: return self._forward_fa4(query, key, value, cu_seqlens, max_seqlen) @@ -267,7 +267,7 @@ def forward_cuda( return self._forward_fa(query, key, value, cu_seqlens, max_seqlen) elif self.attn_backend == AttentionBackendEnum.FLASHINFER: return self._forward_flashinfer( - query, key, value, cu_seqlens, max_seqlen, act_seq_lens + query, key, value, cu_seqlens, max_seqlen, sequence_lengths ) elif self.attn_backend == AttentionBackendEnum.TORCH_SDPA: return self._forward_sdpa(query, key, value, cu_seqlens) @@ -284,7 +284,7 @@ def forward_cpu( value: torch.Tensor, cu_seqlens: torch.Tensor | None = None, max_seqlen: torch.Tensor | None = None, # Only used for Flash Attention - act_seq_lens: torch.Tensor | None = None, + sequence_lengths: torch.Tensor | None = None, # Only used for FlashInfer CuDNN backend ) -> torch.Tensor: return self._forward_sdpa(query, key, value, cu_seqlens) @@ -295,7 +295,7 @@ def forward_xpu( value: torch.Tensor, cu_seqlens: torch.Tensor | None = None, max_seqlen: torch.Tensor | None = None, # Only used for Flash Attention - act_seq_lens: torch.Tensor | None = None, + sequence_lengths: torch.Tensor | None = None, # Only used for FlashInfer CuDNN backend ) -> torch.Tensor: assert self.is_flash_attn_backend, ( "XPU only supports FLASH_ATTN for vision attention." 
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 8e1848c1e43f..ccf3dcb59049 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -368,7 +368,7 @@ def forward( rotary_pos_emb_cos: torch.Tensor, rotary_pos_emb_sin: torch.Tensor, max_seqlen: torch.Tensor, # Only used for Flash Attention - act_seq_lens: torch.Tensor | None = None, + sequence_lengths: torch.Tensor, # Only used for FlashInfer CuDNN backend ) -> torch.Tensor: # [s, b, c] --> [s, b, head * 3 * head_dim] x, _ = self.qkv(x) @@ -402,15 +402,14 @@ def forward( q, k = qk_rotated.unbind(dim=0) else: q, k, v = qkv.unbind(dim=2) - if self.attn.attn_backend == AttentionBackendEnum.FLASHINFER: - cu_seqlens = cu_seqlens // self.tp_size + context_layer = self.attn( query=q, key=k, value=v, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen, - act_seq_lens=act_seq_lens, + sequence_lengths=sequence_lengths, ) context_layer = einops.rearrange( diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index fafb40bdd8fb..bf376c73083c 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -52,6 +52,7 @@ from vllm.config import MultiModalConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions from vllm.distributed import get_pp_group +from vllm.distributed import parallel_state from vllm.logger import init_logger from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY from vllm.model_executor.layers.conv import Conv3dLayer @@ -131,6 +132,10 @@ # Official recommended max pixels is 24576 * 32 * 32 _MAX_FRAMES_PER_VIDEO = 24576 +# Batch buckets for cuDNN graph caching - graphs are cached per bucket size +# This avoids creating a new graph for each unique batch size at runtime +BATCH_BUCKETS = [8, 16, 32, 64] + class Qwen3_VisionPatchEmbed(nn.Module): def __init__( @@ -247,7 +252,7 @@ def forward( rotary_pos_emb_cos: torch.Tensor, rotary_pos_emb_sin: torch.Tensor, max_seqlen: torch.Tensor, # Only used for Flash Attention - act_seq_lens: torch.Tensor | None = None, + sequence_lengths: torch.Tensor, # Only used for FlashInfer CuDNN backend ) -> torch.Tensor: x = x + self.attn( self.norm1(x), @@ -255,7 +260,7 @@ def forward( rotary_pos_emb_cos=rotary_pos_emb_cos, rotary_pos_emb_sin=rotary_pos_emb_sin, max_seqlen=max_seqlen, - act_seq_lens=act_seq_lens, + sequence_lengths=sequence_lengths, ) x = x + self.mlp(self.norm2(x)) @@ -339,6 +344,17 @@ def __init__( self.deepstack_visual_indexes = vision_config.deepstack_visual_indexes self.num_grid_per_side = int(self.num_position_embeddings**0.5) + use_data_parallel = ( + multimodal_config.mm_encoder_tp_mode == "data" + if multimodal_config + else False + ) + self.tp_size = ( + 1 + if use_data_parallel + else parallel_state.get_tensor_model_parallel_world_size() + ) + # NOTE: This is used for creating empty tensor for all_gather for # DP ViT. 
Here out_hidden_size is enlarged due to deepstack self.out_hidden_size = vision_config.out_hidden_size * ( @@ -559,6 +575,53 @@ def compute_attn_mask_seqlen( max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max() return max_seqlen + def compute_flashinfer_cu_seqlens(self, cu_seqlens: np.ndarray) -> np.ndarray: + scale = self.hidden_size // self.tp_size + return np.concatenate([ + cu_seqlens * scale * 2, # q, k stride + cu_seqlens * scale * 3, # v stride + cu_seqlens * scale, # o stride + ]) + + def pad_to_batch_bucket( + self, + cu_seqlens: np.ndarray, + sequence_lengths: np.ndarray, + ) -> tuple[np.ndarray, np.ndarray]: + """ + Pad cu_seqlens and sequence_lengths to match the nearest batch bucket. + + This follows the same padding strategy as cuDNN frontend's SDPA caching: + https://github.com/NVIDIA/cudnn-frontend/blob/main/test/python/test_sdpa_with_caching.py + + Args: + cu_seqlens: Array of shape (batch_size + 1,) with cumulative offsets + sequence_lengths: Array of shape (batch_size,) with sequence lengths + + Returns: + Tuple of (padded_cu_seqlens, padded_sequence_lengths) + """ + batch_size = len(sequence_lengths) + # Find the nearest bucket size >= actual batch size + batch_size_padded = next( + (b for b in BATCH_BUCKETS if b >= batch_size), BATCH_BUCKETS[-1] + ) + + if batch_size_padded == batch_size: + return cu_seqlens, sequence_lengths + + pad_count = batch_size_padded - batch_size + + # Pad actual_seq_lens with zeros + zeros_seq_lens = np.zeros((pad_count, ), dtype=sequence_lengths.dtype) + sequence_lengths_padded = np.concatenate([sequence_lengths, zeros_seq_lens], axis=0) + + # Pad cu_seqlens with zeros + zeros_offsets = np.zeros((pad_count, ), dtype=cu_seqlens.dtype) + cu_seqlens_padded = np.concatenate([cu_seqlens, zeros_offsets], axis=0) + + return cu_seqlens_padded, sequence_lengths_padded + def forward( self, x: torch.Tensor, @@ -582,15 +645,17 @@ def forward( axis=0, dtype=np.int32 ) cu_seqlens = np.concatenate([np.zeros(1, dtype=np.int32), cu_seqlens]) - act_seq_lens = torch.from_numpy(cu_seqlens[1:] - cu_seqlens[:-1]) - act_seq_lens = act_seq_lens.to(self.device, non_blocking=True) + sequence_lengths = cu_seqlens[1:] - cu_seqlens[:-1] + if self.attn_backend == AttentionBackendEnum.FLASHINFER: + cu_seqlens, sequence_lengths = self.pad_to_batch_bucket(cu_seqlens, sequence_lengths) + cu_seqlens = self.compute_flashinfer_cu_seqlens(cu_seqlens) cu_seqlens = torch.from_numpy(cu_seqlens) - + sequence_lengths = torch.from_numpy(sequence_lengths) hidden_states = hidden_states.unsqueeze(1) - max_seqlen = self.compute_attn_mask_seqlen(cu_seqlens) - if self.attn_backend == AttentionBackendEnum.FLASHINFER: - cu_seqlens = cu_seqlens * self.hidden_size + max_seqlen = torch.tensor(128 * 1024, device=self.device) \ + if self.attn_backend == AttentionBackendEnum.FLASHINFER \ + else self.compute_attn_mask_seqlen(cu_seqlens) cu_seqlens = cu_seqlens.to(self.device, non_blocking=True) deepstack_feature_lists = [] @@ -601,7 +666,7 @@ def forward( rotary_pos_emb_cos=rotary_pos_emb_cos, rotary_pos_emb_sin=rotary_pos_emb_sin, max_seqlen=max_seqlen, - act_seq_lens=act_seq_lens, + sequence_lengths=sequence_lengths, ) if layer_num in self.deepstack_visual_indexes: deepstack_merger_idx = self.deepstack_visual_indexes.index(layer_num) diff --git a/vllm/v1/attention/ops/vit_attn_wrappers.py b/vllm/v1/attention/ops/vit_attn_wrappers.py index 7f4ee6311ee9..e948e5d17d5b 100644 --- a/vllm/v1/attention/ops/vit_attn_wrappers.py +++ b/vllm/v1/attention/ops/vit_attn_wrappers.py @@ -263,60 +263,6 @@ def 
vit_torch_sdpa_wrapper( ) -> torch.Tensor: return torch.ops.vllm.torch_sdpa_wrapper(q, k, v, scale, cu_seqlens) - -# Batch buckets for cuDNN graph caching - graphs are cached per bucket size -# This avoids creating a new graph for each unique batch size at runtime -BATCH_BUCKETS = [8, 16, 32, 64] - - -def _pad_to_batch_bucket( - batch_size: int, - actual_seq_lens: torch.Tensor, - batch_offsets: torch.Tensor, -) -> tuple[int, torch.Tensor, torch.Tensor]: - """ - Pad actual_seq_lens and batch_offsets to match the nearest batch bucket. - - This follows the same padding strategy as cuDNN frontend's SDPA caching: - https://github.com/NVIDIA/cudnn-frontend/blob/main/test/python/test_sdpa_with_caching.py - - Args: - batch_size: Actual batch size - actual_seq_lens: Tensor of shape (batch_size, 1, 1, 1) with sequence lengths - batch_offsets: Tensor of shape (batch_size + 1, 1, 1, 1) with cumulative offsets - - Returns: - Tuple of (padded_batch_size, padded_actual_seq_lens, padded_batch_offsets) - """ - # Find the nearest bucket size >= actual batch size - batch_size_padded = next( - (b for b in BATCH_BUCKETS if b >= batch_size), BATCH_BUCKETS[-1] - ) - - if batch_size_padded == batch_size: - return batch_size, actual_seq_lens, batch_offsets - - # Pad actual_seq_lens with zeros - zeros_seq_lens = torch.zeros( - (batch_size_padded - batch_size, 1, 1, 1), - dtype=actual_seq_lens.dtype, - device=actual_seq_lens.device, - ) - actual_seq_lens_padded = torch.cat([actual_seq_lens, zeros_seq_lens], dim=0) - - # Pad batch_offsets with zeros - # Note: batch_offsets has shape (batch_size + 1, 1, 1, 1), so we need to pad - # (batch_size_padded + 1) - (batch_size + 1) = batch_size_padded - batch_size - zeros_offsets = torch.zeros( - (batch_size_padded - batch_size, 1, 1, 1), - dtype=batch_offsets.dtype, - device=batch_offsets.device, - ) - batch_offsets_padded = torch.cat([batch_offsets, zeros_offsets], dim=0) - - return batch_size_padded, actual_seq_lens_padded, batch_offsets_padded - - def flashinfer_wrapper( q: torch.Tensor, k: torch.Tensor, @@ -325,27 +271,25 @@ def flashinfer_wrapper( workspace_buffer: torch.Tensor, cu_seqlens: torch.Tensor | None = None, max_seqlen: torch.Tensor | None = None, - act_seq_lens: torch.Tensor | None = None, + sequence_lengths: torch.Tensor | None = None, ) -> torch.Tensor: from vllm.v1.attention.backends.flashinfer import cudnn_batch_prefill_with_kv_cache is_reshaped = q.dim() == 4 - batch_size = q.shape[0] if is_reshaped else (cu_seqlens.shape[0] - 1) - + if is_reshaped: + reshape_batch_size = q.shape[0] q, k, v = (einops.rearrange(x, "b s ... 
-> (b s) ...") for x in [q, k, v]) - q_len = q.size(0) - batch_offsets = cu_seqlens.view(-1, 1, 1, 1) - actual_seq_lens = act_seq_lens.view(-1, 1, 1, 1) - max_seqlen = q_len if max_seqlen is None else max_seqlen.item() - - # Pad batch_offsets and actual_seq_lens to nearest batch bucket - # This enables cuDNN graph caching for better performance - padded_batch_size, actual_seq_lens_padded, batch_offsets_padded = \ - _pad_to_batch_bucket(batch_size, actual_seq_lens, batch_offsets) + assert len(cu_seqlens) % 3 == 0, "cu_seqlens must be divisible by 3" + cu_seqlength = len(cu_seqlens) // 3 + batch_offsets_qk = cu_seqlens[: cu_seqlength].view(-1, 1, 1, 1) + batch_offsets_v = cu_seqlens[cu_seqlength : cu_seqlength * 2].view(-1, 1, 1, 1) + batch_offsets_o = cu_seqlens[cu_seqlength * 2 :].view(-1, 1, 1, 1) + sequence_lengths = sequence_lengths.view(-1, 1, 1, 1) + max_seqlen = max_seqlen.item() - output = cudnn_batch_prefill_with_kv_cache( + output, _ = cudnn_batch_prefill_with_kv_cache( q, k, v, @@ -353,21 +297,18 @@ def flashinfer_wrapper( workspace_buffer, max_token_per_sequence=max_seqlen, max_sequence_kv=max_seqlen, - actual_seq_lens_q=actual_seq_lens_padded, - actual_seq_lens_kv=actual_seq_lens_padded, + actual_seq_lens_q=sequence_lengths, + actual_seq_lens_kv=sequence_lengths, causal=False, return_lse=False, - batch_offsets_q=batch_offsets_padded, - batch_offsets_o=batch_offsets_padded, - batch_offsets_k=batch_offsets_padded, - batch_offsets_v=batch_offsets_padded, + batch_offsets_q=batch_offsets_qk, + batch_offsets_k=batch_offsets_qk, + batch_offsets_v=batch_offsets_v, + batch_offsets_o=batch_offsets_o, ) - if isinstance(output, tuple): - for i in output: - if isinstance(i, torch.Tensor): - output = i + if is_reshaped: - output = einops.rearrange(output, "(b s) h d -> b s h d", b=batch_size) + output = einops.rearrange(output, "(b s) h d -> b s h d", b=reshape_batch_size) return output @@ -380,7 +321,7 @@ def vit_flashinfer_wrapper_fake( workspace_buffer: torch.Tensor, cu_seqlens: torch.Tensor | None = None, max_seqlen: torch.Tensor | None = None, - act_seq_lens: torch.Tensor | None = None, + sequence_lengths: torch.Tensor | None = None, ) -> torch.Tensor: return torch.empty_like(q) @@ -400,8 +341,8 @@ def vit_flashinfer_wrapper( workspace_buffer: torch.Tensor, cu_seqlens: torch.Tensor | None = None, max_seqlen: torch.Tensor | None = None, - act_seq_lens: torch.Tensor | None = None, + sequence_lengths: torch.Tensor | None = None, ) -> torch.Tensor: return torch.ops.vllm.flashinfer_wrapper( - q, k, v, scale, workspace_buffer, cu_seqlens, max_seqlen, act_seq_lens + q, k, v, scale, workspace_buffer, cu_seqlens, max_seqlen, sequence_lengths ) From 19d5ffae28394af877698ccff7bc185ec9ea19a3 Mon Sep 17 00:00:00 2001 From: Max Hu Date: Fri, 23 Jan 2026 13:52:42 -0800 Subject: [PATCH 10/34] more fix --- vllm/model_executor/models/qwen3_vl.py | 70 +++++++++----------------- 1 file changed, 25 insertions(+), 45 deletions(-) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index bf376c73083c..050c0ba387b2 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -575,52 +575,32 @@ def compute_attn_mask_seqlen( max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max() return max_seqlen - def compute_flashinfer_cu_seqlens(self, cu_seqlens: np.ndarray) -> np.ndarray: - scale = self.hidden_size // self.tp_size - return np.concatenate([ - cu_seqlens * scale * 2, # q, k stride - cu_seqlens * scale * 3, # v stride - cu_seqlens * 
scale, # o stride - ]) - - def pad_to_batch_bucket( - self, - cu_seqlens: np.ndarray, - sequence_lengths: np.ndarray, - ) -> tuple[np.ndarray, np.ndarray]: - """ - Pad cu_seqlens and sequence_lengths to match the nearest batch bucket. - - This follows the same padding strategy as cuDNN frontend's SDPA caching: - https://github.com/NVIDIA/cudnn-frontend/blob/main/test/python/test_sdpa_with_caching.py - - Args: - cu_seqlens: Array of shape (batch_size + 1,) with cumulative offsets - sequence_lengths: Array of shape (batch_size,) with sequence lengths - - Returns: - Tuple of (padded_cu_seqlens, padded_sequence_lengths) - """ - batch_size = len(sequence_lengths) - # Find the nearest bucket size >= actual batch size + def add_padding_to_fi_seqlens(self, seq: np.ndarray, batch_size: int, padding_value: int) -> np.ndarray: batch_size_padded = next( (b for b in BATCH_BUCKETS if b >= batch_size), BATCH_BUCKETS[-1] ) - if batch_size_padded == batch_size: - return cu_seqlens, sequence_lengths - - pad_count = batch_size_padded - batch_size - - # Pad actual_seq_lens with zeros - zeros_seq_lens = np.zeros((pad_count, ), dtype=sequence_lengths.dtype) - sequence_lengths_padded = np.concatenate([sequence_lengths, zeros_seq_lens], axis=0) - - # Pad cu_seqlens with zeros - zeros_offsets = np.zeros((pad_count, ), dtype=cu_seqlens.dtype) - cu_seqlens_padded = np.concatenate([cu_seqlens, zeros_offsets], axis=0) - - return cu_seqlens_padded, sequence_lengths_padded + return seq + return np.concatenate([seq, np.full((batch_size_padded - batch_size, ), padding_value, dtype=seq.dtype)]) + + def compute_flashinfer_cu_seqlens(self, + cu_seqlens: np.ndarray, + rotary_pos_emb_cos: torch.Tensor | None = None, + rotary_pos_emb_sin: torch.Tensor | None = None, + ) -> np.ndarray: + batch_size = len(cu_seqlens) - 1 + scale = self.hidden_size // self.tp_size + cu_seqlens = cu_seqlens * scale + if rotary_pos_emb_cos is not None and rotary_pos_emb_sin is not None: + cu_seqlens_qk = cu_seqlens * 2 + else: + cu_seqlens_qk = cu_seqlens * 3 + cu_seqlens_v = cu_seqlens * 3 + cu_seqlens_o = cu_seqlens + cu_seqlens_qk = self.add_padding_to_fi_seqlens(cu_seqlens_qk, batch_size, cu_seqlens_qk[-1]) + cu_seqlens_v = self.add_padding_to_fi_seqlens(cu_seqlens_v, batch_size, cu_seqlens_v[-1]) + cu_seqlens_o = self.add_padding_to_fi_seqlens(cu_seqlens_o, batch_size, cu_seqlens_o[-1]) + return np.concatenate([cu_seqlens_qk, cu_seqlens_v, cu_seqlens_o]) def forward( self, @@ -647,9 +627,8 @@ def forward( cu_seqlens = np.concatenate([np.zeros(1, dtype=np.int32), cu_seqlens]) sequence_lengths = cu_seqlens[1:] - cu_seqlens[:-1] if self.attn_backend == AttentionBackendEnum.FLASHINFER: - cu_seqlens, sequence_lengths = self.pad_to_batch_bucket(cu_seqlens, sequence_lengths) - cu_seqlens = self.compute_flashinfer_cu_seqlens(cu_seqlens) - + sequence_lengths = self.add_padding_to_fi_seqlens(sequence_lengths, len(sequence_lengths), 0) + cu_seqlens = self.compute_flashinfer_cu_seqlens(cu_seqlens, rotary_pos_emb_cos, rotary_pos_emb_sin) cu_seqlens = torch.from_numpy(cu_seqlens) sequence_lengths = torch.from_numpy(sequence_lengths) hidden_states = hidden_states.unsqueeze(1) @@ -657,6 +636,7 @@ def forward( if self.attn_backend == AttentionBackendEnum.FLASHINFER \ else self.compute_attn_mask_seqlen(cu_seqlens) cu_seqlens = cu_seqlens.to(self.device, non_blocking=True) + sequence_lengths = sequence_lengths.to(self.device, non_blocking=True) deepstack_feature_lists = [] for layer_num, blk in enumerate(self.blocks): From 47af3e1dfd8944bbf1a2c9461e3b8bdeb210be68 
Mon Sep 17 00:00:00 2001 From: Max Hu Date: Sun, 25 Jan 2026 16:25:51 -0800 Subject: [PATCH 11/34] change dockerfile --- docker/Dockerfile | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 227f4a3355c8..214000bdea79 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -496,12 +496,27 @@ RUN --mount=type=cache,target=/root/.cache/uv \ # This is ~1.1GB and only changes when FlashInfer version bumps # https://docs.flashinfer.ai/installation.html # From versions.json: .flashinfer.version -ARG FLASHINFER_VERSION=0.5.3 +# Install FlashInfer from CentML fork (source build) +# https://github.com/CentML/flashinfer/tree/mlperf-inf-mm-q3vl-v6.0 +ARG FLASHINFER_REPO=https://github.com/CentML/flashinfer.git +ARG FLASHINFER_BRANCH=mlperf-inf-mm-q3vl-v6.0 RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system flashinfer-cubin==${FLASHINFER_VERSION} \ - && uv pip install --system flashinfer-jit-cache==${FLASHINFER_VERSION} \ - --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \ - && flashinfer show-config + git clone --recursive -b ${FLASHINFER_BRANCH} ${FLASHINFER_REPO} /tmp/flashinfer && \ + cd /tmp/flashinfer && \ + # Install the main flashinfer package + uv pip install --system -v . && \ + # Build and install flashinfer-cubin + cd /tmp/flashinfer/flashinfer-cubin && \ + uv pip install --system build && \ + python -m build --no-isolation --wheel && \ + uv pip install --system dist/*.whl && \ + # Build and install flashinfer-jit-cache + cd /tmp/flashinfer/flashinfer-jit-cache && \ + python -m build --no-isolation --wheel && \ + uv pip install --system dist/*.whl && \ + # Cleanup + rm -rf /tmp/flashinfer && \ + flashinfer show-config # ============================================================ # OPENAI API SERVER DEPENDENCIES From a09a7854b567b22c072f4366cdf4944714f0ca36 Mon Sep 17 00:00:00 2001 From: Max Hu Date: Sun, 25 Jan 2026 16:49:49 -0800 Subject: [PATCH 12/34] format Signed-off-by: Max Hu --- .../layers/attention/mm_encoder_attention.py | 15 ++++-- vllm/model_executor/models/qwen2_5_vl.py | 2 +- vllm/model_executor/models/qwen3_vl.py | 47 ++++++++++++++----- vllm/v1/attention/backends/flashinfer.py | 1 - vllm/v1/attention/ops/vit_attn_wrappers.py | 3 +- 5 files changed, 47 insertions(+), 21 deletions(-) diff --git a/vllm/model_executor/layers/attention/mm_encoder_attention.py b/vllm/model_executor/layers/attention/mm_encoder_attention.py index 60cc9002e3c3..28d83776ebe5 100644 --- a/vllm/model_executor/layers/attention/mm_encoder_attention.py +++ b/vllm/model_executor/layers/attention/mm_encoder_attention.py @@ -194,7 +194,8 @@ def _forward_flashinfer( value: torch.Tensor, cu_seqlens: torch.Tensor | None = None, max_seqlen: torch.Tensor | None = None, - sequence_lengths: torch.Tensor | None = None, # Only used for FlashInfer CuDNN backend + sequence_lengths: torch.Tensor + | None = None, # Only used for FlashInfer CuDNN backend ) -> torch.Tensor: return vit_flashinfer_wrapper( q=query, @@ -248,7 +249,8 @@ def forward_native( value: torch.Tensor, cu_seqlens: torch.Tensor | None = None, max_seqlen: torch.Tensor | None = None, # Only used for Flash Attention - sequence_lengths: torch.Tensor | None = None, # Only used for FlashInfer CuDNN backend + sequence_lengths: torch.Tensor + | None = None, # Only used for FlashInfer CuDNN backend ) -> torch.Tensor: return self._forward_sdpa(query, key, value, cu_seqlens) @@ -259,7 +261,8 @@ def 
forward_cuda( value: torch.Tensor, cu_seqlens: torch.Tensor | None = None, max_seqlen: torch.Tensor | None = None, # Only used for Flash Attention - sequence_lengths: torch.Tensor | None = None, # Only used for FlashInfer CuDNN backend + sequence_lengths: torch.Tensor + | None = None, # Only used for FlashInfer CuDNN backend ) -> torch.Tensor: if self.is_fa4_backend: return self._forward_fa4(query, key, value, cu_seqlens, max_seqlen) @@ -284,7 +287,8 @@ def forward_cpu( value: torch.Tensor, cu_seqlens: torch.Tensor | None = None, max_seqlen: torch.Tensor | None = None, # Only used for Flash Attention - sequence_lengths: torch.Tensor | None = None, # Only used for FlashInfer CuDNN backend + sequence_lengths: torch.Tensor + | None = None, # Only used for FlashInfer CuDNN backend ) -> torch.Tensor: return self._forward_sdpa(query, key, value, cu_seqlens) @@ -295,7 +299,8 @@ def forward_xpu( value: torch.Tensor, cu_seqlens: torch.Tensor | None = None, max_seqlen: torch.Tensor | None = None, # Only used for Flash Attention - sequence_lengths: torch.Tensor | None = None, # Only used for FlashInfer CuDNN backend + sequence_lengths: torch.Tensor + | None = None, # Only used for FlashInfer CuDNN backend ) -> torch.Tensor: assert self.is_flash_attn_backend, ( "XPU only supports FLASH_ATTN for vision attention." diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index ccf3dcb59049..dfffbc307b49 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -368,7 +368,7 @@ def forward( rotary_pos_emb_cos: torch.Tensor, rotary_pos_emb_sin: torch.Tensor, max_seqlen: torch.Tensor, # Only used for Flash Attention - sequence_lengths: torch.Tensor, # Only used for FlashInfer CuDNN backend + sequence_lengths: torch.Tensor, # Only used for FlashInfer CuDNN backend ) -> torch.Tensor: # [s, b, c] --> [s, b, head * 3 * head_dim] x, _ = self.qkv(x) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 050c0ba387b2..40099edac6da 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -51,8 +51,7 @@ from vllm.compilation.decorators import support_torch_compile from vllm.config import MultiModalConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions -from vllm.distributed import get_pp_group -from vllm.distributed import parallel_state +from vllm.distributed import get_pp_group, parallel_state from vllm.logger import init_logger from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY from vllm.model_executor.layers.conv import Conv3dLayer @@ -252,7 +251,7 @@ def forward( rotary_pos_emb_cos: torch.Tensor, rotary_pos_emb_sin: torch.Tensor, max_seqlen: torch.Tensor, # Only used for Flash Attention - sequence_lengths: torch.Tensor, # Only used for FlashInfer CuDNN backend + sequence_lengths: torch.Tensor, # Only used for FlashInfer CuDNN backend ) -> torch.Tensor: x = x + self.attn( self.norm1(x), @@ -575,15 +574,25 @@ def compute_attn_mask_seqlen( max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max() return max_seqlen - def add_padding_to_fi_seqlens(self, seq: np.ndarray, batch_size: int, padding_value: int) -> np.ndarray: + def add_padding_to_fi_seqlens( + self, seq: np.ndarray, batch_size: int, padding_value: int + ) -> np.ndarray: batch_size_padded = next( (b for b in BATCH_BUCKETS if b >= batch_size), BATCH_BUCKETS[-1] ) if batch_size_padded == batch_size: return seq - return 
np.concatenate([seq, np.full((batch_size_padded - batch_size, ), padding_value, dtype=seq.dtype)]) + return np.concatenate( + [ + seq, + np.full( + (batch_size_padded - batch_size,), padding_value, dtype=seq.dtype + ), + ] + ) - def compute_flashinfer_cu_seqlens(self, + def compute_flashinfer_cu_seqlens( + self, cu_seqlens: np.ndarray, rotary_pos_emb_cos: torch.Tensor | None = None, rotary_pos_emb_sin: torch.Tensor | None = None, @@ -597,9 +606,15 @@ def compute_flashinfer_cu_seqlens(self, cu_seqlens_qk = cu_seqlens * 3 cu_seqlens_v = cu_seqlens * 3 cu_seqlens_o = cu_seqlens - cu_seqlens_qk = self.add_padding_to_fi_seqlens(cu_seqlens_qk, batch_size, cu_seqlens_qk[-1]) - cu_seqlens_v = self.add_padding_to_fi_seqlens(cu_seqlens_v, batch_size, cu_seqlens_v[-1]) - cu_seqlens_o = self.add_padding_to_fi_seqlens(cu_seqlens_o, batch_size, cu_seqlens_o[-1]) + cu_seqlens_qk = self.add_padding_to_fi_seqlens( + cu_seqlens_qk, batch_size, cu_seqlens_qk[-1] + ) + cu_seqlens_v = self.add_padding_to_fi_seqlens( + cu_seqlens_v, batch_size, cu_seqlens_v[-1] + ) + cu_seqlens_o = self.add_padding_to_fi_seqlens( + cu_seqlens_o, batch_size, cu_seqlens_o[-1] + ) return np.concatenate([cu_seqlens_qk, cu_seqlens_v, cu_seqlens_o]) def forward( @@ -627,14 +642,20 @@ def forward( cu_seqlens = np.concatenate([np.zeros(1, dtype=np.int32), cu_seqlens]) sequence_lengths = cu_seqlens[1:] - cu_seqlens[:-1] if self.attn_backend == AttentionBackendEnum.FLASHINFER: - sequence_lengths = self.add_padding_to_fi_seqlens(sequence_lengths, len(sequence_lengths), 0) - cu_seqlens = self.compute_flashinfer_cu_seqlens(cu_seqlens, rotary_pos_emb_cos, rotary_pos_emb_sin) + sequence_lengths = self.add_padding_to_fi_seqlens( + sequence_lengths, len(sequence_lengths), 0 + ) + cu_seqlens = self.compute_flashinfer_cu_seqlens( + cu_seqlens, rotary_pos_emb_cos, rotary_pos_emb_sin + ) cu_seqlens = torch.from_numpy(cu_seqlens) sequence_lengths = torch.from_numpy(sequence_lengths) hidden_states = hidden_states.unsqueeze(1) - max_seqlen = torch.tensor(128 * 1024, device=self.device) \ - if self.attn_backend == AttentionBackendEnum.FLASHINFER \ + max_seqlen = ( + torch.tensor(128 * 1024, device=self.device) + if self.attn_backend == AttentionBackendEnum.FLASHINFER else self.compute_attn_mask_seqlen(cu_seqlens) + ) cu_seqlens = cu_seqlens.to(self.device, non_blocking=True) sequence_lengths = sequence_lengths.to(self.device, non_blocking=True) diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index f9b594c943ae..a6e4776060f0 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -15,7 +15,6 @@ ) from flashinfer.decode import _get_range_buf, trtllm_batch_decode_with_kv_cache from flashinfer.prefill import trtllm_batch_context_with_kv_cache -from flashinfer.cudnn.prefill import cudnn_batch_prefill_with_kv_cache from flashinfer.utils import FP4Tensor from typing_extensions import override diff --git a/vllm/v1/attention/ops/vit_attn_wrappers.py b/vllm/v1/attention/ops/vit_attn_wrappers.py index e948e5d17d5b..17d80aae0b60 100644 --- a/vllm/v1/attention/ops/vit_attn_wrappers.py +++ b/vllm/v1/attention/ops/vit_attn_wrappers.py @@ -263,6 +263,7 @@ def vit_torch_sdpa_wrapper( ) -> torch.Tensor: return torch.ops.vllm.torch_sdpa_wrapper(q, k, v, scale, cu_seqlens) + def flashinfer_wrapper( q: torch.Tensor, k: torch.Tensor, @@ -283,7 +284,7 @@ def flashinfer_wrapper( assert len(cu_seqlens) % 3 == 0, "cu_seqlens must be divisible by 3" cu_seqlength = len(cu_seqlens) // 
3 - batch_offsets_qk = cu_seqlens[: cu_seqlength].view(-1, 1, 1, 1) + batch_offsets_qk = cu_seqlens[:cu_seqlength].view(-1, 1, 1, 1) batch_offsets_v = cu_seqlens[cu_seqlength : cu_seqlength * 2].view(-1, 1, 1, 1) batch_offsets_o = cu_seqlens[cu_seqlength * 2 :].view(-1, 1, 1, 1) sequence_lengths = sequence_lengths.view(-1, 1, 1, 1) From bfd41ecf56af5198a2c5d8c93bd25ce27681d001 Mon Sep 17 00:00:00 2001 From: Max Hu Date: Sun, 25 Jan 2026 17:14:24 -0800 Subject: [PATCH 13/34] fix version Signed-off-by: Max Hu --- docker/versions.json | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/docker/versions.json b/docker/versions.json index 045955bc46ce..bb2432067d6f 100644 --- a/docker/versions.json +++ b/docker/versions.json @@ -67,8 +67,11 @@ "RUN_WHEEL_CHECK": { "default": "true" }, - "FLASHINFER_VERSION": { - "default": "0.5.3" + "FLASHINFER_REPO": { + "default": "https://github.com/CentML/flashinfer.git" + }, + "FLASHINFER_BRANCH": { + "default": "mlperf-inf-mm-q3vl-v6.0" }, "GDRCOPY_CUDA_VERSION": { "default": "12.8" From 5599eb47b7ab76749bbb75a3c538d654350cb88c Mon Sep 17 00:00:00 2001 From: Max Hu Date: Sun, 25 Jan 2026 17:23:36 -0800 Subject: [PATCH 14/34] change python version --- docker/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 214000bdea79..8e1ab2998ac9 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -508,11 +508,11 @@ RUN --mount=type=cache,target=/root/.cache/uv \ # Build and install flashinfer-cubin cd /tmp/flashinfer/flashinfer-cubin && \ uv pip install --system build && \ - python -m build --no-isolation --wheel && \ + python3 -m build --no-isolation --wheel && \ uv pip install --system dist/*.whl && \ # Build and install flashinfer-jit-cache cd /tmp/flashinfer/flashinfer-jit-cache && \ - python -m build --no-isolation --wheel && \ + python3 -m build --no-isolation --wheel && \ uv pip install --system dist/*.whl && \ # Cleanup rm -rf /tmp/flashinfer && \ From 76b1482ec4f5427a3e9af81f4d3bc28aaca99f33 Mon Sep 17 00:00:00 2001 From: Max Hu Date: Sun, 25 Jan 2026 17:38:39 -0800 Subject: [PATCH 15/34] remove qwen25 transformer support --- vllm/model_executor/models/qwen2_5_vl.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index dfffbc307b49..a10e5a4689b6 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -440,7 +440,6 @@ def __init__( quant_config: QuantizationConfig | None = None, multimodal_config: MultiModalConfig | None = None, prefix: str = "", - workspace_buffer: torch.Tensor | None = None, # Only used for FlashInfer ) -> None: super().__init__() if norm_layer is None: @@ -454,7 +453,6 @@ def __init__( quant_config=quant_config, multimodal_config=multimodal_config, prefix=f"{prefix}.attn", - workspace_buffer=workspace_buffer, ) self.mlp = Qwen2_5_VisionMLP( dim, @@ -641,16 +639,11 @@ def __init__( AttentionBackendEnum.FLASH_ATTN_CUTE, AttentionBackendEnum.TORCH_SDPA, AttentionBackendEnum.ROCM_AITER_FA, - AttentionBackendEnum.FLASHINFER, }: raise RuntimeError( f"Qwen2.5-VL does not support {self.attn_backend} backend now." 
) - workspace_buffer = ( - None - if self.attn_backend != AttentionBackendEnum.FLASHINFER - else torch.zeros(128 * 1024 * 1024, dtype=torch.uint8, device=self.device) - ) + with set_model_tag("Qwen2_5_VisionBlock", is_encoder=True): self.blocks = nn.ModuleList( [ @@ -663,7 +656,6 @@ def __init__( quant_config=quant_config, multimodal_config=multimodal_config, prefix=f"{prefix}.blocks.{layer_idx}", - workspace_buffer=workspace_buffer, ) for layer_idx in range(depth) ] From fec48331929fe4f7ebfd77a91e5dddec39afd6e7 Mon Sep 17 00:00:00 2001 From: Max Hu Date: Mon, 26 Jan 2026 08:35:44 -0800 Subject: [PATCH 16/34] change dockerfile --- docker/Dockerfile | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 8e1ab2998ac9..027e8116b0f9 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -492,29 +492,32 @@ RUN --mount=type=cache,target=/root/.cache/uv \ --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') && \ rm /tmp/requirements-cuda.txt /tmp/common.txt -# Install FlashInfer pre-compiled kernel cache and binaries -# This is ~1.1GB and only changes when FlashInfer version bumps -# https://docs.flashinfer.ai/installation.html -# From versions.json: .flashinfer.version -# Install FlashInfer from CentML fork (source build) +# Install FlashInfer from CentML fork (source build with AOT kernels) # https://github.com/CentML/flashinfer/tree/mlperf-inf-mm-q3vl-v6.0 +# https://docs.flashinfer.ai/installation.html +ARG CUDA_VERSION ARG FLASHINFER_REPO=https://github.com/CentML/flashinfer.git ARG FLASHINFER_BRANCH=mlperf-inf-mm-q3vl-v6.0 RUN --mount=type=cache,target=/root/.cache/uv \ + # Set CUDA arch list based on CUDA version + if [[ "${CUDA_VERSION}" == 11.* ]]; then \ + FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"; \ + elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then \ + FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"; \ + elif [[ "${CUDA_VERSION}" == 12.[8-9]* ]]; then \ + FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 10.3a 12.0"; \ + else \ + FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0f 12.0"; \ + fi && \ + # Clone and build FlashInfer git clone --recursive -b ${FLASHINFER_BRANCH} ${FLASHINFER_REPO} /tmp/flashinfer && \ cd /tmp/flashinfer && \ - # Install the main flashinfer package - uv pip install --system -v . && \ - # Build and install flashinfer-cubin - cd /tmp/flashinfer/flashinfer-cubin && \ - uv pip install --system build && \ - python3 -m build --no-isolation --wheel && \ - uv pip install --system dist/*.whl && \ - # Build and install flashinfer-jit-cache - cd /tmp/flashinfer/flashinfer-jit-cache && \ - python3 -m build --no-isolation --wheel && \ - uv pip install --system dist/*.whl && \ - # Cleanup + # Set environment for build + export UV_TORCH_BACKEND=cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') && \ + export TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" && \ + export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" && \ + python3 -m flashinfer.aot && \ + uv pip install --system --no-build-isolation --force-reinstall -v . 
&& \
     rm -rf /tmp/flashinfer && \
     flashinfer show-config
 
From 9a8c2d5274238c7de4295f1bbdafc02a119201f1 Mon Sep 17 00:00:00 2001
From: Max Hu
Date: Mon, 26 Jan 2026 08:59:07 -0800
Subject: [PATCH 17/34] add build versions

---
 docker/Dockerfile | 23 ++++++++---------------
 1 file changed, 8 insertions(+), 15 deletions(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 027e8116b0f9..8c3d6a70a6e1 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -495,29 +495,22 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') && \
     rm /tmp/requirements-cuda.txt /tmp/common.txt
 
 # Install FlashInfer from CentML fork (source build with AOT kernels)
 # https://github.com/CentML/flashinfer/tree/mlperf-inf-mm-q3vl-v6.0
 # https://docs.flashinfer.ai/installation.html
-ARG CUDA_VERSION
 ARG FLASHINFER_REPO=https://github.com/CentML/flashinfer.git
 ARG FLASHINFER_BRANCH=mlperf-inf-mm-q3vl-v6.0
 RUN --mount=type=cache,target=/root/.cache/uv \
     # Set CUDA arch list based on CUDA version
-    if [[ "${CUDA_VERSION}" == 11.* ]]; then \
-        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"; \
-    elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then \
-        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"; \
-    elif [[ "${CUDA_VERSION}" == 12.[8-9]* ]]; then \
-        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 10.3a 12.0"; \
-    else \
-        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0f 12.0"; \
-    fi && \
     # Clone and build FlashInfer
     git clone --recursive -b ${FLASHINFER_BRANCH} ${FLASHINFER_REPO} /tmp/flashinfer && \
     cd /tmp/flashinfer && \
-    # Set environment for build
-    export UV_TORCH_BACKEND=cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') && \
-    export TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" && \
-    export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" && \
-    python3 -m flashinfer.aot && \
     uv pip install --system --no-build-isolation --force-reinstall -v . && \
+    uv pip install --system nvidia-nvshmem-cu12 && \
+    cd flashinfer-cubin && \
+    uv build --no-build-isolation --wheel . && \
+    uv pip install --system dist/*.whl && \
+    export FLASHINFER_CUDA_ARCH_LIST=="10.0a 10.0f 12.0"; \
+    cd flashinfer-jit-cache && \
+    uv build --no-build-isolation --wheel . && \
+    uv pip install --system dist/*.whl && \
     rm -rf /tmp/flashinfer && \
     flashinfer show-config

From f6a2ee72ac5baa1f86b9c41bcbbaf4a290343fb4 Mon Sep 17 00:00:00 2001
From: Max Hu
Date: Mon, 26 Jan 2026 09:05:12 -0800
Subject: [PATCH 18/34] change version

---
 docker/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 8c3d6a70a6e1..b246ff1b50b0 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -507,7 +507,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     cd flashinfer-cubin && \
     uv build --no-build-isolation --wheel . && \
     uv pip install --system dist/*.whl && \
-    export FLASHINFER_CUDA_ARCH_LIST=="10.0a 10.0f 12.0"; \
+    export FLASHINFER_CUDA_ARCH_LIST="13.0"; \
     cd flashinfer-jit-cache && \
     uv build --no-build-isolation --wheel . && \
     uv pip install --system dist/*.whl && \

From 4b9aa2a3ee9d97d7555c10e16d5672f468b980c3 Mon Sep 17 00:00:00 2001
From: Max Hu
Date: Mon, 26 Jan 2026 09:13:57 -0800
Subject: [PATCH 19/34] change version

---
 docker/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index b246ff1b50b0..090c4da43bd3 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -507,7 +507,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     cd flashinfer-cubin && \
     uv build --no-build-isolation --wheel . 
&& \ uv pip install --system dist/*.whl && \ - export FLASHINFER_CUDA_ARCH_LIST="13.0"; \ + export FLASHINFER_CUDA_ARCH_LIST="9.0a 10.0a 12.0"; \ cd flashinfer-jit-cache && \ uv build --no-build-isolation --wheel . && \ uv pip install --system dist/*.whl && \ From f782e97ca488d2c57737609a8f35618301b19f26 Mon Sep 17 00:00:00 2001 From: Max Hu Date: Mon, 26 Jan 2026 09:28:20 -0800 Subject: [PATCH 20/34] change --- docker/Dockerfile | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 090c4da43bd3..4e330ace0254 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -504,13 +504,15 @@ RUN --mount=type=cache,target=/root/.cache/uv \ cd /tmp/flashinfer && \ uv pip install --system --no-build-isolation --force-reinstall -v . && \ uv pip install --system nvidia-nvshmem-cu12 && \ - cd flashinfer-cubin && \ - uv build --no-build-isolation --wheel . && \ - uv pip install --system dist/*.whl && \ - export FLASHINFER_CUDA_ARCH_LIST="9.0a 10.0a 12.0"; \ - cd flashinfer-jit-cache && \ - uv build --no-build-isolation --wheel . && \ - uv pip install --system dist/*.whl && \ + # Build flashinfer-cubin (subshell isolates cd) + (cd flashinfer-cubin && \ + uv build --no-build-isolation --wheel . && \ + uv pip install --system dist/*.whl) && \ + # Build flashinfer-jit-cache (subshell isolates cd) + export FLASHINFER_CUDA_ARCH_LIST="9.0a 10.0a 12.0" && \ + (cd flashinfer-jit-cache && \ + uv build --no-build-isolation --wheel . && \ + uv pip install --system dist/*.whl) && \ rm -rf /tmp/flashinfer && \ flashinfer show-config From 56868a9c0eec97790c833f2b5bd897da1f5ca051 Mon Sep 17 00:00:00 2001 From: Max Hu Date: Mon, 26 Jan 2026 09:46:43 -0800 Subject: [PATCH 21/34] change --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 4e330ace0254..a11cebd27ce9 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -509,7 +509,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ uv build --no-build-isolation --wheel . && \ uv pip install --system dist/*.whl) && \ # Build flashinfer-jit-cache (subshell isolates cd) - export FLASHINFER_CUDA_ARCH_LIST="9.0a 10.0a 12.0" && \ + # export FLASHINFER_CUDA_ARCH_LIST="9.0a 10.0a 12.0" && \ (cd flashinfer-jit-cache && \ uv build --no-build-isolation --wheel . && \ uv pip install --system dist/*.whl) && \ From c2ca45033efb611f6c5d302a143b33978c88757d Mon Sep 17 00:00:00 2001 From: Max Hu Date: Mon, 26 Jan 2026 09:57:01 -0800 Subject: [PATCH 22/34] change --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index a11cebd27ce9..d521ba572fca 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -509,7 +509,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ uv build --no-build-isolation --wheel . && \ uv pip install --system dist/*.whl) && \ # Build flashinfer-jit-cache (subshell isolates cd) - # export FLASHINFER_CUDA_ARCH_LIST="9.0a 10.0a 12.0" && \ + export FLASHINFER_CUDA_ARCH_LIST="9.0a 10.0a 10.0f 10.0" && \ (cd flashinfer-jit-cache && \ uv build --no-build-isolation --wheel . 
&& \ uv pip install --system dist/*.whl) && \ From 1d8b7ec6daf0ba02c9289b0f382784772d46b52e Mon Sep 17 00:00:00 2001 From: Max Hu Date: Mon, 26 Jan 2026 09:57:36 -0800 Subject: [PATCH 23/34] change --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index d521ba572fca..a3e7d83b4dd8 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -509,7 +509,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ uv build --no-build-isolation --wheel . && \ uv pip install --system dist/*.whl) && \ # Build flashinfer-jit-cache (subshell isolates cd) - export FLASHINFER_CUDA_ARCH_LIST="9.0a 10.0a 10.0f 10.0" && \ + export FLASHINFER_CUDA_ARCH_LIST="9.0a 10.0f 10.0" && \ (cd flashinfer-jit-cache && \ uv build --no-build-isolation --wheel . && \ uv pip install --system dist/*.whl) && \ From 413260ec0e57eb46f046c83f3421ba7507e5829b Mon Sep 17 00:00:00 2001 From: Max Hu Date: Mon, 26 Jan 2026 10:05:15 -0800 Subject: [PATCH 24/34] change --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index a3e7d83b4dd8..6366535e02d4 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -509,7 +509,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ uv build --no-build-isolation --wheel . && \ uv pip install --system dist/*.whl) && \ # Build flashinfer-jit-cache (subshell isolates cd) - export FLASHINFER_CUDA_ARCH_LIST="9.0a 10.0f 10.0" && \ + export FLASHINFER_CUDA_ARCH_LIST="9.0a 10.0f 12.0" && \ (cd flashinfer-jit-cache && \ uv build --no-build-isolation --wheel . && \ uv pip install --system dist/*.whl) && \ From 7a2ac66d307ce59ea014f4840a8e8d3fc0ca1b48 Mon Sep 17 00:00:00 2001 From: Max Hu Date: Mon, 26 Jan 2026 11:27:00 -0800 Subject: [PATCH 25/34] build image --- docker/Dockerfile | 48 +++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 6366535e02d4..0beff8cabf67 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -391,6 +391,30 @@ COPY requirements/dev.txt requirements/dev.txt RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install --python /opt/venv/bin/python3 -r requirements/dev.txt \ --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') + +# Install FlashInfer from CentML fork (source build with AOT kernels) +# https://github.com/CentML/flashinfer/tree/mlperf-inf-mm-q3vl-v6.0 +# https://docs.flashinfer.ai/installation.html +ARG FLASHINFER_REPO=https://github.com/CentML/flashinfer.git +ARG FLASHINFER_BRANCH=mlperf-inf-mm-q3vl-v6.0 +RUN --mount=type=cache,target=/root/.cache/uv \ + # Set CUDA arch list based on CUDA version + # Clone and build FlashInfer + git clone --recursive -b ${FLASHINFER_BRANCH} ${FLASHINFER_REPO} /tmp/flashinfer && \ + cd /tmp/flashinfer && \ + uv pip install --system --no-build-isolation -v . && \ + # Build flashinfer-cubin (subshell isolates cd) + (cd flashinfer-cubin && \ + uv build --no-build-isolation --wheel . && \ + uv pip install --system dist/*.whl) && \ + # Build flashinfer-jit-cache (subshell isolates cd) + export FLASHINFER_CUDA_ARCH_LIST="9.0a 10.0f" && \ + (cd flashinfer-jit-cache && \ + uv build --no-build-isolation --wheel . 
&& \ + uv pip install --system dist/*.whl) && \ + rm -rf /tmp/flashinfer && \ + flashinfer show-config + #################### DEV IMAGE #################### #################### vLLM installation IMAGE #################### # image with vLLM installed @@ -492,30 +516,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \ --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') && \ rm /tmp/requirements-cuda.txt /tmp/common.txt -# Install FlashInfer from CentML fork (source build with AOT kernels) -# https://github.com/CentML/flashinfer/tree/mlperf-inf-mm-q3vl-v6.0 -# https://docs.flashinfer.ai/installation.html -ARG FLASHINFER_REPO=https://github.com/CentML/flashinfer.git -ARG FLASHINFER_BRANCH=mlperf-inf-mm-q3vl-v6.0 -RUN --mount=type=cache,target=/root/.cache/uv \ - # Set CUDA arch list based on CUDA version - # Clone and build FlashInfer - git clone --recursive -b ${FLASHINFER_BRANCH} ${FLASHINFER_REPO} /tmp/flashinfer && \ - cd /tmp/flashinfer && \ - uv pip install --system --no-build-isolation --force-reinstall -v . && \ - uv pip install --system nvidia-nvshmem-cu12 && \ - # Build flashinfer-cubin (subshell isolates cd) - (cd flashinfer-cubin && \ - uv build --no-build-isolation --wheel . && \ - uv pip install --system dist/*.whl) && \ - # Build flashinfer-jit-cache (subshell isolates cd) - export FLASHINFER_CUDA_ARCH_LIST="9.0a 10.0f 12.0" && \ - (cd flashinfer-jit-cache && \ - uv build --no-build-isolation --wheel . && \ - uv pip install --system dist/*.whl) && \ - rm -rf /tmp/flashinfer && \ - flashinfer show-config - # ============================================================ # OPENAI API SERVER DEPENDENCIES # Pre-install these to avoid reinstalling on every vLLM wheel rebuild From e8d34b775b1fc132ef90fabeed28ef07f3a1bfa8 Mon Sep 17 00:00:00 2001 From: Max Hu Date: Mon, 26 Jan 2026 12:22:21 -0800 Subject: [PATCH 26/34] change back --- docker/Dockerfile | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 0beff8cabf67..36d89b3eed19 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -392,29 +392,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install --python /opt/venv/bin/python3 -r requirements/dev.txt \ --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') -# Install FlashInfer from CentML fork (source build with AOT kernels) -# https://github.com/CentML/flashinfer/tree/mlperf-inf-mm-q3vl-v6.0 -# https://docs.flashinfer.ai/installation.html -ARG FLASHINFER_REPO=https://github.com/CentML/flashinfer.git -ARG FLASHINFER_BRANCH=mlperf-inf-mm-q3vl-v6.0 -RUN --mount=type=cache,target=/root/.cache/uv \ - # Set CUDA arch list based on CUDA version - # Clone and build FlashInfer - git clone --recursive -b ${FLASHINFER_BRANCH} ${FLASHINFER_REPO} /tmp/flashinfer && \ - cd /tmp/flashinfer && \ - uv pip install --system --no-build-isolation -v . && \ - # Build flashinfer-cubin (subshell isolates cd) - (cd flashinfer-cubin && \ - uv build --no-build-isolation --wheel . && \ - uv pip install --system dist/*.whl) && \ - # Build flashinfer-jit-cache (subshell isolates cd) - export FLASHINFER_CUDA_ARCH_LIST="9.0a 10.0f" && \ - (cd flashinfer-jit-cache && \ - uv build --no-build-isolation --wheel . 
&& \ - uv pip install --system dist/*.whl) && \ - rm -rf /tmp/flashinfer && \ - flashinfer show-config - #################### DEV IMAGE #################### #################### vLLM installation IMAGE #################### # image with vLLM installed @@ -516,6 +493,29 @@ RUN --mount=type=cache,target=/root/.cache/uv \ --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') && \ rm /tmp/requirements-cuda.txt /tmp/common.txt +# Install FlashInfer from CentML fork (source build with AOT kernels) +# https://github.com/CentML/flashinfer/tree/mlperf-inf-mm-q3vl-v6.0 +# https://docs.flashinfer.ai/installation.html +ARG FLASHINFER_REPO=https://github.com/CentML/flashinfer.git +ARG FLASHINFER_BRANCH=mlperf-inf-mm-q3vl-v6.0 +RUN --mount=type=cache,target=/root/.cache/uv \ + # Set CUDA arch list based on CUDA version + # Clone and build FlashInfer + git clone --recursive -b ${FLASHINFER_BRANCH} ${FLASHINFER_REPO} /tmp/flashinfer && \ + cd /tmp/flashinfer && \ + uv pip install --system --no-build-isolation -v . && \ + # Build flashinfer-cubin (subshell isolates cd) + (cd flashinfer-cubin && \ + uv build --no-build-isolation --wheel . && \ + uv pip install --system dist/*.whl) && \ + # Build flashinfer-jit-cache (subshell isolates cd) + export FLASHINFER_CUDA_ARCH_LIST="9.0a 10.0a 10.3a" && \ + (cd flashinfer-jit-cache && \ + uv build --no-build-isolation --wheel . && \ + uv pip install --system dist/*.whl) && \ + rm -rf /tmp/flashinfer && \ + flashinfer show-config + # ============================================================ # OPENAI API SERVER DEPENDENCIES # Pre-install these to avoid reinstalling on every vLLM wheel rebuild From 5adb2944890dd057232481f5d6864638f4919aab Mon Sep 17 00:00:00 2001 From: Max Hu Date: Mon, 26 Jan 2026 13:08:54 -0800 Subject: [PATCH 27/34] change to 10.0f --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 36d89b3eed19..728d65965764 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -509,7 +509,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ uv build --no-build-isolation --wheel . && \ uv pip install --system dist/*.whl) && \ # Build flashinfer-jit-cache (subshell isolates cd) - export FLASHINFER_CUDA_ARCH_LIST="9.0a 10.0a 10.3a" && \ + export FLASHINFER_CUDA_ARCH_LIST="9.0a 10.0f 10.3a" && \ (cd flashinfer-jit-cache && \ uv build --no-build-isolation --wheel . 
&& \ uv pip install --system dist/*.whl) && \ From bc90e8f76003bf457eb0d9f9a9f295e81b0d8516 Mon Sep 17 00:00:00 2001 From: Max Hu Date: Mon, 26 Jan 2026 13:25:04 -0800 Subject: [PATCH 28/34] fix fi import Signed-off-by: Max Hu --- vllm/v1/attention/ops/vit_attn_wrappers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/attention/ops/vit_attn_wrappers.py b/vllm/v1/attention/ops/vit_attn_wrappers.py index 17d80aae0b60..84b1438fb1b0 100644 --- a/vllm/v1/attention/ops/vit_attn_wrappers.py +++ b/vllm/v1/attention/ops/vit_attn_wrappers.py @@ -274,7 +274,7 @@ def flashinfer_wrapper( max_seqlen: torch.Tensor | None = None, sequence_lengths: torch.Tensor | None = None, ) -> torch.Tensor: - from vllm.v1.attention.backends.flashinfer import cudnn_batch_prefill_with_kv_cache + from flashinfer.prefill import cudnn_batch_prefill_with_kv_cache is_reshaped = q.dim() == 4 From 2d1286dd676e1e8197cd2264ccdfe90c389d26d1 Mon Sep 17 00:00:00 2001 From: Max Hu Date: Mon, 26 Jan 2026 13:29:12 -0800 Subject: [PATCH 29/34] change to build in dev image Signed-off-by: Max Hu --- docker/Dockerfile | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 728d65965764..226958c0a205 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -367,6 +367,29 @@ RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \ else \ echo "Skipping wheel size check."; \ fi + +# Install FlashInfer from CentML fork (source build with AOT kernels) +# https://github.com/CentML/flashinfer/tree/mlperf-inf-mm-q3vl-v6.0 +# https://docs.flashinfer.ai/installation.html +ARG FLASHINFER_REPO=https://github.com/CentML/flashinfer.git +ARG FLASHINFER_BRANCH=mlperf-inf-mm-q3vl-v6.0 +RUN --mount=type=cache,target=/root/.cache/uv \ + # Set CUDA arch list based on CUDA version + # Clone and build FlashInfer + git clone --recursive -b ${FLASHINFER_BRANCH} ${FLASHINFER_REPO} /tmp/flashinfer && \ + cd /tmp/flashinfer && \ + uv pip install --system --no-build-isolation -v . && \ + # Build flashinfer-cubin (subshell isolates cd) + (cd flashinfer-cubin && \ + uv build --no-build-isolation --wheel . && \ + uv pip install --system dist/*.whl) && \ + # Build flashinfer-jit-cache (subshell isolates cd) + export FLASHINFER_CUDA_ARCH_LIST="9.0a 10.0a 10.3a" && \ + (cd flashinfer-jit-cache && \ + uv build --no-build-isolation --wheel . && \ + uv pip install --system dist/*.whl) && \ + rm -rf /tmp/flashinfer && \ + flashinfer show-config #################### EXTENSION Build IMAGE #################### #################### DEV IMAGE #################### @@ -493,29 +516,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \ --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') && \ rm /tmp/requirements-cuda.txt /tmp/common.txt -# Install FlashInfer from CentML fork (source build with AOT kernels) -# https://github.com/CentML/flashinfer/tree/mlperf-inf-mm-q3vl-v6.0 -# https://docs.flashinfer.ai/installation.html -ARG FLASHINFER_REPO=https://github.com/CentML/flashinfer.git -ARG FLASHINFER_BRANCH=mlperf-inf-mm-q3vl-v6.0 -RUN --mount=type=cache,target=/root/.cache/uv \ - # Set CUDA arch list based on CUDA version - # Clone and build FlashInfer - git clone --recursive -b ${FLASHINFER_BRANCH} ${FLASHINFER_REPO} /tmp/flashinfer && \ - cd /tmp/flashinfer && \ - uv pip install --system --no-build-isolation -v . 
&& \ - # Build flashinfer-cubin (subshell isolates cd) - (cd flashinfer-cubin && \ - uv build --no-build-isolation --wheel . && \ - uv pip install --system dist/*.whl) && \ - # Build flashinfer-jit-cache (subshell isolates cd) - export FLASHINFER_CUDA_ARCH_LIST="9.0a 10.0f 10.3a" && \ - (cd flashinfer-jit-cache && \ - uv build --no-build-isolation --wheel . && \ - uv pip install --system dist/*.whl) && \ - rm -rf /tmp/flashinfer && \ - flashinfer show-config - # ============================================================ # OPENAI API SERVER DEPENDENCIES # Pre-install these to avoid reinstalling on every vLLM wheel rebuild From 42858c683ad445fe2c8eda1c79c0f4d2500fa296 Mon Sep 17 00:00:00 2001 From: Max Hu Date: Mon, 26 Jan 2026 13:35:03 -0800 Subject: [PATCH 30/34] change location Signed-off-by: Max Hu --- docker/Dockerfile | 46 ++++++++++++++++++++++---------------------- docker/versions.json | 12 ++++++------ 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 226958c0a205..29be6f269a80 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -273,6 +273,29 @@ ENV UV_LINK_MODE=copy WORKDIR /workspace +# Install FlashInfer from CentML fork (source build with AOT kernels) +# https://github.com/CentML/flashinfer/tree/mlperf-inf-mm-q3vl-v6.0 +# https://docs.flashinfer.ai/installation.html +ARG FLASHINFER_REPO=https://github.com/CentML/flashinfer.git +ARG FLASHINFER_BRANCH=mlperf-inf-mm-q3vl-v6.0 +RUN --mount=type=cache,target=/root/.cache/uv \ + # Set CUDA arch list based on CUDA version + # Clone and build FlashInfer + git clone --recursive -b ${FLASHINFER_BRANCH} ${FLASHINFER_REPO} /tmp/flashinfer && \ + cd /tmp/flashinfer && \ + uv pip install --system --no-build-isolation -v . && \ + # Build flashinfer-cubin (subshell isolates cd) + (cd flashinfer-cubin && \ + uv build --no-build-isolation --wheel . && \ + uv pip install --system dist/*.whl) && \ + # Build flashinfer-jit-cache (subshell isolates cd) + export FLASHINFER_CUDA_ARCH_LIST="9.0a 10.0a 10.3a" && \ + (cd flashinfer-jit-cache && \ + uv build --no-build-isolation --wheel . && \ + uv pip install --system dist/*.whl) && \ + rm -rf /tmp/flashinfer && \ + flashinfer show-config + # Build DeepGEMM wheel # Default moved here from tools/install_deepgemm.sh for centralized version management ARG DEEPGEMM_GIT_REF=594953acce41793ae00a1233eb516044d604bcb6 @@ -367,29 +390,6 @@ RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \ else \ echo "Skipping wheel size check."; \ fi - -# Install FlashInfer from CentML fork (source build with AOT kernels) -# https://github.com/CentML/flashinfer/tree/mlperf-inf-mm-q3vl-v6.0 -# https://docs.flashinfer.ai/installation.html -ARG FLASHINFER_REPO=https://github.com/CentML/flashinfer.git -ARG FLASHINFER_BRANCH=mlperf-inf-mm-q3vl-v6.0 -RUN --mount=type=cache,target=/root/.cache/uv \ - # Set CUDA arch list based on CUDA version - # Clone and build FlashInfer - git clone --recursive -b ${FLASHINFER_BRANCH} ${FLASHINFER_REPO} /tmp/flashinfer && \ - cd /tmp/flashinfer && \ - uv pip install --system --no-build-isolation -v . && \ - # Build flashinfer-cubin (subshell isolates cd) - (cd flashinfer-cubin && \ - uv build --no-build-isolation --wheel . && \ - uv pip install --system dist/*.whl) && \ - # Build flashinfer-jit-cache (subshell isolates cd) - export FLASHINFER_CUDA_ARCH_LIST="9.0a 10.0a 10.3a" && \ - (cd flashinfer-jit-cache && \ - uv build --no-build-isolation --wheel . 
&& \ - uv pip install --system dist/*.whl) && \ - rm -rf /tmp/flashinfer && \ - flashinfer show-config #################### EXTENSION Build IMAGE #################### #################### DEV IMAGE #################### diff --git a/docker/versions.json b/docker/versions.json index bb2432067d6f..543cb8fdc113 100644 --- a/docker/versions.json +++ b/docker/versions.json @@ -49,6 +49,12 @@ "vllm_target_device": { "default": "cuda" }, + "FLASHINFER_REPO": { + "default": "https://github.com/CentML/flashinfer.git" + }, + "FLASHINFER_BRANCH": { + "default": "mlperf-inf-mm-q3vl-v6.0" + }, "DEEPGEMM_GIT_REF": { "default": "594953acce41793ae00a1233eb516044d604bcb6" }, @@ -67,12 +73,6 @@ "RUN_WHEEL_CHECK": { "default": "true" }, - "FLASHINFER_REPO": { - "default": "https://github.com/CentML/flashinfer.git" - }, - "FLASHINFER_BRANCH": { - "default": "mlperf-inf-mm-q3vl-v6.0" - }, "GDRCOPY_CUDA_VERSION": { "default": "12.8" }, From c9a8f9b22ccdd90169438c5111bd4fb8b075deac Mon Sep 17 00:00:00 2001 From: Max Hu Date: Mon, 26 Jan 2026 13:50:28 -0800 Subject: [PATCH 31/34] change location Signed-off-by: Max Hu --- docker/Dockerfile | 46 ++++++++++++++++++++++---------------------- docker/versions.json | 12 ++++++------ 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 29be6f269a80..c76d887a2fe8 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -273,29 +273,6 @@ ENV UV_LINK_MODE=copy WORKDIR /workspace -# Install FlashInfer from CentML fork (source build with AOT kernels) -# https://github.com/CentML/flashinfer/tree/mlperf-inf-mm-q3vl-v6.0 -# https://docs.flashinfer.ai/installation.html -ARG FLASHINFER_REPO=https://github.com/CentML/flashinfer.git -ARG FLASHINFER_BRANCH=mlperf-inf-mm-q3vl-v6.0 -RUN --mount=type=cache,target=/root/.cache/uv \ - # Set CUDA arch list based on CUDA version - # Clone and build FlashInfer - git clone --recursive -b ${FLASHINFER_BRANCH} ${FLASHINFER_REPO} /tmp/flashinfer && \ - cd /tmp/flashinfer && \ - uv pip install --system --no-build-isolation -v . && \ - # Build flashinfer-cubin (subshell isolates cd) - (cd flashinfer-cubin && \ - uv build --no-build-isolation --wheel . && \ - uv pip install --system dist/*.whl) && \ - # Build flashinfer-jit-cache (subshell isolates cd) - export FLASHINFER_CUDA_ARCH_LIST="9.0a 10.0a 10.3a" && \ - (cd flashinfer-jit-cache && \ - uv build --no-build-isolation --wheel . && \ - uv pip install --system dist/*.whl) && \ - rm -rf /tmp/flashinfer && \ - flashinfer show-config - # Build DeepGEMM wheel # Default moved here from tools/install_deepgemm.sh for centralized version management ARG DEEPGEMM_GIT_REF=594953acce41793ae00a1233eb516044d604bcb6 @@ -327,6 +304,29 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ${DEEPEP_COMMIT_HASH:+--deepep-ref "$DEEPEP_COMMIT_HASH"} \ ${NVSHMEM_VER:+--nvshmem-ver "$NVSHMEM_VER"} && \ find /tmp/ep_kernels_workspace/nvshmem -name '*.a' -delete + +# Install FlashInfer from CentML fork (source build with AOT kernels) +# https://github.com/CentML/flashinfer/tree/mlperf-inf-mm-q3vl-v6.0 +# https://docs.flashinfer.ai/installation.html +ARG FLASHINFER_REPO=https://github.com/CentML/flashinfer.git +ARG FLASHINFER_BRANCH=mlperf-inf-mm-q3vl-v6.0 +RUN --mount=type=cache,target=/root/.cache/uv \ + # Set CUDA arch list based on CUDA version + # Clone and build FlashInfer + git clone --recursive -b ${FLASHINFER_BRANCH} ${FLASHINFER_REPO} /tmp/flashinfer && \ + cd /tmp/flashinfer && \ + uv pip install --system --no-build-isolation -v . 
&& \ + # Build flashinfer-cubin (subshell isolates cd) + (cd flashinfer-cubin && \ + uv build --no-build-isolation --wheel . && \ + uv pip install --system dist/*.whl) && \ + # Build flashinfer-jit-cache (subshell isolates cd) + export FLASHINFER_CUDA_ARCH_LIST="9.0a 10.0a 10.3a" && \ + (cd flashinfer-jit-cache && \ + uv build --no-build-isolation --wheel . && \ + uv pip install --system dist/*.whl) && \ + rm -rf /tmp/flashinfer && \ + flashinfer show-config #################### EXTENSIONS BUILD IMAGE #################### #################### WHEEL BUILD IMAGE #################### diff --git a/docker/versions.json b/docker/versions.json index 543cb8fdc113..bfa345fc7a15 100644 --- a/docker/versions.json +++ b/docker/versions.json @@ -49,12 +49,6 @@ "vllm_target_device": { "default": "cuda" }, - "FLASHINFER_REPO": { - "default": "https://github.com/CentML/flashinfer.git" - }, - "FLASHINFER_BRANCH": { - "default": "mlperf-inf-mm-q3vl-v6.0" - }, "DEEPGEMM_GIT_REF": { "default": "594953acce41793ae00a1233eb516044d604bcb6" }, @@ -64,6 +58,12 @@ "DEEPEP_COMMIT_HASH": { "default": "73b6ea4" }, + "FLASHINFER_REPO": { + "default": "https://github.com/CentML/flashinfer.git" + }, + "FLASHINFER_BRANCH": { + "default": "mlperf-inf-mm-q3vl-v6.0" + }, "GIT_REPO_CHECK": { "default": "0" }, From 89703a475b4fa0f82f7a9d0429d89a6e519b989a Mon Sep 17 00:00:00 2001 From: Max Hu Date: Mon, 26 Jan 2026 14:10:41 -0800 Subject: [PATCH 32/34] change Signed-off-by: Max Hu --- docker/Dockerfile | 48 ++++++++++++++++++++++++-------------------- docker/versions.json | 12 +++++------ 2 files changed, 32 insertions(+), 28 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index c76d887a2fe8..d946fdb3494d 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -305,28 +305,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ${NVSHMEM_VER:+--nvshmem-ver "$NVSHMEM_VER"} && \ find /tmp/ep_kernels_workspace/nvshmem -name '*.a' -delete -# Install FlashInfer from CentML fork (source build with AOT kernels) -# https://github.com/CentML/flashinfer/tree/mlperf-inf-mm-q3vl-v6.0 -# https://docs.flashinfer.ai/installation.html -ARG FLASHINFER_REPO=https://github.com/CentML/flashinfer.git -ARG FLASHINFER_BRANCH=mlperf-inf-mm-q3vl-v6.0 -RUN --mount=type=cache,target=/root/.cache/uv \ - # Set CUDA arch list based on CUDA version - # Clone and build FlashInfer - git clone --recursive -b ${FLASHINFER_BRANCH} ${FLASHINFER_REPO} /tmp/flashinfer && \ - cd /tmp/flashinfer && \ - uv pip install --system --no-build-isolation -v . && \ - # Build flashinfer-cubin (subshell isolates cd) - (cd flashinfer-cubin && \ - uv build --no-build-isolation --wheel . && \ - uv pip install --system dist/*.whl) && \ - # Build flashinfer-jit-cache (subshell isolates cd) - export FLASHINFER_CUDA_ARCH_LIST="9.0a 10.0a 10.3a" && \ - (cd flashinfer-jit-cache && \ - uv build --no-build-isolation --wheel . 
&& \ - uv pip install --system dist/*.whl) && \ - rm -rf /tmp/flashinfer && \ - flashinfer show-config #################### EXTENSIONS BUILD IMAGE #################### #################### WHEEL BUILD IMAGE #################### @@ -467,6 +445,8 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ python${PYTHON_VERSION}-dev \ python${PYTHON_VERSION}-venv \ libibverbs-dev \ + gcc \ + g++ \ && rm -rf /var/lib/apt/lists/* \ && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ @@ -516,6 +496,30 @@ RUN --mount=type=cache,target=/root/.cache/uv \ --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') && \ rm /tmp/requirements-cuda.txt /tmp/common.txt +# Install FlashInfer from CentML fork (source build with AOT kernels) +# https://github.com/CentML/flashinfer/tree/mlperf-inf-mm-q3vl-v6.0 +# https://docs.flashinfer.ai/installation.html +ARG FLASHINFER_REPO=https://github.com/CentML/flashinfer.git +ARG FLASHINFER_BRANCH=mlperf-inf-mm-q3vl-v6.0 +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system cmake ninja && \ + # Set CUDA arch list based on CUDA version + # Clone and build FlashInfer + git clone --recursive -b ${FLASHINFER_BRANCH} ${FLASHINFER_REPO} /tmp/flashinfer && \ + cd /tmp/flashinfer && \ + uv pip install --system --no-build-isolation -v . && \ + # Build flashinfer-cubin (subshell isolates cd) + (cd flashinfer-cubin && \ + uv build --no-build-isolation --wheel . && \ + uv pip install --system dist/*.whl) && \ + # Build flashinfer-jit-cache (subshell isolates cd) + export FLASHINFER_CUDA_ARCH_LIST="9.0a 10.0a 10.3a" && \ + (cd flashinfer-jit-cache && \ + uv build --no-build-isolation --wheel . 
&& \ + uv pip install --system dist/*.whl) && \ + rm -rf /tmp/flashinfer && \ + flashinfer show-config + # ============================================================ # OPENAI API SERVER DEPENDENCIES # Pre-install these to avoid reinstalling on every vLLM wheel rebuild diff --git a/docker/versions.json b/docker/versions.json index bfa345fc7a15..bb2432067d6f 100644 --- a/docker/versions.json +++ b/docker/versions.json @@ -58,12 +58,6 @@ "DEEPEP_COMMIT_HASH": { "default": "73b6ea4" }, - "FLASHINFER_REPO": { - "default": "https://github.com/CentML/flashinfer.git" - }, - "FLASHINFER_BRANCH": { - "default": "mlperf-inf-mm-q3vl-v6.0" - }, "GIT_REPO_CHECK": { "default": "0" }, @@ -73,6 +67,12 @@ "RUN_WHEEL_CHECK": { "default": "true" }, + "FLASHINFER_REPO": { + "default": "https://github.com/CentML/flashinfer.git" + }, + "FLASHINFER_BRANCH": { + "default": "mlperf-inf-mm-q3vl-v6.0" + }, "GDRCOPY_CUDA_VERSION": { "default": "12.8" }, From 9431a615e9ac6c8fe79120bb42b825064f8c1dae Mon Sep 17 00:00:00 2001 From: Max Hu Date: Mon, 26 Jan 2026 20:50:11 -0800 Subject: [PATCH 33/34] change cubin and jitcache to wheels Signed-off-by: Max Hu --- docker/Dockerfile | 17 +++++------------ docker/versions.json | 6 ++++++ 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index d946fdb3494d..c738245df502 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -445,8 +445,6 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ python${PYTHON_VERSION}-dev \ python${PYTHON_VERSION}-venv \ libibverbs-dev \ - gcc \ - g++ \ && rm -rf /var/lib/apt/lists/* \ && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ @@ -501,22 +499,17 @@ RUN --mount=type=cache,target=/root/.cache/uv \ # https://docs.flashinfer.ai/installation.html ARG FLASHINFER_REPO=https://github.com/CentML/flashinfer.git ARG FLASHINFER_BRANCH=mlperf-inf-mm-q3vl-v6.0 +ARG FLASHINFER_CUBIN_VERSION=0.5.3 +ARG FLASHINFER_JIT_CACHE_VERSION=0.5.3 RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system cmake ninja && \ # Set CUDA arch list based on CUDA version # Clone and build FlashInfer git clone --recursive -b ${FLASHINFER_BRANCH} ${FLASHINFER_REPO} /tmp/flashinfer && \ cd /tmp/flashinfer && \ uv pip install --system --no-build-isolation -v . && \ - # Build flashinfer-cubin (subshell isolates cd) - (cd flashinfer-cubin && \ - uv build --no-build-isolation --wheel . && \ - uv pip install --system dist/*.whl) && \ - # Build flashinfer-jit-cache (subshell isolates cd) - export FLASHINFER_CUDA_ARCH_LIST="9.0a 10.0a 10.3a" && \ - (cd flashinfer-jit-cache && \ - uv build --no-build-isolation --wheel . && \ - uv pip install --system dist/*.whl) && \ + uv pip install --system flashinfer-cubin==${FLASHINFER_CUBIN_VERSION} && \ + uv pip install --system flashinfer-jit-cache==${FLASHINFER_JIT_CACHE_VERSION} \ + --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. 
-f1,2 | tr -d '.') && \ rm -rf /tmp/flashinfer && \ flashinfer show-config diff --git a/docker/versions.json b/docker/versions.json index bb2432067d6f..3bb174eea948 100644 --- a/docker/versions.json +++ b/docker/versions.json @@ -73,6 +73,12 @@ "FLASHINFER_BRANCH": { "default": "mlperf-inf-mm-q3vl-v6.0" }, + "FLASHINFER_CUBIN_VERSION": { + "default": "0.5.3" + }, + "FLASHINFER_JIT_CACHE_VERSION": { + "default": "0.5.3" + }, "GDRCOPY_CUDA_VERSION": { "default": "12.8" }, From 0e0f19eeb128720c7245dd8a84a1482cf3926765 Mon Sep 17 00:00:00 2001 From: Max Hu Date: Mon, 26 Jan 2026 20:54:06 -0800 Subject: [PATCH 34/34] change Signed-off-by: Max Hu --- docker/Dockerfile | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index c738245df502..ac7cea073e35 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -304,7 +304,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ${DEEPEP_COMMIT_HASH:+--deepep-ref "$DEEPEP_COMMIT_HASH"} \ ${NVSHMEM_VER:+--nvshmem-ver "$NVSHMEM_VER"} && \ find /tmp/ep_kernels_workspace/nvshmem -name '*.a' -delete - #################### EXTENSIONS BUILD IMAGE #################### #################### WHEEL BUILD IMAGE #################### @@ -392,7 +391,6 @@ COPY requirements/dev.txt requirements/dev.txt RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install --python /opt/venv/bin/python3 -r requirements/dev.txt \ --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') - #################### DEV IMAGE #################### #################### vLLM installation IMAGE #################### # image with vLLM installed @@ -494,15 +492,13 @@ RUN --mount=type=cache,target=/root/.cache/uv \ --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') && \ rm /tmp/requirements-cuda.txt /tmp/common.txt -# Install FlashInfer from CentML fork (source build with AOT kernels) +# Install FlashInfer from CentML fork # https://github.com/CentML/flashinfer/tree/mlperf-inf-mm-q3vl-v6.0 -# https://docs.flashinfer.ai/installation.html ARG FLASHINFER_REPO=https://github.com/CentML/flashinfer.git ARG FLASHINFER_BRANCH=mlperf-inf-mm-q3vl-v6.0 ARG FLASHINFER_CUBIN_VERSION=0.5.3 ARG FLASHINFER_JIT_CACHE_VERSION=0.5.3 RUN --mount=type=cache,target=/root/.cache/uv \ - # Set CUDA arch list based on CUDA version # Clone and build FlashInfer git clone --recursive -b ${FLASHINFER_BRANCH} ${FLASHINFER_REPO} /tmp/flashinfer && \ cd /tmp/flashinfer && \