From a0789a921916922d16f005eb31a7805535079a92 Mon Sep 17 00:00:00 2001 From: vickiw973 Date: Wed, 1 Oct 2025 15:34:41 +0800 Subject: [PATCH 01/13] add a simple fp16 gemm. --- problems/nvidia/fp16_gemm/reference.py | 395 ++++++++++++++++++++++++ problems/nvidia/fp16_gemm/submission.py | 96 ++++++ problems/nvidia/fp16_gemm/task.py | 15 + problems/nvidia/fp16_gemm/task.yml | 65 ++++ 4 files changed, 571 insertions(+) create mode 100644 problems/nvidia/fp16_gemm/reference.py create mode 100644 problems/nvidia/fp16_gemm/submission.py create mode 100644 problems/nvidia/fp16_gemm/task.py create mode 100644 problems/nvidia/fp16_gemm/task.yml diff --git a/problems/nvidia/fp16_gemm/reference.py b/problems/nvidia/fp16_gemm/reference.py new file mode 100644 index 00000000..77f52c7a --- /dev/null +++ b/problems/nvidia/fp16_gemm/reference.py @@ -0,0 +1,395 @@ +import argparse +import torch +from typing import Tuple + +import cutlass +import cutlass.cute as cute +import cutlass.utils as utils +import cutlass.torch as cutlass_torch +import cutlass.pipeline as pipeline +from cutlass.cute.nvgpu import cpasync, tcgen05 +import cutlass.utils.blackwell_helpers as sm100_utils +from cutlass.cute.runtime import from_dlpack + +""" +The first tutorial GEMM demonstrating a simple kernel implementation in CuTeDSL + +This dense GEMM kernel is implemented in just over 200 lines of code. +With large tile sizes, it can achieve very high performance on 8k×8k×8k problem sizes. +It can serve as a starting point to help users quickly experiment +with optimizations for challenges that may arise with other problem sizes. + +To run this example: +.. code-block:: bash + + python examples/blackwell/tutorial_fp16_gemm_0.py \ + --mnk 8192,8192,8192 \ + --tolerance 1e-01 + +Constraints for this example: +* The problem size of m and n must be divisible by the tile size m & n (128, 256) +""" + +io_dtype = cutlass.Float16 +acc_dtype = cutlass.Float32 +mma_inst_shape_mnk = (128, 256, 16) +mma_tiler_mnk = (128, 256, 64) +threads_per_cta = 128 + +# Pipeline stage configuration +ab_stages = 4 +acc_stage = 1 + + +@cute.struct +class SharedStorage: + ab_mbar_ptr: cute.struct.MemRange[cutlass.Int64, ab_stages * 2] + acc_mbar_ptr: cute.struct.MemRange[cutlass.Int64, acc_stage * 2] + tmem_holding_buf: cutlass.Int32 + + +@cute.kernel +def kernel( + tiled_mma: cute.TiledMma, + tma_atom_a: cute.CopyAtom, + mA_mkl: cute.Tensor, + tma_atom_b: cute.CopyAtom, + mB_nkl: cute.Tensor, + mC_mnl: cute.Tensor, + a_smem_layout: cute.ComposedLayout, + b_smem_layout: cute.ComposedLayout, +): + # Current thread/warp/block coordinates + tidx, _, _ = cute.arch.thread_idx() + warp_idx = cute.arch.warp_idx() + warp_idx = cute.arch.make_warp_uniform(warp_idx) + bidx, bidy, _ = cute.arch.block_idx() + mma_coord_mnk = (bidx, bidy, None) + + # + # 1. 
Prepare args + # + + # Allocate SMEM + smem = cutlass.utils.SmemAllocator() + storage = smem.allocate(SharedStorage) + sA = smem.allocate_tensor( + element_type=io_dtype, + layout=a_smem_layout.outer, + byte_alignment=128, + swizzle=a_smem_layout.inner, + ) + sB = smem.allocate_tensor( + element_type=io_dtype, + layout=b_smem_layout.outer, + byte_alignment=128, + swizzle=b_smem_layout.inner, + ) + + # Allocate all TMEM columns + # tmem_alloc_barrier = pipeline.NamedBarrier( + # barrier_id=1, + # num_threads=threads_per_cta, + # ) + # tmem = utils.TmemAllocator( + # storage.tmem_holding_buf, + # barrier_for_retrieve=tmem_alloc_barrier, + # ) + # num_tmem_cols = 512 + # tmem.allocate(num_tmem_cols) + + # Prefetch tma descriptor + if warp_idx == 0: + cpasync.prefetch_descriptor(tma_atom_a) + cpasync.prefetch_descriptor(tma_atom_b) + + # Pipeline configuration + num_tma_copy_bytes = cute.size_in_bytes( + io_dtype, cute.select(a_smem_layout, mode=[0, 1, 2]) + ) + cute.size_in_bytes(io_dtype, cute.select(b_smem_layout, mode=[0, 1, 2])) + ab_producer, ab_consumer = pipeline.PipelineTmaUmma.create( + num_stages=ab_stages, + producer_group=pipeline.CooperativeGroup(pipeline.Agent.Thread), + consumer_group=pipeline.CooperativeGroup(pipeline.Agent.Thread), + tx_count=num_tma_copy_bytes, + barrier_storage=storage.ab_mbar_ptr.data_ptr(), + ).make_participants() + acc_producer, acc_consumer = pipeline.PipelineUmmaAsync.create( + num_stages=acc_stage, + producer_group=pipeline.CooperativeGroup(pipeline.Agent.Thread), + consumer_group=pipeline.CooperativeGroup( + pipeline.Agent.Thread, threads_per_cta, threads_per_cta + ), + barrier_storage=storage.acc_mbar_ptr.data_ptr(), + ).make_participants() + + # Partition tensors for MMA and make fragments + # (bM, bK, RestK) + gA = cute.local_tile(mA_mkl, mma_tiler_mnk, mma_coord_mnk, proj=(1, None, 1)) + # (bN, bK, RestK) + gB = cute.local_tile(mB_nkl, mma_tiler_mnk, mma_coord_mnk, proj=(None, 1, 1)) + # (bM, bN) + gC = cute.local_tile(mC_mnl, mma_tiler_mnk, mma_coord_mnk, proj=(1, 1, None)) + thr_mma = tiled_mma.get_slice(0) + # (MMA, MMA_M, MMA_K) + tCgA = thr_mma.partition_A(gA) + # (MMA, MMA_N, MMA_K) + tCgB = thr_mma.partition_B(gB) + # (MMA, MMA_M, MMA_N) + tCgC = thr_mma.partition_C(gC) + # (MMA, MMA_M, MMA_K) + tCrA = tiled_mma.make_fragment_A(sA) + # (MMA, MMA_N, MMA_K) + tCrB = tiled_mma.make_fragment_B(sB) + # (MMA, MMA_M, MMA_N) + acc_shape = tiled_mma.partition_shape_C(mma_tiler_mnk[:2]) + # (MMA, MMA_M, MMA_N) + tCtAcc = tiled_mma.make_fragment_C(acc_shape) + # Partition tensors for TMA; This requires the tensors partitioned for MMA + tAsA, tAgA = cute.nvgpu.cpasync.tma_partition( + tma_atom_a, + 0, + cute.make_layout(1), + cute.group_modes(sA, 0, 3), + cute.group_modes(tCgA, 0, 3), + ) + tBsB, tBgB = cute.nvgpu.cpasync.tma_partition( + tma_atom_b, + 0, + cute.make_layout(1), + cute.group_modes(sB, 0, 3), + cute.group_modes(tCgB, 0, 3), + ) + + # CTA-wide sync before retrieving the pointer to the start of the allocated TMEM + # Only warp 0 does the allocation so we need to sync before retrieving the TMEM start address + if warp_idx == 0: + cute.arch.alloc_tmem(512, storage.tmem_holding_buf) + cute.arch.barrier() + tmem_ptr = cute.arch.retrieve_tmem_ptr( + acc_dtype, + alignment=16, + ptr_to_buffer_holding_addr=storage.tmem_holding_buf, + ) + + # Swap the pointer in tCtAcc + tCtAcc = cute.make_tensor(tmem_ptr, tCtAcc.layout) + + subtile_cnt = 4 + # (EpiTile) + epi_tiler = ( + (cute.size(tCtAcc, mode=[0, 0]), cute.size(tCtAcc, mode=[0, 1]) // 
subtile_cnt), + ) + # (EpiTile, NumTiles) + tCtAcc_epi = cute.zipped_divide(tCtAcc, epi_tiler) + # (EpiTile, NumTiles) + gC_epi = cute.zipped_divide(tCgC, epi_tiler) + + # Every thread loads 32x128 bits + tmem_atom = cute.make_copy_atom( + tcgen05.Ld32x32bOp(tcgen05.Repetition.x64), + cutlass.Float32, + ) + tmem_tiled_copy = tcgen05.make_tmem_copy(tmem_atom, tCtAcc_epi[None, 0]) + tmem_thr_copy = tmem_tiled_copy.get_slice(tidx) + + # (TmemCpy,NumTmemCpy,NumTiles) + tDtC = tmem_thr_copy.partition_S(tCtAcc_epi) + # (TmemCpy,NumTmemCpy,NumTiles) + tDgC = tmem_thr_copy.partition_D(gC_epi) + + # (TmemCpy,NumTmemCpy) + tCrAcc = cute.make_fragment(tDgC[None, None, 0].shape, acc_dtype) + # (TmemCpy,NumTmemCpy) + tCrC = cute.make_fragment(tDgC[None, None, 0].shape, io_dtype) + + # + # 2. Main loop + # + num_k_tiles = cute.size(gA, mode=[2]) + if warp_idx == 0: + # Wait for a empty accumulator buffer + acc_empty = acc_producer.acquire_and_advance() + for k_tile_idx in cutlass.range(num_k_tiles, prefetch_stages=ab_stages - 2): + # Issue TMA loads + ab_empty = ab_producer.acquire_and_advance() + cute.copy( + tma_atom_a, + tAgA[(None, ab_empty.count)], + tAsA[(None, ab_empty.index)], + tma_bar_ptr=ab_empty.barrier, + ) + cute.copy( + tma_atom_b, + tBgB[(None, ab_empty.count)], + tBsB[(None, ab_empty.index)], + tma_bar_ptr=ab_empty.barrier, + ) + + # Execute one K-block worth of MMA instructions + ab_full = ab_consumer.wait_and_advance() + num_k_blocks = cute.size(tCrA, mode=[2]) + for k_block_idx in cutlass.range_constexpr(num_k_blocks): + k_block_coord = (None, None, k_block_idx, ab_full.index) + cute.gemm( + tiled_mma, + tCtAcc, + tCrA[k_block_coord], + tCrB[k_block_coord], + tCtAcc, + ) + tiled_mma.set(tcgen05.Field.ACCUMULATE, True) + + # Signal that the A/B buffers have been consumed and are ready for the next load + ab_full.release() + + # Signal that the accumulator is fully computed + acc_empty.commit() + + # + # 3. 
Epilogue + # + + # Release TMEM allocation lock + if warp_idx == 0: + cute.arch.relinquish_tmem_alloc_permit() + + # Wait for the accumulator buffer to be full + acc_full = acc_consumer.wait_and_advance() + + # TMEM -> RMEM -> GEMM + # Sub-tiling for better instruction-level parallelism + for i in cutlass.range(cute.size(tDtC, mode=[2])): + cute.copy(tmem_tiled_copy, tDtC[None, None, i], tCrAcc) + tCrC.store(tCrAcc.load().to(io_dtype)) + cute.autovec_copy(tCrC, tDgC[None, None, i]) + + acc_full.release() + + # Deallocate TMEM + cute.arch.barrier() + if warp_idx == 0: + cute.arch.dealloc_tmem(tmem_ptr, 512) + + +@cute.jit +def ref_kernel( + a: cute.Tensor, + b: cute.Tensor, + c: cute.Tensor, +): + # Construct tiled MMA + op = tcgen05.MmaF16BF16Op( + io_dtype, + acc_dtype, + mma_inst_shape_mnk, + tcgen05.CtaGroup.ONE, + tcgen05.OperandSource.SMEM, + tcgen05.OperandMajorMode.K, + tcgen05.OperandMajorMode.K, + ) + tiled_mma = cute.make_tiled_mma(op) + + # Construct SMEM layouts for A and B + a_smem_layout = sm100_utils.make_smem_layout_a( + tiled_mma, + mma_tiler_mnk, + a.element_type, + ab_stages, + ) + b_smem_layout = sm100_utils.make_smem_layout_b( + tiled_mma, + mma_tiler_mnk, + b.element_type, + ab_stages, + ) + a_smem_layout_one_stage = cute.select(a_smem_layout, mode=[0, 1, 2]) + b_smem_layout_one_stage = cute.select(b_smem_layout, mode=[0, 1, 2]) + + # Construct TMA load atoms + op = cute.nvgpu.cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.ONE) + a_tma_atom, a_tma_tensor = cute.nvgpu.make_tiled_tma_atom_A( + op, + a, + a_smem_layout_one_stage, + mma_tiler_mnk, + tiled_mma, + (1, 1, 1), + ) + b_tma_atom, b_tma_tensor = cute.nvgpu.make_tiled_tma_atom_B( + op, + b, + b_smem_layout_one_stage, + mma_tiler_mnk, + tiled_mma, + (1, 1, 1), + ) + + # Launch the kernel + grid_shape = cute.ceil_div((*c.layout.shape, 1), mma_tiler_mnk[:2]) + kernel( + tiled_mma, + a_tma_atom, + a_tma_tensor, + b_tma_atom, + b_tma_tensor, + c, + a_smem_layout, + b_smem_layout, + ).launch( + grid=grid_shape, + block=(threads_per_cta, 1, 1), + ) + + +def generate_input( + m: int, + n: int, + k: int, + seed: int, +): + torch.manual_seed(seed) + + # Make K-major tensors (torch tensors are row-major) + def make_tensors(mn, k, dtype): + shape = (mn, k) + return ( + torch.empty(*shape, dtype=torch.int32) + .random_(-2, 2) + .to(dtype=dtype, device="cuda") + ) + + a = make_tensors(m, k, cutlass_torch.dtype(io_dtype)) + b = make_tensors(n, k, cutlass_torch.dtype(io_dtype)) + c = make_tensors(m, n, cutlass_torch.dtype(io_dtype)) + a_tensor = ( + from_dlpack(a, assumed_align=32) + .mark_layout_dynamic(leading_dim=1) + .mark_compact_shape_dynamic(mode=1, divisibility=k) + ) + b_tensor = ( + from_dlpack(b, assumed_align=32) + .mark_layout_dynamic(leading_dim=1) + .mark_compact_shape_dynamic(mode=1, divisibility=k) + ) + c_tensor = ( + from_dlpack(c, assumed_align=32) + .mark_layout_dynamic(leading_dim=1) + .mark_compact_shape_dynamic(mode=1, divisibility=n) + ) + + # Compute reference result and verify + ref = (torch.einsum("mk,nk->mn", a.to(torch.float32), b.to(torch.float32))).cpu() + + return a_tensor, b_tensor, c_tensor, c, ref + + +def check_implementation(ref, c_tensor): + torch.testing.assert_close( + c_tensor.cpu(), ref.to(cutlass_torch.dtype(io_dtype)), atol=1e-01, rtol=1e-05 + ) + print("PASS") + +a_tensor, b_tensor, c_tensor, c, ref = generate_input(m=1024, n=1024, k=1024, seed=1111) +ref_kernel(a_tensor, b_tensor, c_tensor) +check_implementation(ref, c) \ No newline at end of file diff --git 
a/problems/nvidia/fp16_gemm/submission.py b/problems/nvidia/fp16_gemm/submission.py new file mode 100644 index 00000000..32e57057 --- /dev/null +++ b/problems/nvidia/fp16_gemm/submission.py @@ -0,0 +1,96 @@ +import argparse +import torch +from typing import Tuple + +import cutlass +import cutlass.cute as cute +import cutlass.utils as utils +import cutlass.torch as cutlass_torch +import cutlass.pipeline as pipeline +from cutlass.cute.nvgpu import cpasync, tcgen05 +import cutlass.utils.blackwell_helpers as sm100_utils +from cutlass.cute.runtime import from_dlpack + + +io_dtype = cutlass.Float16 +acc_dtype = cutlass.Float32 +mma_inst_shape_mnk = (128, 256, 16) +mma_tiler_mnk = (128, 256, 64) +threads_per_cta = 128 + +# Pipeline stage configuration +ab_stages = 4 +acc_stage = 1 + + +@cute.jit +def custom_kernel( + a: cute.Tensor, + b: cute.Tensor, + c: cute.Tensor, +): + # Construct tiled MMA + op = tcgen05.MmaF16BF16Op( + io_dtype, + acc_dtype, + mma_inst_shape_mnk, + tcgen05.CtaGroup.ONE, + tcgen05.OperandSource.SMEM, + tcgen05.OperandMajorMode.K, + tcgen05.OperandMajorMode.K, + ) + tiled_mma = cute.make_tiled_mma(op) + + # Construct SMEM layouts for A and B + a_smem_layout = sm100_utils.make_smem_layout_a( + tiled_mma, + mma_tiler_mnk, + a.element_type, + ab_stages, + ) + b_smem_layout = sm100_utils.make_smem_layout_b( + tiled_mma, + mma_tiler_mnk, + b.element_type, + ab_stages, + ) + a_smem_layout_one_stage = cute.select(a_smem_layout, mode=[0, 1, 2]) + b_smem_layout_one_stage = cute.select(b_smem_layout, mode=[0, 1, 2]) + + # Construct TMA load atoms + op = cute.nvgpu.cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.ONE) + a_tma_atom, a_tma_tensor = cute.nvgpu.make_tiled_tma_atom_A( + op, + a, + a_smem_layout_one_stage, + mma_tiler_mnk, + tiled_mma, + (1, 1, 1), + ) + b_tma_atom, b_tma_tensor = cute.nvgpu.make_tiled_tma_atom_B( + op, + b, + b_smem_layout_one_stage, + mma_tiler_mnk, + tiled_mma, + (1, 1, 1), + ) + + # Launch the kernel + grid_shape = cute.ceil_div((*c.layout.shape, 1), mma_tiler_mnk[:2]) + + # Implement your kernel + my_kernel( + tiled_mma, + a_tma_atom, + a_tma_tensor, + b_tma_atom, + b_tma_tensor, + c, + a_smem_layout, + b_smem_layout, + ).launch( + grid=grid_shape, + block=(threads_per_cta, 1, 1), + ) + diff --git a/problems/nvidia/fp16_gemm/task.py b/problems/nvidia/fp16_gemm/task.py new file mode 100644 index 00000000..769c5546 --- /dev/null +++ b/problems/nvidia/fp16_gemm/task.py @@ -0,0 +1,15 @@ +import argparse +import torch +from typing import Tuple + +import cutlass +import cutlass.cute as cute + +input_t = TypeVar("input_t", bound=tuple[cute.Tensor, cute.Tensor, cute.Tensor]) +output_t = TypeVar("output_t", bound=cute.Tensor) + +class TestSpec(TypedDict): + m: int + n: int + k: int + seed: int \ No newline at end of file diff --git a/problems/nvidia/fp16_gemm/task.yml b/problems/nvidia/fp16_gemm/task.yml new file mode 100644 index 00000000..96ef86d7 --- /dev/null +++ b/problems/nvidia/fp16_gemm/task.yml @@ -0,0 +1,65 @@ + +# name: fp8-matmul + +files: + - {"name": "submission.py", "source": "@SUBMISSION@"} + - {"name": "task.py", "source": "task.py"} + - {"name": "utils.py", "source": "../utils.py"} + - {"name": "reference.py", "source": "reference.py"} + - {"name": "eval.py", "source": "../eval.py"} + +lang: "py" + +description: | + + You will implement a fp16 matmul kernel optimized for NVIDIA B200. + The shapes of tensors are from DeepSeek-R1. 
+ To be explicit, you will be given a tuple of tensors: + ``` + (a, b, c) + ``` + where `a` and `b` are the input matrices, and `c` is the output matrix: + * `a` is M x K in row-major order in fp16 + * `b` is N x K in column-major order in fp16 + * `c` is M x N in row-major order in fp16 + + Matrix sizes `M` and `N` are divisible by 64, `K` is divisible by 64. + + The ranking criteria is the geometric mean of the benchmark results. + + For the grand price, your kernel will be evaluated against the speed of light analysis + and the solution closest to the speed of light will be awarded the grand price. + ``` + The speed of light analysis is: + M N K time[us] + 7168 128 16384 17.38 + 4096 128 7168 4.34 + 7168 128 2048 2.17 + ``` + +config: + main: "eval.py" + +templates: + Python: "template.py" + HIP: "template-hip.py" + +tests: + - {"m": 64, "n": 64, "k": 64, "seed": 1111} + - {"m": 64, "n": 1536, "k": 7168, "seed": 1111} + - {"m": 64, "n": 3072, "k": 1536, "seed": 1111} + - {"m": 64, "n": 576, "k": 7168, "seed": 1111} + - {"m": 96, "n": 7168, "k": 256, "seed": 1111} + - {"m": 96, "n": 7168, "k": 2048, "seed": 1111} + - {"m": 96, "n": 4608, "k": 7168, "seed": 1111} + - {"m": 128, "n": 7168, "k": 2304, "seed": 1111} + - {"m": 128, "n": 512, "k": 7168, "seed": 1111} + - {"m": 512, "n": 4096, "k": 512, "seed": 1111} + - {"m": 512, "n": 1536, "k": 7168, "seed": 1111} + +benchmarks: + - {"m": 7168, "n": 128, "k": 16384, "seed": 1111} + - {"m": 4096, "n": 128, "k": 7168, "seed": 1111} + - {"m": 7168, "n": 128, "k": 2048, "seed": 1111} + +ranking_by: "geom" From 3218ce467924e436cbe58528b36028fa3b45e7f3 Mon Sep 17 00:00:00 2001 From: vickiw973 Date: Thu, 2 Oct 2025 13:10:19 +0800 Subject: [PATCH 02/13] move cute reference to submission.py --- problems/nvidia/fp16_gemm/reference.py | 394 +----------------------- problems/nvidia/fp16_gemm/submission.py | 256 ++++++++++++++- 2 files changed, 266 insertions(+), 384 deletions(-) diff --git a/problems/nvidia/fp16_gemm/reference.py b/problems/nvidia/fp16_gemm/reference.py index 77f52c7a..d2f18694 100644 --- a/problems/nvidia/fp16_gemm/reference.py +++ b/problems/nvidia/fp16_gemm/reference.py @@ -1,345 +1,15 @@ -import argparse import torch -from typing import Tuple +from task import input_t, output_t +from utils import make_match_reference -import cutlass -import cutlass.cute as cute -import cutlass.utils as utils -import cutlass.torch as cutlass_torch -import cutlass.pipeline as pipeline -from cutlass.cute.nvgpu import cpasync, tcgen05 -import cutlass.utils.blackwell_helpers as sm100_utils -from cutlass.cute.runtime import from_dlpack -""" -The first tutorial GEMM demonstrating a simple kernel implementation in CuTeDSL - -This dense GEMM kernel is implemented in just over 200 lines of code. -With large tile sizes, it can achieve very high performance on 8k×8k×8k problem sizes. -It can serve as a starting point to help users quickly experiment -with optimizations for challenges that may arise with other problem sizes. - -To run this example: -.. 
code-block:: bash - - python examples/blackwell/tutorial_fp16_gemm_0.py \ - --mnk 8192,8192,8192 \ - --tolerance 1e-01 - -Constraints for this example: -* The problem size of m and n must be divisible by the tile size m & n (128, 256) -""" - -io_dtype = cutlass.Float16 -acc_dtype = cutlass.Float32 -mma_inst_shape_mnk = (128, 256, 16) -mma_tiler_mnk = (128, 256, 64) -threads_per_cta = 128 - -# Pipeline stage configuration -ab_stages = 4 -acc_stage = 1 - - -@cute.struct -class SharedStorage: - ab_mbar_ptr: cute.struct.MemRange[cutlass.Int64, ab_stages * 2] - acc_mbar_ptr: cute.struct.MemRange[cutlass.Int64, acc_stage * 2] - tmem_holding_buf: cutlass.Int32 - - -@cute.kernel -def kernel( - tiled_mma: cute.TiledMma, - tma_atom_a: cute.CopyAtom, - mA_mkl: cute.Tensor, - tma_atom_b: cute.CopyAtom, - mB_nkl: cute.Tensor, - mC_mnl: cute.Tensor, - a_smem_layout: cute.ComposedLayout, - b_smem_layout: cute.ComposedLayout, -): - # Current thread/warp/block coordinates - tidx, _, _ = cute.arch.thread_idx() - warp_idx = cute.arch.warp_idx() - warp_idx = cute.arch.make_warp_uniform(warp_idx) - bidx, bidy, _ = cute.arch.block_idx() - mma_coord_mnk = (bidx, bidy, None) - - # - # 1. Prepare args - # - - # Allocate SMEM - smem = cutlass.utils.SmemAllocator() - storage = smem.allocate(SharedStorage) - sA = smem.allocate_tensor( - element_type=io_dtype, - layout=a_smem_layout.outer, - byte_alignment=128, - swizzle=a_smem_layout.inner, - ) - sB = smem.allocate_tensor( - element_type=io_dtype, - layout=b_smem_layout.outer, - byte_alignment=128, - swizzle=b_smem_layout.inner, - ) - - # Allocate all TMEM columns - # tmem_alloc_barrier = pipeline.NamedBarrier( - # barrier_id=1, - # num_threads=threads_per_cta, - # ) - # tmem = utils.TmemAllocator( - # storage.tmem_holding_buf, - # barrier_for_retrieve=tmem_alloc_barrier, - # ) - # num_tmem_cols = 512 - # tmem.allocate(num_tmem_cols) - - # Prefetch tma descriptor - if warp_idx == 0: - cpasync.prefetch_descriptor(tma_atom_a) - cpasync.prefetch_descriptor(tma_atom_b) - - # Pipeline configuration - num_tma_copy_bytes = cute.size_in_bytes( - io_dtype, cute.select(a_smem_layout, mode=[0, 1, 2]) - ) + cute.size_in_bytes(io_dtype, cute.select(b_smem_layout, mode=[0, 1, 2])) - ab_producer, ab_consumer = pipeline.PipelineTmaUmma.create( - num_stages=ab_stages, - producer_group=pipeline.CooperativeGroup(pipeline.Agent.Thread), - consumer_group=pipeline.CooperativeGroup(pipeline.Agent.Thread), - tx_count=num_tma_copy_bytes, - barrier_storage=storage.ab_mbar_ptr.data_ptr(), - ).make_participants() - acc_producer, acc_consumer = pipeline.PipelineUmmaAsync.create( - num_stages=acc_stage, - producer_group=pipeline.CooperativeGroup(pipeline.Agent.Thread), - consumer_group=pipeline.CooperativeGroup( - pipeline.Agent.Thread, threads_per_cta, threads_per_cta - ), - barrier_storage=storage.acc_mbar_ptr.data_ptr(), - ).make_participants() - - # Partition tensors for MMA and make fragments - # (bM, bK, RestK) - gA = cute.local_tile(mA_mkl, mma_tiler_mnk, mma_coord_mnk, proj=(1, None, 1)) - # (bN, bK, RestK) - gB = cute.local_tile(mB_nkl, mma_tiler_mnk, mma_coord_mnk, proj=(None, 1, 1)) - # (bM, bN) - gC = cute.local_tile(mC_mnl, mma_tiler_mnk, mma_coord_mnk, proj=(1, 1, None)) - thr_mma = tiled_mma.get_slice(0) - # (MMA, MMA_M, MMA_K) - tCgA = thr_mma.partition_A(gA) - # (MMA, MMA_N, MMA_K) - tCgB = thr_mma.partition_B(gB) - # (MMA, MMA_M, MMA_N) - tCgC = thr_mma.partition_C(gC) - # (MMA, MMA_M, MMA_K) - tCrA = tiled_mma.make_fragment_A(sA) - # (MMA, MMA_N, MMA_K) - tCrB = 
tiled_mma.make_fragment_B(sB) - # (MMA, MMA_M, MMA_N) - acc_shape = tiled_mma.partition_shape_C(mma_tiler_mnk[:2]) - # (MMA, MMA_M, MMA_N) - tCtAcc = tiled_mma.make_fragment_C(acc_shape) - # Partition tensors for TMA; This requires the tensors partitioned for MMA - tAsA, tAgA = cute.nvgpu.cpasync.tma_partition( - tma_atom_a, - 0, - cute.make_layout(1), - cute.group_modes(sA, 0, 3), - cute.group_modes(tCgA, 0, 3), - ) - tBsB, tBgB = cute.nvgpu.cpasync.tma_partition( - tma_atom_b, - 0, - cute.make_layout(1), - cute.group_modes(sB, 0, 3), - cute.group_modes(tCgB, 0, 3), - ) - - # CTA-wide sync before retrieving the pointer to the start of the allocated TMEM - # Only warp 0 does the allocation so we need to sync before retrieving the TMEM start address - if warp_idx == 0: - cute.arch.alloc_tmem(512, storage.tmem_holding_buf) - cute.arch.barrier() - tmem_ptr = cute.arch.retrieve_tmem_ptr( - acc_dtype, - alignment=16, - ptr_to_buffer_holding_addr=storage.tmem_holding_buf, - ) - - # Swap the pointer in tCtAcc - tCtAcc = cute.make_tensor(tmem_ptr, tCtAcc.layout) - - subtile_cnt = 4 - # (EpiTile) - epi_tiler = ( - (cute.size(tCtAcc, mode=[0, 0]), cute.size(tCtAcc, mode=[0, 1]) // subtile_cnt), - ) - # (EpiTile, NumTiles) - tCtAcc_epi = cute.zipped_divide(tCtAcc, epi_tiler) - # (EpiTile, NumTiles) - gC_epi = cute.zipped_divide(tCgC, epi_tiler) - - # Every thread loads 32x128 bits - tmem_atom = cute.make_copy_atom( - tcgen05.Ld32x32bOp(tcgen05.Repetition.x64), - cutlass.Float32, - ) - tmem_tiled_copy = tcgen05.make_tmem_copy(tmem_atom, tCtAcc_epi[None, 0]) - tmem_thr_copy = tmem_tiled_copy.get_slice(tidx) - - # (TmemCpy,NumTmemCpy,NumTiles) - tDtC = tmem_thr_copy.partition_S(tCtAcc_epi) - # (TmemCpy,NumTmemCpy,NumTiles) - tDgC = tmem_thr_copy.partition_D(gC_epi) - - # (TmemCpy,NumTmemCpy) - tCrAcc = cute.make_fragment(tDgC[None, None, 0].shape, acc_dtype) - # (TmemCpy,NumTmemCpy) - tCrC = cute.make_fragment(tDgC[None, None, 0].shape, io_dtype) - - # - # 2. Main loop - # - num_k_tiles = cute.size(gA, mode=[2]) - if warp_idx == 0: - # Wait for a empty accumulator buffer - acc_empty = acc_producer.acquire_and_advance() - for k_tile_idx in cutlass.range(num_k_tiles, prefetch_stages=ab_stages - 2): - # Issue TMA loads - ab_empty = ab_producer.acquire_and_advance() - cute.copy( - tma_atom_a, - tAgA[(None, ab_empty.count)], - tAsA[(None, ab_empty.index)], - tma_bar_ptr=ab_empty.barrier, - ) - cute.copy( - tma_atom_b, - tBgB[(None, ab_empty.count)], - tBsB[(None, ab_empty.index)], - tma_bar_ptr=ab_empty.barrier, - ) - - # Execute one K-block worth of MMA instructions - ab_full = ab_consumer.wait_and_advance() - num_k_blocks = cute.size(tCrA, mode=[2]) - for k_block_idx in cutlass.range_constexpr(num_k_blocks): - k_block_coord = (None, None, k_block_idx, ab_full.index) - cute.gemm( - tiled_mma, - tCtAcc, - tCrA[k_block_coord], - tCrB[k_block_coord], - tCtAcc, - ) - tiled_mma.set(tcgen05.Field.ACCUMULATE, True) - - # Signal that the A/B buffers have been consumed and are ready for the next load - ab_full.release() - - # Signal that the accumulator is fully computed - acc_empty.commit() - - # - # 3. 
Epilogue - # - - # Release TMEM allocation lock - if warp_idx == 0: - cute.arch.relinquish_tmem_alloc_permit() - - # Wait for the accumulator buffer to be full - acc_full = acc_consumer.wait_and_advance() - - # TMEM -> RMEM -> GEMM - # Sub-tiling for better instruction-level parallelism - for i in cutlass.range(cute.size(tDtC, mode=[2])): - cute.copy(tmem_tiled_copy, tDtC[None, None, i], tCrAcc) - tCrC.store(tCrAcc.load().to(io_dtype)) - cute.autovec_copy(tCrC, tDgC[None, None, i]) - - acc_full.release() - - # Deallocate TMEM - cute.arch.barrier() - if warp_idx == 0: - cute.arch.dealloc_tmem(tmem_ptr, 512) - - -@cute.jit def ref_kernel( - a: cute.Tensor, - b: cute.Tensor, - c: cute.Tensor, -): - # Construct tiled MMA - op = tcgen05.MmaF16BF16Op( - io_dtype, - acc_dtype, - mma_inst_shape_mnk, - tcgen05.CtaGroup.ONE, - tcgen05.OperandSource.SMEM, - tcgen05.OperandMajorMode.K, - tcgen05.OperandMajorMode.K, - ) - tiled_mma = cute.make_tiled_mma(op) - - # Construct SMEM layouts for A and B - a_smem_layout = sm100_utils.make_smem_layout_a( - tiled_mma, - mma_tiler_mnk, - a.element_type, - ab_stages, - ) - b_smem_layout = sm100_utils.make_smem_layout_b( - tiled_mma, - mma_tiler_mnk, - b.element_type, - ab_stages, - ) - a_smem_layout_one_stage = cute.select(a_smem_layout, mode=[0, 1, 2]) - b_smem_layout_one_stage = cute.select(b_smem_layout, mode=[0, 1, 2]) - - # Construct TMA load atoms - op = cute.nvgpu.cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.ONE) - a_tma_atom, a_tma_tensor = cute.nvgpu.make_tiled_tma_atom_A( - op, - a, - a_smem_layout_one_stage, - mma_tiler_mnk, - tiled_mma, - (1, 1, 1), - ) - b_tma_atom, b_tma_tensor = cute.nvgpu.make_tiled_tma_atom_B( - op, - b, - b_smem_layout_one_stage, - mma_tiler_mnk, - tiled_mma, - (1, 1, 1), - ) - - # Launch the kernel - grid_shape = cute.ceil_div((*c.layout.shape, 1), mma_tiler_mnk[:2]) - kernel( - tiled_mma, - a_tma_atom, - a_tma_tensor, - b_tma_atom, - b_tma_tensor, - c, - a_smem_layout, - b_smem_layout, - ).launch( - grid=grid_shape, - block=(threads_per_cta, 1, 1), - ) + data: input_t, +)->output_t: + a, b, c = data + # call torch matmul operation + c = torch.einsum("mk,nk->mn", a, b) + return c def generate_input( @@ -347,49 +17,15 @@ def generate_input( n: int, k: int, seed: int, -): +) -> input_t: torch.manual_seed(seed) - # Make K-major tensors (torch tensors are row-major) - def make_tensors(mn, k, dtype): - shape = (mn, k) - return ( - torch.empty(*shape, dtype=torch.int32) - .random_(-2, 2) - .to(dtype=dtype, device="cuda") - ) - - a = make_tensors(m, k, cutlass_torch.dtype(io_dtype)) - b = make_tensors(n, k, cutlass_torch.dtype(io_dtype)) - c = make_tensors(m, n, cutlass_torch.dtype(io_dtype)) - a_tensor = ( - from_dlpack(a, assumed_align=32) - .mark_layout_dynamic(leading_dim=1) - .mark_compact_shape_dynamic(mode=1, divisibility=k) - ) - b_tensor = ( - from_dlpack(b, assumed_align=32) - .mark_layout_dynamic(leading_dim=1) - .mark_compact_shape_dynamic(mode=1, divisibility=k) - ) - c_tensor = ( - from_dlpack(c, assumed_align=32) - .mark_layout_dynamic(leading_dim=1) - .mark_compact_shape_dynamic(mode=1, divisibility=n) - ) - - # Compute reference result and verify - ref = (torch.einsum("mk,nk->mn", a.to(torch.float32), b.to(torch.float32))).cpu() - - return a_tensor, b_tensor, c_tensor, c, ref + # Generate a, b and c tensors + a = torch.empty(m, k, dtype=torch.float16).random_(-2, 2).to(device="cuda") + b = torch.empty(n, k, dtype=torch.float16).random_(-2, 2).to(device="cuda") + c = torch.empty(m, n, 
dtype=torch.float16).random_(-2, 2).to(device="cuda") + return (a, b, c) -def check_implementation(ref, c_tensor): - torch.testing.assert_close( - c_tensor.cpu(), ref.to(cutlass_torch.dtype(io_dtype)), atol=1e-01, rtol=1e-05 - ) - print("PASS") -a_tensor, b_tensor, c_tensor, c, ref = generate_input(m=1024, n=1024, k=1024, seed=1111) -ref_kernel(a_tensor, b_tensor, c_tensor) -check_implementation(ref, c) \ No newline at end of file +def check_implementation(ref, rtol=2e-02, atol=1e-03) \ No newline at end of file diff --git a/problems/nvidia/fp16_gemm/submission.py b/problems/nvidia/fp16_gemm/submission.py index 32e57057..68e8560f 100644 --- a/problems/nvidia/fp16_gemm/submission.py +++ b/problems/nvidia/fp16_gemm/submission.py @@ -1,5 +1,7 @@ import argparse import torch +from task import input_t, output_t +from utils import make_match_reference from typing import Tuple import cutlass @@ -11,7 +13,7 @@ import cutlass.utils.blackwell_helpers as sm100_utils from cutlass.cute.runtime import from_dlpack - +# Kernel configuration io_dtype = cutlass.Float16 acc_dtype = cutlass.Float32 mma_inst_shape_mnk = (128, 256, 16) @@ -22,9 +24,229 @@ ab_stages = 4 acc_stage = 1 +@cute.struct +class SharedStorage: + ab_mbar_ptr: cute.struct.MemRange[cutlass.Int64, ab_stages * 2] + acc_mbar_ptr: cute.struct.MemRange[cutlass.Int64, acc_stage * 2] + tmem_holding_buf: cutlass.Int32 + + +# The naive CuTe implementation of a fp16 matmul +@cute.kernel +def kernel( + tiled_mma: cute.TiledMma, + tma_atom_a: cute.CopyAtom, + mA_mkl: cute.Tensor, + tma_atom_b: cute.CopyAtom, + mB_nkl: cute.Tensor, + mC_mnl: cute.Tensor, + a_smem_layout: cute.ComposedLayout, + b_smem_layout: cute.ComposedLayout, +): + # Current thread/warp/block coordinates + tidx, _, _ = cute.arch.thread_idx() + warp_idx = cute.arch.warp_idx() + warp_idx = cute.arch.make_warp_uniform(warp_idx) + bidx, bidy, _ = cute.arch.block_idx() + mma_coord_mnk = (bidx, bidy, None) + + # + # 1. 
Prepare args + # + + # Allocate SMEM + smem = cutlass.utils.SmemAllocator() + storage = smem.allocate(SharedStorage) + sA = smem.allocate_tensor( + element_type=io_dtype, + layout=a_smem_layout.outer, + byte_alignment=128, + swizzle=a_smem_layout.inner, + ) + sB = smem.allocate_tensor( + element_type=io_dtype, + layout=b_smem_layout.outer, + byte_alignment=128, + swizzle=b_smem_layout.inner, + ) + + # Prefetch tma descriptor + if warp_idx == 0: + cpasync.prefetch_descriptor(tma_atom_a) + cpasync.prefetch_descriptor(tma_atom_b) + + # Pipeline configuration + num_tma_copy_bytes = cute.size_in_bytes( + io_dtype, cute.select(a_smem_layout, mode=[0, 1, 2]) + ) + cute.size_in_bytes(io_dtype, cute.select(b_smem_layout, mode=[0, 1, 2])) + ab_producer, ab_consumer = pipeline.PipelineTmaUmma.create( + num_stages=ab_stages, + producer_group=pipeline.CooperativeGroup(pipeline.Agent.Thread), + consumer_group=pipeline.CooperativeGroup(pipeline.Agent.Thread), + tx_count=num_tma_copy_bytes, + barrier_storage=storage.ab_mbar_ptr.data_ptr(), + ).make_participants() + acc_producer, acc_consumer = pipeline.PipelineUmmaAsync.create( + num_stages=acc_stage, + producer_group=pipeline.CooperativeGroup(pipeline.Agent.Thread), + consumer_group=pipeline.CooperativeGroup( + pipeline.Agent.Thread, threads_per_cta, threads_per_cta + ), + barrier_storage=storage.acc_mbar_ptr.data_ptr(), + ).make_participants() + + # Partition tensors for MMA and make fragments + # (bM, bK, RestK) + gA = cute.local_tile(mA_mkl, mma_tiler_mnk, mma_coord_mnk, proj=(1, None, 1)) + # (bN, bK, RestK) + gB = cute.local_tile(mB_nkl, mma_tiler_mnk, mma_coord_mnk, proj=(None, 1, 1)) + # (bM, bN) + gC = cute.local_tile(mC_mnl, mma_tiler_mnk, mma_coord_mnk, proj=(1, 1, None)) + thr_mma = tiled_mma.get_slice(0) + # (MMA, MMA_M, MMA_K) + tCgA = thr_mma.partition_A(gA) + # (MMA, MMA_N, MMA_K) + tCgB = thr_mma.partition_B(gB) + # (MMA, MMA_M, MMA_N) + tCgC = thr_mma.partition_C(gC) + # (MMA, MMA_M, MMA_K) + tCrA = tiled_mma.make_fragment_A(sA) + # (MMA, MMA_N, MMA_K) + tCrB = tiled_mma.make_fragment_B(sB) + # (MMA, MMA_M, MMA_N) + acc_shape = tiled_mma.partition_shape_C(mma_tiler_mnk[:2]) + # (MMA, MMA_M, MMA_N) + tCtAcc = tiled_mma.make_fragment_C(acc_shape) + # Partition tensors for TMA; This requires the tensors partitioned for MMA + tAsA, tAgA = cute.nvgpu.cpasync.tma_partition( + tma_atom_a, + 0, + cute.make_layout(1), + cute.group_modes(sA, 0, 3), + cute.group_modes(tCgA, 0, 3), + ) + tBsB, tBgB = cute.nvgpu.cpasync.tma_partition( + tma_atom_b, + 0, + cute.make_layout(1), + cute.group_modes(sB, 0, 3), + cute.group_modes(tCgB, 0, 3), + ) + + # Allocate all TMEM columns + # CTA-wide sync before retrieving the pointer to the start of the allocated TMEM + # Only warp 0 does the allocation so we need to sync before retrieving the TMEM start address + if warp_idx == 0: + cute.arch.alloc_tmem(512, storage.tmem_holding_buf) + cute.arch.barrier() + tmem_ptr = cute.arch.retrieve_tmem_ptr( + acc_dtype, + alignment=16, + ptr_to_buffer_holding_addr=storage.tmem_holding_buf, + ) + + # Create an accumulator Tensor + tCtAcc = cute.make_tensor(tmem_ptr, tCtAcc.layout) + + subtile_cnt = 4 + # (EpiTile) + epi_tiler = ( + (cute.size(tCtAcc, mode=[0, 0]), cute.size(tCtAcc, mode=[0, 1]) // subtile_cnt), + ) + # (EpiTile, NumTiles) + tCtAcc_epi = cute.zipped_divide(tCtAcc, epi_tiler) + # (EpiTile, NumTiles) + gC_epi = cute.zipped_divide(tCgC, epi_tiler) + + # Every thread loads 32x128 bits + tmem_atom = cute.make_copy_atom( + tcgen05.Ld32x32bOp(tcgen05.Repetition.x64), 
+ cutlass.Float32, + ) + tmem_tiled_copy = tcgen05.make_tmem_copy(tmem_atom, tCtAcc_epi[None, 0]) + tmem_thr_copy = tmem_tiled_copy.get_slice(tidx) + + # (TmemCpy,NumTmemCpy,NumTiles) + tDtC = tmem_thr_copy.partition_S(tCtAcc_epi) + # (TmemCpy,NumTmemCpy,NumTiles) + tDgC = tmem_thr_copy.partition_D(gC_epi) + + # (TmemCpy,NumTmemCpy) + tCrAcc = cute.make_fragment(tDgC[None, None, 0].shape, acc_dtype) + # (TmemCpy,NumTmemCpy) + tCrC = cute.make_fragment(tDgC[None, None, 0].shape, io_dtype) + + # + # 2. Main loop + # + num_k_tiles = cute.size(gA, mode=[2]) + if warp_idx == 0: + # Wait for a empty accumulator buffer + acc_empty = acc_producer.acquire_and_advance() + for k_tile_idx in cutlass.range(num_k_tiles, prefetch_stages=ab_stages - 2): + # Issue TMA loads + ab_empty = ab_producer.acquire_and_advance() + cute.copy( + tma_atom_a, + tAgA[(None, ab_empty.count)], + tAsA[(None, ab_empty.index)], + tma_bar_ptr=ab_empty.barrier, + ) + cute.copy( + tma_atom_b, + tBgB[(None, ab_empty.count)], + tBsB[(None, ab_empty.index)], + tma_bar_ptr=ab_empty.barrier, + ) + + # Execute one K-block worth of MMA instructions + ab_full = ab_consumer.wait_and_advance() + num_k_blocks = cute.size(tCrA, mode=[2]) + for k_block_idx in cutlass.range_constexpr(num_k_blocks): + k_block_coord = (None, None, k_block_idx, ab_full.index) + cute.gemm( + tiled_mma, + tCtAcc, + tCrA[k_block_coord], + tCrB[k_block_coord], + tCtAcc, + ) + tiled_mma.set(tcgen05.Field.ACCUMULATE, True) + + # Signal that the A/B buffers have been consumed and are ready for the next load + ab_full.release() + + # Signal that the accumulator is fully computed + acc_empty.commit() + + # + # 3. Epilogue + # + + # Release TMEM allocation lock + if warp_idx == 0: + cute.arch.relinquish_tmem_alloc_permit() + + # Wait for the accumulator buffer to be full + acc_full = acc_consumer.wait_and_advance() + + # TMEM -> RMEM -> GEMM + # Sub-tiling for better instruction-level parallelism + for i in cutlass.range(cute.size(tDtC, mode=[2])): + cute.copy(tmem_tiled_copy, tDtC[None, None, i], tCrAcc) + tCrC.store(tCrAcc.load().to(io_dtype)) + cute.autovec_copy(tCrC, tDgC[None, None, i]) + + acc_full.release() + + # Deallocate TMEM + cute.arch.barrier() + if warp_idx == 0: + cute.arch.dealloc_tmem(tmem_ptr, 512) + @cute.jit -def custom_kernel( +def my_kernel( a: cute.Tensor, b: cute.Tensor, c: cute.Tensor, @@ -59,6 +281,7 @@ def custom_kernel( # Construct TMA load atoms op = cute.nvgpu.cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.ONE) + # Construct TMA load atoms and Tensors for A and B respectively a_tma_atom, a_tma_tensor = cute.nvgpu.make_tiled_tma_atom_A( op, a, @@ -76,11 +299,11 @@ def custom_kernel( (1, 1, 1), ) - # Launch the kernel + # Calculate the grid shape grid_shape = cute.ceil_div((*c.layout.shape, 1), mma_tiler_mnk[:2]) - # Implement your kernel - my_kernel( + # Launch the kernel + kernel( tiled_mma, a_tma_atom, a_tma_tensor, @@ -94,3 +317,26 @@ def custom_kernel( block=(threads_per_cta, 1, 1), ) +def customized_kernel(data: input_t): + # Get input tensors + a, b, c = data + k = a.shape[1] + n = b.shape[1] + # Convert torch tensors to CuTe tensors via dlpack protocol + a_tensor = ( + from_dlpack(a, assumed_align=32) + .mark_layout_dynamic(leading_dim=1) + .mark_compact_shape_dynamic(mode=1, divisibility=k) + ) + b_tensor = ( + from_dlpack(b, assumed_align=32) + .mark_layout_dynamic(leading_dim=1) + .mark_compact_shape_dynamic(mode=1, divisibility=k) + ) + c_tensor = ( + from_dlpack(c, assumed_align=32) + .mark_layout_dynamic(leading_dim=1) + 
.mark_compact_shape_dynamic(mode=1, divisibility=n) + ) + my_kernel(a_tensor, b_tensor, c_tensor) + From 2cfc4d1df4bc31f23721af03bdeb10a4ba45bad4 Mon Sep 17 00:00:00 2001 From: vickiw973 Date: Thu, 2 Oct 2025 13:15:49 +0800 Subject: [PATCH 03/13] fix test problems --- problems/nvidia/fp16_gemm/task.yml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/problems/nvidia/fp16_gemm/task.yml b/problems/nvidia/fp16_gemm/task.yml index 96ef86d7..0341ad4e 100644 --- a/problems/nvidia/fp16_gemm/task.yml +++ b/problems/nvidia/fp16_gemm/task.yml @@ -23,7 +23,7 @@ description: | * `b` is N x K in column-major order in fp16 * `c` is M x N in row-major order in fp16 - Matrix sizes `M` and `N` are divisible by 64, `K` is divisible by 64. + Matrix sizes `M` and `N` are divisible by mma_tiler_mnk defined in the kernel, `K` is divisible by 64. The ranking criteria is the geometric mean of the benchmark results. @@ -45,15 +45,15 @@ templates: HIP: "template-hip.py" tests: - - {"m": 64, "n": 64, "k": 64, "seed": 1111} - - {"m": 64, "n": 1536, "k": 7168, "seed": 1111} - - {"m": 64, "n": 3072, "k": 1536, "seed": 1111} - - {"m": 64, "n": 576, "k": 7168, "seed": 1111} - - {"m": 96, "n": 7168, "k": 256, "seed": 1111} - - {"m": 96, "n": 7168, "k": 2048, "seed": 1111} - - {"m": 96, "n": 4608, "k": 7168, "seed": 1111} - - {"m": 128, "n": 7168, "k": 2304, "seed": 1111} - - {"m": 128, "n": 512, "k": 7168, "seed": 1111} + - {"m": 128, "n": 256, "k": 64, "seed": 1111} + - {"m": 128, "n": 1536, "k": 7168, "seed": 1111} + - {"m": 128, "n": 3072, "k": 1536, "seed": 1111} + - {"m": 256, "n": 576, "k": 7168, "seed": 1111} + - {"m": 256, "n": 7168, "k": 256, "seed": 1111} + - {"m": 256, "n": 7168, "k": 2048, "seed": 1111} + - {"m": 2384, "n": 4608, "k": 7168, "seed": 1111} + - {"m": 384, "n": 7168, "k": 2304, "seed": 1111} + - {"m": 512, "n": 512, "k": 7168, "seed": 1111} - {"m": 512, "n": 4096, "k": 512, "seed": 1111} - {"m": 512, "n": 1536, "k": 7168, "seed": 1111} From 42cf0f8232fee38cc40faf5e6f6906350433864a Mon Sep 17 00:00:00 2001 From: vickiw973 Date: Thu, 2 Oct 2025 13:17:26 +0800 Subject: [PATCH 04/13] change to torch tensor. --- problems/nvidia/fp16_gemm/task.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/problems/nvidia/fp16_gemm/task.py b/problems/nvidia/fp16_gemm/task.py index 769c5546..01a484a7 100644 --- a/problems/nvidia/fp16_gemm/task.py +++ b/problems/nvidia/fp16_gemm/task.py @@ -5,11 +5,11 @@ import cutlass import cutlass.cute as cute -input_t = TypeVar("input_t", bound=tuple[cute.Tensor, cute.Tensor, cute.Tensor]) -output_t = TypeVar("output_t", bound=cute.Tensor) +input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor, torch.Tensor]) +output_t = TypeVar("output_t", bound=torch.Tensor) class TestSpec(TypedDict): m: int n: int k: int - seed: int \ No newline at end of file + seed: int From 0d6fc01158c91ee87ea2a48a94457b9694a66c34 Mon Sep 17 00:00:00 2001 From: Vicki Wang Date: Mon, 6 Oct 2025 21:10:16 -0700 Subject: [PATCH 05/13] fix test error. 
--- problems/nvidia/eval.py | 381 ++++++++++++++++++++++++ problems/nvidia/fp16_gemm/reference.py | 8 +- problems/nvidia/fp16_gemm/submission.py | 3 +- problems/nvidia/fp16_gemm/task.yml | 5 +- problems/nvidia/fp16_gemm/template.py | 20 ++ problems/nvidia/utils.py | 118 ++++++++ 6 files changed, 527 insertions(+), 8 deletions(-) create mode 100644 problems/nvidia/eval.py create mode 100644 problems/nvidia/fp16_gemm/template.py create mode 100644 problems/nvidia/utils.py diff --git a/problems/nvidia/eval.py b/problems/nvidia/eval.py new file mode 100644 index 00000000..ac3a6325 --- /dev/null +++ b/problems/nvidia/eval.py @@ -0,0 +1,381 @@ +import base64 +import dataclasses +import multiprocessing +import re +import time +import os +import sys +import math +from pathlib import Path +from typing import Any, Optional + +import torch.cuda + +from utils import set_seed +try: + from task import TestSpec +except ImportError: + TestSpec = dict + +from reference import check_implementation, generate_input + + +class PopcornOutput: + def __init__(self, fd: int): + self.file = os.fdopen(fd, 'w') + os.set_inheritable(fd, False) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.file.close() + + def print(self, *args, **kwargs): + print(*args, **kwargs, file=self.file, flush=True) + + def log(self, key, value): + self.print(f"{key}: {value}") + + +@dataclasses.dataclass +class TestCase: + args: dict + spec: str + + +def _combine(a: int, b: int) -> int: + # combine two integers into one: + # we need this to generate a secret seed based on the test-level seed and + # the global secret seed. + # the test-level seeds are public knowledge, and typically relatively small numbers, + # so we need to make sure they don't provide any useful info for the full seed. + # This Cantor construction ensures that if the secret seed is a large number, + # then so is the overall seed. + return int(a + (a+b)*(a+b+1)//2) + + +def get_test_cases(file_name: str, seed: Optional[int]) -> list[TestCase]: + try: + content = Path(file_name).read_text() + except Exception as E: + print(f"Could not open test file`{file_name}`: {E}", file=sys.stderr) + exit(113) + + tests = [] + lines = content.splitlines() + match = r"\s*([a-zA-Z]+):\s*([a-zA-Z]+|[+-]?[0-9]+)\s*" + for line in lines: + parts = line.split(";") + case = {} + for part in parts: + matched = re.match(match, part) + if not re.fullmatch(match, part): + print(f"invalid test case: '{line}': '{part}'", file=sys.stderr) + exit(113) + key = matched[1] + val = matched[2] + try: + val = int(val) + except ValueError: + pass + + case[key] = val + tests.append(TestCase(spec=line, args=case)) + + if seed is not None: + for test in tests: + if "seed" in test.args: + test.args["seed"] = _combine(test.args["seed"], seed) + + return tests + + +@dataclasses.dataclass +class Stats: + runs: int + mean: float + std: float + err: float + best: float + worst: float + + +def calculate_stats(durations: list[int]): + """ + Calculate statistical data from a list of durations. + + @param durations: A list of durations in nanoseconds. + @return: A Stats object containing the number of runs, mean, standard deviation, error, best, and worst durations. 
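+
+    For example (illustrative values, not taken from a real run): durations=[10, 12, 11]
+    gives runs=3, mean=11.0, std=1.0, err≈0.577, best=10.0, worst=12.0.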
+ """ + runs = len(durations) + total = sum(durations) + best = min(durations) + worst = max(durations) + + avg = total / runs + variance = sum(map(lambda x: (x - avg)**2, durations)) + std = math.sqrt(variance / (runs - 1)) + err = std / math.sqrt(runs) + + return Stats(runs=runs, mean=avg, std=std, err=err, best=float(best), + worst=float(worst)) + + +def _clone_data(data): + """ + Recursively goes through data and clones all tensors. + """ + if isinstance(data, tuple): + return tuple(_clone_data(x) for x in data) + elif isinstance(data, list): + return [_clone_data(x) for x in data] + elif isinstance(data, dict): + return {k: _clone_data(v) for k, v in data.items()} + elif isinstance(data, torch.Tensor): + return data.clone() + else: + return data + + +def wrap_check_implementation(data, submission_output): + # Old version returned just a single string, new version + # returns (bool, str); this function ensures compatibility with old + # problem definitions. + result = check_implementation(data, submission_output) + if isinstance(result, tuple): + return result + else: + return not bool(result), result + + +def _run_single_test(test: TestCase): + """ + Runs a single test case. Do not call directly + """ + from submission import custom_kernel + data = generate_input(**test.args) + torch.cuda.synchronize() + submission_output = custom_kernel(_clone_data(data)) + torch.cuda.synchronize() + return wrap_check_implementation(data, submission_output) + + +def run_single_test(pool: multiprocessing.Pool, test: TestCase): + """ + Runs a single test in another process. + """ + return pool.apply(_run_single_test, (test,)) + + +def run_testing(logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]): + """ + Executes the actual test case code and checks for correctness. + + @param logger: A PopcornOutput object used for logging test results. + @param tests: A list of TestCase objects representing the test cases to be executed. + @return: An integer representing the exit status: 0 if all tests pass, otherwise 112. + """ + passed = True + logger.log("test-count", len(tests)) + for idx, test in enumerate(tests): + logger.log(f"test.{idx}.spec", test.spec) + good, message = run_single_test(pool, test) + if not good: + logger.log(f"test.{idx}.status", "fail") + logger.log(f"test.{idx}.error", message) + passed = False + else: + logger.log(f"test.{idx}.status", "pass") + if message: + logger.log(f"test.{idx}.message", message) + + if passed: + logger.log("check", "pass") + return 0 + else: + logger.log("check", "fail") + return 112 + + +def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float) -> Stats | Any: + """ + Runs one benchmark. Do not call directly. + """ + from submission import custom_kernel + + durations = [] + # generate input data once + data = generate_input(**test.args) + check_copy = _clone_data(data) + # first, one obligatory correctness check + output = custom_kernel(data) + good, message = wrap_check_implementation(check_copy, output) + if not good: + return message + + # now, do multiple timing runs without further correctness testing + # there is an upper bound of 100 runs, and a lower bound of 3 runs; + # otherwise, we repeat until we either measure at least 10 full seconds, + # or the relative error of the mean is below 1%. 
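+    # (Concretely, the loop below breaks once the relative error of the mean drops
+    # below 0.1%, the accumulated measured time exceeds max_time_ns, or two minutes
+    # of wallclock benchmarking time have elapsed.)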
+ + bm_start_time = time.perf_counter_ns() + for i in range(max_repeats): + if recheck: + # ensure we use a different seed for every benchmark + if "seed" in test.args: + test.args["seed"] += 13 + + data = generate_input(**test.args) + check_copy = _clone_data(data) + torch.cuda.synchronize() + start = time.perf_counter_ns() + output = custom_kernel(data) + torch.cuda.synchronize() + end = time.perf_counter_ns() + + if recheck: + good, message = check_implementation(check_copy, output) + if not good: + return message + + del output + durations.append(end - start) + + if i > 1: + total_bm_duration = time.perf_counter_ns() - bm_start_time + stats = calculate_stats(durations) + # stop if either + # a) relative error dips below 0.1% + # b) we exceed the total time limit for benchmarking the kernel + # c) we exceed 2 minutes of total wallclock time. + if stats.err / stats.mean < 0.001 or stats.mean * stats.runs > max_time_ns or total_bm_duration > 120e9: + break + + return calculate_stats(durations) + + +def run_single_benchmark(pool: multiprocessing.Pool, test: TestCase, recheck: bool, max_repeats: int, + max_time_ns: float): + """ + For a particular test case, check correctness (if applicable) and grab runtime results. + + @param pool: Process on which the benchmark will be launched. + @param test: TestCase object. + @param recheck: Flag for whether to explicitly check functional correctness. + @param max_repeats: Number of trials to repeat. + @param max_time_ns: Timeout time in nanoseconds. + @return: A Stats object for this particular benchmark case or an error if the test fails. + """ + return pool.apply(_run_single_benchmark, (test, recheck, max_repeats, max_time_ns)) + + +def run_benchmarking(logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]): + """ + Executes benchmarking code for a CUDA Kernel and logs runtimes. + + @param logger: A PopcornOutput object used for logging benchmark results. + @param pool: Process on which the benchmarks will be launched. + @param tests: A list of TestCase objects representing the test cases to be benchmarked. + @return: An integer representing the exit status: 0 if all benchmarks pass, otherwise 112. + """ + # warm up + run_single_benchmark(pool, tests[0], False, 100, 10e7) + + passed = True + logger.log("benchmark-count", len(tests)) + for idx, test in enumerate(tests): + logger.log(f"benchmark.{idx}.spec", test.spec) + result = run_single_benchmark(pool, test, False, 100, 10e9) + if isinstance(result, Stats): + for field in dataclasses.fields(Stats): + logger.log(f"benchmark.{idx}.{field.name}", getattr(result, field.name)) + else: + passed = False + logger.log(f"benchmark.{idx}.status", "fail") + logger.log(f"benchmark.{idx}.error", result) + + if passed: + logger.log("check", "pass") + return 0 + else: + logger.log("check", "fail") + return 112 + + +def run_single_profile(test: TestCase) -> str: + """ + Runs a single test case. 
Do not call directly + """ + from submission import custom_kernel + from torch.profiler import profile, record_function, ProfilerActivity + data = generate_input(**test.args) + torch.cuda.synchronize() + + with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof: + submission_output = custom_kernel(_clone_data(data)) + torch.cuda.synchronize() + return prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=20) + + +def run_profiling(logger: PopcornOutput, tests: list[TestCase]): + logger.log("benchmark-count", len(tests)) + for idx, test in enumerate(tests): + logger.log(f"benchmark.{idx}.spec", test.spec) + report = run_single_profile(test) + logger.log(f"benchmark.{idx}.report", base64.b64encode(report.encode("utf-8"), b"+*").decode("utf-8")) + logger.log("check", "pass") + return 0 + + +def main(): + fd = os.getenv("POPCORN_FD") + if not fd: + return 111 + + if len(sys.argv) < 3: + return 2 + + mode = sys.argv[1] + seed = os.getenv("POPCORN_SEED") + os.unsetenv("POPCORN_SEED") + seed = int(seed) if seed else None + set_seed(seed or 42) + tests = get_test_cases(sys.argv[2], seed) + + with PopcornOutput(int(fd)) as logger: + import multiprocessing + mp_context = multiprocessing.get_context('spawn') + with mp_context.Pool(1) as pool: + if mode == "test": + return run_testing(logger, pool, tests) + if mode == "benchmark": + return run_benchmarking(logger, pool, tests) + + if mode == "leaderboard": + # warmup + run_single_benchmark(pool, tests[0], False, 100, 1e7) + logger.log("benchmark-count", len(tests)) + passed = True + for i in range(len(tests)): + result = run_single_benchmark(pool, tests[i], True, 100, 30e9) + logger.log(f"benchmark.{i}.spec", tests[i].spec) + if isinstance(result, Stats): + for field in dataclasses.fields(Stats): + logger.log(f"benchmark.{i}.{field.name}", getattr(result, field.name)) + else: + passed = False + logger.log(f"benchmark.{i}.status", "fail") + logger.log(f"benchmark.{i}.error", str(result)) # TODO: Make sure result implements __str__? 
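+                        # Benchmarking stops at the first failing case; the remaining tests are not timed.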
+ break + + logger.log("check", "pass" if passed else "fail") + elif mode == "profile": + run_profiling(logger, tests) + else: + # TODO: Implement script mode + return 2 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/problems/nvidia/fp16_gemm/reference.py b/problems/nvidia/fp16_gemm/reference.py index d2f18694..127a70d7 100644 --- a/problems/nvidia/fp16_gemm/reference.py +++ b/problems/nvidia/fp16_gemm/reference.py @@ -6,10 +6,10 @@ def ref_kernel( data: input_t, )->output_t: - a, b, c = data + a, b, _ = data # call torch matmul operation - c = torch.einsum("mk,nk->mn", a, b) - return c + ref = torch.einsum("mk,nk->mn", a, b).cpu() + return ref def generate_input( @@ -28,4 +28,4 @@ def generate_input( return (a, b, c) -def check_implementation(ref, rtol=2e-02, atol=1e-03) \ No newline at end of file +check_implementation = make_match_reference(ref_kernel, rtol=1e-01, atol=1e-05) \ No newline at end of file diff --git a/problems/nvidia/fp16_gemm/submission.py b/problems/nvidia/fp16_gemm/submission.py index 68e8560f..c985ef14 100644 --- a/problems/nvidia/fp16_gemm/submission.py +++ b/problems/nvidia/fp16_gemm/submission.py @@ -317,7 +317,7 @@ def my_kernel( block=(threads_per_cta, 1, 1), ) -def customized_kernel(data: input_t): +def customized_kernel(data: input_t) -> output_t: # Get input tensors a, b, c = data k = a.shape[1] @@ -339,4 +339,5 @@ def customized_kernel(data: input_t): .mark_compact_shape_dynamic(mode=1, divisibility=n) ) my_kernel(a_tensor, b_tensor, c_tensor) + return c diff --git a/problems/nvidia/fp16_gemm/task.yml b/problems/nvidia/fp16_gemm/task.yml index 0341ad4e..a2244bec 100644 --- a/problems/nvidia/fp16_gemm/task.yml +++ b/problems/nvidia/fp16_gemm/task.yml @@ -1,5 +1,5 @@ -# name: fp8-matmul +# name: fp16-gemm files: - {"name": "submission.py", "source": "@SUBMISSION@"} @@ -12,7 +12,7 @@ lang: "py" description: | - You will implement a fp16 matmul kernel optimized for NVIDIA B200. + You will implement a fp16 matrix multiplication kernel optimized for NVIDIA B200. The shapes of tensors are from DeepSeek-R1. To be explicit, you will be given a tuple of tensors: ``` @@ -42,7 +42,6 @@ config: templates: Python: "template.py" - HIP: "template-hip.py" tests: - {"m": 128, "n": 256, "k": 64, "seed": 1111} diff --git a/problems/nvidia/fp16_gemm/template.py b/problems/nvidia/fp16_gemm/template.py new file mode 100644 index 00000000..e3149aa0 --- /dev/null +++ b/problems/nvidia/fp16_gemm/template.py @@ -0,0 +1,20 @@ +from task import input_t, output_t + + +def custom_kernel(data: input_t) -> output_t: + """ + Reference implementation of block-scale fp8 gemm + Args: + data: Tuple that expands to: + a: torch.Tensor[float8_e4m3fnuz] of shape [m, k], + b: torch.Tensor[float8_e4m3fnuz] of shape [n, k], + c: torch.Tensor[bfloat16] of shape [m, n] + Returns: + Tensor containing output in bf16 + """ + # c: [m, n] is pre-allocated memory to avoid timing allocation overhead. 
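+    # Illustrative sketch only (assuming the same input/output semantics as
+    # ref_kernel in reference.py): a naive, unoptimized fallback would be
+    #     c[...] = torch.einsum("mk,nk->mn", a, b)
+    # followed by `return c`; a competitive submission should replace this with a tuned kernel.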
+ a, b, c = data + + # Your implementation here + + return c diff --git a/problems/nvidia/utils.py b/problems/nvidia/utils.py new file mode 100644 index 00000000..8859b38b --- /dev/null +++ b/problems/nvidia/utils.py @@ -0,0 +1,118 @@ +from typing import Tuple +import torch + +# Adapted from https://github.com/linkedin/Liger-Kernel/blob/main/test/utils.py +@torch.no_grad() +def verbose_allclose( + received: torch.Tensor, + expected: torch.Tensor, + rtol=1e-05, + atol=1e-08, + max_print=5 +) -> Tuple[bool, list[str]]: + """ + Assert that two tensors are element-wise equal within a tolerance, providing detailed information about mismatches. + + Parameters: + received (torch.Tensor): Tensor we actually got. + expected (torch.Tensor): Tensor we expected to receive. + rtol (float): Relative tolerance; relative to expected + atol (float): Absolute tolerance. + max_print (int): Maximum number of mismatched elements to print. + """ + # Check if the shapes of the tensors match + if received.shape != expected.shape: + return False, ["SIZE MISMATCH"] + + # Calculate the difference between the tensors + diff = torch.abs(received.to(torch.float32) - expected.to(torch.float32)) + + # Determine the tolerance + tolerance = atol + rtol * torch.abs(expected) + + # Find tolerance mismatched elements + tol_mismatched = diff > tolerance + + # Find nan mismatched elements + nan_mismatched = torch.logical_xor(torch.isnan(received), torch.isnan(expected)) + + # Find +inf mismatched elements + posinf_mismatched = torch.logical_xor(torch.isposinf(received), torch.isposinf(expected)) + # Find -inf mismatched elements + neginf_mismatched = torch.logical_xor(torch.isneginf(received), torch.isneginf(expected)) + + # Find all mismatched elements + mismatched = torch.logical_or( + torch.logical_or(tol_mismatched, nan_mismatched), + torch.logical_or(posinf_mismatched, neginf_mismatched), + ) + + mismatched_indices = torch.nonzero(mismatched) + + # Count the number of mismatched elements + num_mismatched = mismatched.count_nonzero().item() + + # Generate detailed information if there are mismatches + if num_mismatched >= 1: + mismatch_details = [f"Number of mismatched elements: {num_mismatched}"] + + for index in mismatched_indices[:max_print]: + i = tuple(index.tolist()) + mismatch_details.append(f"ERROR at {i}: {received[i]} {expected[i]}") + if num_mismatched > max_print: + mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.") + return False, mismatch_details + + return True, [f"Maximum error: {torch.max(diff)}"] + + +@torch.no_grad() +def verbose_allequal(received: torch.Tensor, expected: torch.Tensor, max_print: int = 5) -> Tuple[bool, list[str]]: + """ + Assert that two tensors are element-wise perfectly equal, providing detailed information about mismatches. + + Parameters: + received (torch.Tensor): Tensor we actually got. + expected (torch.Tensor): Tensor we expected to receive. + max_print (int): Maximum number of mismatched elements to print. 
+ + Returns: + Empty string if tensors are equal, otherwise detailed error information + """ + mismatched = torch.not_equal(received, expected) + mismatched_indices = torch.nonzero(mismatched) + + # Count the number of mismatched elements + num_mismatched = mismatched.count_nonzero().item() + + # Generate detailed information if there are mismatches + if num_mismatched >= 1: + mismatch_details = [f"Number of mismatched elements: {num_mismatched}"] + + for index in mismatched_indices[:max_print]: + i = tuple(index.tolist()) + mismatch_details.append(f"ERROR at {i}: {received[i]} {expected[i]}") + if num_mismatched > max_print: + mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.") + return False, mismatch_details + + return True, [] + + +def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08): + """ + Convenient "default" implementation for tasks' `check_implementation` function. + """ + expected = reference(data) + good, reasons = verbose_allclose(output, expected, rtol=rtol, atol=atol) + + if len(reasons) > 0: + return good, "\\n".join(reasons) + + return good, '' + + +def make_match_reference(reference: callable, **kwargs): + def wrapped(data, output): + return match_reference(data, output, reference=reference, **kwargs) + return wrapped From 913a5ed2e98598f27443eeec9cac08bfb7c241a3 Mon Sep 17 00:00:00 2001 From: Vicki Wang Date: Tue, 7 Oct 2025 06:35:17 -0700 Subject: [PATCH 06/13] modify the example based on feedback. --- problems/nvidia/eval.py | 28 ++++---- problems/nvidia/fp16_gemm/reference.py | 6 +- problems/nvidia/fp16_gemm/submission.py | 8 ++- problems/nvidia/fp16_gemm/task.py | 7 +- problems/nvidia/fp16_gemm/task.yml | 1 - problems/nvidia/fp16_gemm/template.py | 8 +-- problems/nvidia/utils.py | 88 ++++++++++++++++++++----- 7 files changed, 98 insertions(+), 48 deletions(-) diff --git a/problems/nvidia/eval.py b/problems/nvidia/eval.py index ac3a6325..981b9322 100644 --- a/problems/nvidia/eval.py +++ b/problems/nvidia/eval.py @@ -11,7 +11,7 @@ import torch.cuda -from utils import set_seed +from utils import set_seed, clear_l2_cache try: from task import TestSpec except ImportError: @@ -138,17 +138,6 @@ def _clone_data(data): return data -def wrap_check_implementation(data, submission_output): - # Old version returned just a single string, new version - # returns (bool, str); this function ensures compatibility with old - # problem definitions. - result = check_implementation(data, submission_output) - if isinstance(result, tuple): - return result - else: - return not bool(result), result - - def _run_single_test(test: TestCase): """ Runs a single test case. 
Do not call directly @@ -158,7 +147,7 @@ def _run_single_test(test: TestCase): torch.cuda.synchronize() submission_output = custom_kernel(_clone_data(data)) torch.cuda.synchronize() - return wrap_check_implementation(data, submission_output) + return check_implementation(data, submission_output) def run_single_test(pool: multiprocessing.Pool, test: TestCase): @@ -210,7 +199,7 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t check_copy = _clone_data(data) # first, one obligatory correctness check output = custom_kernel(data) - good, message = wrap_check_implementation(check_copy, output) + good, message = check_implementation(check_copy, output) if not good: return message @@ -229,10 +218,15 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t data = generate_input(**test.args) check_copy = _clone_data(data) torch.cuda.synchronize() - start = time.perf_counter_ns() + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + clear_l2_cache() + + start_event.record() output = custom_kernel(data) + end_event.record() torch.cuda.synchronize() - end = time.perf_counter_ns() + duration = start_event.elapsed_time(end_event) * 1e6 # Convert ms to ns if recheck: good, message = check_implementation(check_copy, output) @@ -240,7 +234,7 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t return message del output - durations.append(end - start) + durations.append(duration) if i > 1: total_bm_duration = time.perf_counter_ns() - bm_start_time diff --git a/problems/nvidia/fp16_gemm/reference.py b/problems/nvidia/fp16_gemm/reference.py index 127a70d7..dae1c9dc 100644 --- a/problems/nvidia/fp16_gemm/reference.py +++ b/problems/nvidia/fp16_gemm/reference.py @@ -6,10 +6,10 @@ def ref_kernel( data: input_t, )->output_t: - a, b, _ = data + a, b, c = data # call torch matmul operation - ref = torch.einsum("mk,nk->mn", a, b).cpu() - return ref + c[...] 
= a @ b + return c def generate_input( diff --git a/problems/nvidia/fp16_gemm/submission.py b/problems/nvidia/fp16_gemm/submission.py index c985ef14..2280c886 100644 --- a/problems/nvidia/fp16_gemm/submission.py +++ b/problems/nvidia/fp16_gemm/submission.py @@ -1,7 +1,6 @@ import argparse import torch from task import input_t, output_t -from utils import make_match_reference from typing import Tuple import cutlass @@ -319,7 +318,11 @@ def my_kernel( def customized_kernel(data: input_t) -> output_t: # Get input tensors - a, b, c = data + a, b, ref = data + + # Clone ref to avoid modifying the original tensor + c = ref.clone() + k = a.shape[1] n = b.shape[1] # Convert torch tensors to CuTe tensors via dlpack protocol @@ -339,5 +342,6 @@ def customized_kernel(data: input_t) -> output_t: .mark_compact_shape_dynamic(mode=1, divisibility=n) ) my_kernel(a_tensor, b_tensor, c_tensor) + torch.cuda.synchronize() return c diff --git a/problems/nvidia/fp16_gemm/task.py b/problems/nvidia/fp16_gemm/task.py index 01a484a7..d35ed2e7 100644 --- a/problems/nvidia/fp16_gemm/task.py +++ b/problems/nvidia/fp16_gemm/task.py @@ -1,13 +1,8 @@ -import argparse import torch -from typing import Tuple - -import cutlass -import cutlass.cute as cute +from typing import TypedDict, TypeVar input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor, torch.Tensor]) output_t = TypeVar("output_t", bound=torch.Tensor) - class TestSpec(TypedDict): m: int n: int diff --git a/problems/nvidia/fp16_gemm/task.yml b/problems/nvidia/fp16_gemm/task.yml index a2244bec..4546a469 100644 --- a/problems/nvidia/fp16_gemm/task.yml +++ b/problems/nvidia/fp16_gemm/task.yml @@ -1,4 +1,3 @@ - # name: fp16-gemm files: diff --git a/problems/nvidia/fp16_gemm/template.py b/problems/nvidia/fp16_gemm/template.py index e3149aa0..e4615a0b 100644 --- a/problems/nvidia/fp16_gemm/template.py +++ b/problems/nvidia/fp16_gemm/template.py @@ -6,11 +6,11 @@ def custom_kernel(data: input_t) -> output_t: Reference implementation of block-scale fp8 gemm Args: data: Tuple that expands to: - a: torch.Tensor[float8_e4m3fnuz] of shape [m, k], - b: torch.Tensor[float8_e4m3fnuz] of shape [n, k], - c: torch.Tensor[bfloat16] of shape [m, n] + a: torch.Tensor[float16] of shape [m, k], + b: torch.Tensor[float16] of shape [n, k], + c: torch.Tensor[float16] of shape [m, n] Returns: - Tensor containing output in bf16 + Tensor containing output in float16 """ # c: [m, n] is pre-allocated memory to avoid timing allocation overhead. a, b, c = data diff --git a/problems/nvidia/utils.py b/problems/nvidia/utils.py index 8859b38b..e8a9082f 100644 --- a/problems/nvidia/utils.py +++ b/problems/nvidia/utils.py @@ -1,6 +1,30 @@ -from typing import Tuple +import os +import random +import numpy as np import torch + +def set_seed(seed=42): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + +def get_device(use_cuda: bool = True) -> torch.device: + """Get the appropriate device (GPU or CPU).""" + if use_cuda: + if torch.cuda.is_available(): + return torch.device("cuda") + elif torch.backends.mps.is_available(): + return torch.device("mps") + else: + print("No compatible GPU found. 
Falling back to CPU.") + return torch.device("cpu") + + # Adapted from https://github.com/linkedin/Liger-Kernel/blob/main/test/utils.py @torch.no_grad() def verbose_allclose( @@ -9,7 +33,7 @@ def verbose_allclose( rtol=1e-05, atol=1e-08, max_print=5 -) -> Tuple[bool, list[str]]: +) -> list[str]: """ Assert that two tensors are element-wise equal within a tolerance, providing detailed information about mismatches. @@ -19,13 +43,16 @@ def verbose_allclose( rtol (float): Relative tolerance; relative to expected atol (float): Absolute tolerance. max_print (int): Maximum number of mismatched elements to print. + + Raises: + AssertionError: If the tensors are not all close within the given tolerance. """ # Check if the shapes of the tensors match if received.shape != expected.shape: - return False, ["SIZE MISMATCH"] + return ["SIZE MISMATCH"] # Calculate the difference between the tensors - diff = torch.abs(received.to(torch.float32) - expected.to(torch.float32)) + diff = torch.abs(received - expected) # Determine the tolerance tolerance = atol + rtol * torch.abs(expected) @@ -58,16 +85,16 @@ def verbose_allclose( for index in mismatched_indices[:max_print]: i = tuple(index.tolist()) - mismatch_details.append(f"ERROR at {i}: {received[i]} {expected[i]}") + mismatch_details.append(f"ERROR AT {i}: {received[i]} {expected[i]}") if num_mismatched > max_print: mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.") - return False, mismatch_details + return mismatch_details - return True, [f"Maximum error: {torch.max(diff)}"] + return [] @torch.no_grad() -def verbose_allequal(received: torch.Tensor, expected: torch.Tensor, max_print: int = 5) -> Tuple[bool, list[str]]: +def verbose_allequal(received: torch.Tensor, expected: torch.Tensor, max_print: int=5): """ Assert that two tensors are element-wise perfectly equal, providing detailed information about mismatches. @@ -91,28 +118,59 @@ def verbose_allequal(received: torch.Tensor, expected: torch.Tensor, max_print: for index in mismatched_indices[:max_print]: i = tuple(index.tolist()) - mismatch_details.append(f"ERROR at {i}: {received[i]} {expected[i]}") + mismatch_details.append(f"ERROR AT {i}: {received[i]} {expected[i]}") if num_mismatched > max_print: mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.") - return False, mismatch_details + return mismatch_details - return True, [] + return [] -def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08): +def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08) -> tuple[bool, str]: """ Convenient "default" implementation for tasks' `check_implementation` function. """ expected = reference(data) - good, reasons = verbose_allclose(output, expected, rtol=rtol, atol=atol) + reasons = verbose_allclose(output, expected, rtol=rtol, atol=atol) if len(reasons) > 0: - return good, "\\n".join(reasons) + return False, "mismatch found! 
custom implementation doesn't match reference: " + " ".join(reasons) - return good, '' + return True, '' def make_match_reference(reference: callable, **kwargs): def wrapped(data, output): return match_reference(data, output, reference=reference, **kwargs) return wrapped + + +class DeterministicContext: + def __init__(self): + self.allow_tf32 = None + self.deterministic = None + self.cublas = None + + def __enter__(self): + self.cublas = os.environ.get('CUBLAS_WORKSPACE_CONFIG', '') + self.allow_tf32 = torch.backends.cudnn.allow_tf32 + self.deterministic = torch.backends.cudnn.deterministic + torch.backends.cudnn.allow_tf32 = False + torch.backends.cudnn.deterministic = True + torch.use_deterministic_algorithms(True) + return self + + def __exit__(self, exc_type, exc_value, traceback): + torch.backends.cudnn.allow_tf32 = self.allow_tf32 + torch.backends.cudnn.deterministic = self.deterministic + torch.use_deterministic_algorithms(False) + os.environ['CUBLAS_WORKSPACE_CONFIG'] = self.cublas + +def clear_l2_cache(): + # import cupy as cp + # cp.cuda.runtime.deviceSetLimit(cp.cuda.runtime.cudaLimitPersistingL2CacheSize, 0) + # create a large dummy tensor + dummy = torch.empty((32, 1024, 1024), dtype=torch.int64, device="cuda") + # write stuff to + dummy.fill_(42) + del dummy \ No newline at end of file From 850cca981fe96dc3cae68cff255dc68ecb880a35 Mon Sep 17 00:00:00 2001 From: S1ro1 Date: Tue, 7 Oct 2025 15:32:00 +0000 Subject: [PATCH 07/13] Fix: correct name of custom kernel --- problems/nvidia/fp16_gemm/submission.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/problems/nvidia/fp16_gemm/submission.py b/problems/nvidia/fp16_gemm/submission.py index 2280c886..a1b8d06e 100644 --- a/problems/nvidia/fp16_gemm/submission.py +++ b/problems/nvidia/fp16_gemm/submission.py @@ -23,6 +23,7 @@ ab_stages = 4 acc_stage = 1 + @cute.struct class SharedStorage: ab_mbar_ptr: cute.struct.MemRange[cutlass.Int64, ab_stages * 2] @@ -316,7 +317,8 @@ def my_kernel( block=(threads_per_cta, 1, 1), ) -def customized_kernel(data: input_t) -> output_t: + +def custom_kernel(data: input_t) -> output_t: # Get input tensors a, b, ref = data @@ -344,4 +346,3 @@ def customized_kernel(data: input_t) -> output_t: my_kernel(a_tensor, b_tensor, c_tensor) torch.cuda.synchronize() return c - From 25dc73cdec38d92c1e4775a7e575acc7794ee96e Mon Sep 17 00:00:00 2001 From: S1ro1 Date: Tue, 7 Oct 2025 15:41:12 +0000 Subject: [PATCH 08/13] Fix: catch OpError --- problems/nvidia/fp16_gemm/eval.py | 441 ++++++++++++++++++++++++++++++ 1 file changed, 441 insertions(+) create mode 100644 problems/nvidia/fp16_gemm/eval.py diff --git a/problems/nvidia/fp16_gemm/eval.py b/problems/nvidia/fp16_gemm/eval.py new file mode 100644 index 00000000..481c3640 --- /dev/null +++ b/problems/nvidia/fp16_gemm/eval.py @@ -0,0 +1,441 @@ +import base64 +import dataclasses +import multiprocessing +import re +import time +import os +import sys +import math +from pathlib import Path +from typing import Any, Optional +import tempfile + +import torch.cuda +from cutlass.cute.nvgpu.common import OpError + +from utils import set_seed, clear_l2_cache + +try: + from task import TestSpec +except ImportError: + TestSpec = dict + +from reference import check_implementation, generate_input + + +class PopcornOutput: + def __init__(self, fd: int): + self.file = os.fdopen(fd, "w") + os.set_inheritable(fd, False) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.file.close() + + def print(self, *args, 
**kwargs): + print(*args, **kwargs, file=self.file, flush=True) + + def log(self, key, value): + self.print(f"{key}: {value}") + + +@dataclasses.dataclass +class TestCase: + args: dict + spec: str + + +def _combine(a: int, b: int) -> int: + # combine two integers into one: + # we need this to generate a secret seed based on the test-level seed and + # the global secret seed. + # the test-level seeds are public knowledge, and typically relatively small numbers, + # so we need to make sure they don't provide any useful info for the full seed. + # This Cantor construction ensures that if the secret seed is a large number, + # then so is the overall seed. + return int(a + (a + b) * (a + b + 1) // 2) + + +def get_test_cases(file_name: str, seed: Optional[int]) -> list[TestCase]: + try: + content = Path(file_name).read_text() + except Exception as E: + print(f"Could not open test file`{file_name}`: {E}", file=sys.stderr) + exit(113) + + tests = [] + lines = content.splitlines() + match = r"\s*([a-zA-Z]+):\s*([a-zA-Z]+|[+-]?[0-9]+)\s*" + for line in lines: + parts = line.split(";") + case = {} + for part in parts: + matched = re.match(match, part) + if not re.fullmatch(match, part): + print(f"invalid test case: '{line}': '{part}'", file=sys.stderr) + exit(113) + key = matched[1] + val = matched[2] + try: + val = int(val) + except ValueError: + pass + + case[key] = val + tests.append(TestCase(spec=line, args=case)) + + if seed is not None: + for test in tests: + if "seed" in test.args: + test.args["seed"] = _combine(test.args["seed"], seed) + + return tests + + +@dataclasses.dataclass +class Stats: + runs: int + mean: float + std: float + err: float + best: float + worst: float + + +def calculate_stats(durations: list[int]): + """ + Calculate statistical data from a list of durations. + + @param durations: A list of durations in nanoseconds. + @return: A Stats object containing the number of runs, mean, standard deviation, error, best, and worst durations. + """ + runs = len(durations) + total = sum(durations) + best = min(durations) + worst = max(durations) + + avg = total / runs + variance = sum(map(lambda x: (x - avg) ** 2, durations)) + std = math.sqrt(variance / (runs - 1)) + err = std / math.sqrt(runs) + + return Stats( + runs=runs, mean=avg, std=std, err=err, best=float(best), worst=float(worst) + ) + + +def _clone_data(data): + """ + Recursively goes through data and clones all tensors. + """ + if isinstance(data, tuple): + return tuple(_clone_data(x) for x in data) + elif isinstance(data, list): + return [_clone_data(x) for x in data] + elif isinstance(data, dict): + return {k: _clone_data(v) for k, v in data.items()} + elif isinstance(data, torch.Tensor): + return data.clone() + else: + return data + + +def _run_single_test(test: TestCase): + """ + Runs a single test case. Do not call directly + """ + from submission import custom_kernel + + data = generate_input(**test.args) + torch.cuda.synchronize() + try: + submission_output = custom_kernel(_clone_data(data)) + + except OpError as E: + print(f"Encountered {E}", file=sys.stderr) + return False, str(E) + torch.cuda.synchronize() + return check_implementation(data, submission_output) + + +def run_single_test(pool: multiprocessing.Pool, test: TestCase): + """ + Runs a single test in another process. + """ + return pool.apply(_run_single_test, (test,)) + + +def run_testing( + logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase] +): + """ + Executes the actual test case code and checks for correctness. 
+ + @param logger: A PopcornOutput object used for logging test results. + @param tests: A list of TestCase objects representing the test cases to be executed. + @return: An integer representing the exit status: 0 if all tests pass, otherwise 112. + """ + passed = True + logger.log("test-count", len(tests)) + for idx, test in enumerate(tests): + logger.log(f"test.{idx}.spec", test.spec) + good, message = run_single_test(pool, test) + if not good: + logger.log(f"test.{idx}.status", "fail") + logger.log(f"test.{idx}.error", message) + passed = False + else: + logger.log(f"test.{idx}.status", "pass") + if message: + logger.log(f"test.{idx}.message", message) + + if passed: + logger.log("check", "pass") + return 0 + else: + logger.log("check", "fail") + return 112 + + +def _run_single_benchmark( + test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float +) -> Stats | Any: + """ + Runs one benchmark. Do not call directly. + """ + from submission import custom_kernel + + durations = [] + # generate input data once + data = generate_input(**test.args) + check_copy = _clone_data(data) + # first, one obligatory correctness check + try: + output = custom_kernel(_clone_data(data)) + except OpError as E: + return f"Encountered {E}" + good, message = check_implementation(check_copy, output) + if not good: + return message + + # now, do multiple timing runs without further correctness testing + # there is an upper bound of 100 runs, and a lower bound of 3 runs; + # otherwise, we repeat until we either measure at least 10 full seconds, + # or the relative error of the mean is below 1%. + + bm_start_time = time.perf_counter_ns() + for i in range(max_repeats): + if recheck: + # ensure we use a different seed for every benchmark + if "seed" in test.args: + test.args["seed"] += 13 + + data = generate_input(**test.args) + check_copy = _clone_data(data) + torch.cuda.synchronize() + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + clear_l2_cache() + + start_event.record() + output = custom_kernel(data) + end_event.record() + torch.cuda.synchronize() + duration = start_event.elapsed_time(end_event) * 1e6 # Convert ms to ns + + if recheck: + good, message = check_implementation(check_copy, output) + if not good: + return message + + del output + durations.append(duration) + + if i > 1: + total_bm_duration = time.perf_counter_ns() - bm_start_time + stats = calculate_stats(durations) + # stop if either + # a) relative error dips below 0.1% + # b) we exceed the total time limit for benchmarking the kernel + # c) we exceed 2 minutes of total wallclock time. + if ( + stats.err / stats.mean < 0.001 + or stats.mean * stats.runs > max_time_ns + or total_bm_duration > 120e9 + ): + break + + return calculate_stats(durations) + + +def run_single_benchmark( + pool: multiprocessing.Pool, + test: TestCase, + recheck: bool, + max_repeats: int, + max_time_ns: float, +): + """ + For a particular test case, check correctness (if applicable) and grab runtime results. + + @param pool: Process on which the benchmark will be launched. + @param test: TestCase object. + @param recheck: Flag for whether to explicitly check functional correctness. + @param max_repeats: Number of trials to repeat. + @param max_time_ns: Timeout time in nanoseconds. + @return: A Stats object for this particular benchmark case or an error if the test fails. 
+ """ + return pool.apply(_run_single_benchmark, (test, recheck, max_repeats, max_time_ns)) + + +def run_benchmarking( + logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase] +): + """ + Executes benchmarking code for a CUDA Kernel and logs runtimes. + + @param logger: A PopcornOutput object used for logging benchmark results. + @param pool: Process on which the benchmarks will be launched. + @param tests: A list of TestCase objects representing the test cases to be benchmarked. + @return: An integer representing the exit status: 0 if all benchmarks pass, otherwise 112. + """ + # warm up + run_single_benchmark(pool, tests[0], False, 100, 10e7) + + passed = True + logger.log("benchmark-count", len(tests)) + for idx, test in enumerate(tests): + logger.log(f"benchmark.{idx}.spec", test.spec) + result = run_single_benchmark(pool, test, False, 100, 10e9) + if isinstance(result, Stats): + for field in dataclasses.fields(Stats): + logger.log(f"benchmark.{idx}.{field.name}", getattr(result, field.name)) + else: + passed = False + logger.log(f"benchmark.{idx}.status", "fail") + logger.log(f"benchmark.{idx}.error", result) + + if passed: + logger.log("check", "pass") + return 0 + else: + logger.log("check", "fail") + return 112 + + +def run_single_profile(test: TestCase) -> str: + """ + Runs a single test case. Do not call directly + """ + from submission import custom_kernel + from torch.profiler import profile, record_function, ProfilerActivity + + data = generate_input(**test.args) + torch.cuda.synchronize() + + with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof: + submission_output = custom_kernel(_clone_data(data)) + torch.cuda.synchronize() + return prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=20) + + +def run_profiling(logger: PopcornOutput, tests: list[TestCase]): + logger.log("benchmark-count", len(tests)) + for idx, test in enumerate(tests): + logger.log(f"benchmark.{idx}.spec", test.spec) + report = run_single_profile(test) + logger.log( + f"benchmark.{idx}.report", + base64.b64encode(report.encode("utf-8"), b"+*").decode("utf-8"), + ) + logger.log("check", "pass") + return 0 + + +def main(): + fd = os.getenv("POPCORN_FD") + if not fd: + return 111 + + if len(sys.argv) < 3: + return 2 + + mode = sys.argv[1] + seed = os.getenv("POPCORN_SEED") + os.unsetenv("POPCORN_SEED") + seed = int(seed) if seed else None + set_seed(seed or 42) + + filename = None + + with tempfile.NamedTemporaryFile(delete=False) as tmp: + + def build_test_string(tests: list[dict]): + as_str = "" + for test in tests: + kvs = [] + for k, v in test.items(): + kvs.append(f"{k}: {v}") + as_str += "; ".join(kvs) + "\n" + return as_str + + import yaml + + yaml_content = yaml.safe_load(open(sys.argv[2], "r")) + if mode == "test": + tests_str = build_test_string(yaml_content.get("tests", [])) + elif mode in ("benchmark", "leaderboard", "profile"): + tests_str = build_test_string(yaml_content.get("benchmarks", [])) + + tmp.write(tests_str.encode("utf-8")) + tmp.flush() + filename = tmp.name + + tests = get_test_cases(filename, seed) + + os.unlink(filename) + + with PopcornOutput(int(fd)) as logger: + import multiprocessing + + mp_context = multiprocessing.get_context("spawn") + with mp_context.Pool(1) as pool: + if mode == "test": + return run_testing(logger, pool, tests) + if mode == "benchmark": + return run_benchmarking(logger, pool, tests) + + if mode == "leaderboard": + # warmup + run_single_benchmark(pool, tests[0], False, 100, 1e7) + 
logger.log("benchmark-count", len(tests)) + passed = True + for i in range(len(tests)): + result = run_single_benchmark(pool, tests[i], True, 100, 30e9) + logger.log(f"benchmark.{i}.spec", tests[i].spec) + if isinstance(result, Stats): + for field in dataclasses.fields(Stats): + logger.log( + f"benchmark.{i}.{field.name}", + getattr(result, field.name), + ) + else: + passed = False + logger.log(f"benchmark.{i}.status", "fail") + logger.log( + f"benchmark.{i}.error", str(result) + ) # TODO: Make sure result implements __str__? + break + + logger.log("check", "pass" if passed else "fail") + elif mode == "profile": + run_profiling(logger, tests) + else: + # TODO: Implement script mode + return 2 + + +if __name__ == "__main__": + sys.exit(main()) From 3c7592b26d0402b9998ff3763e07f052de8708a9 Mon Sep 17 00:00:00 2001 From: Vicki Wang Date: Tue, 7 Oct 2025 17:01:37 -0700 Subject: [PATCH 09/13] fix test error. --- problems/nvidia/fp16_gemm/reference.py | 3 ++- problems/nvidia/fp16_gemm/submission.py | 21 ++++++++------------- problems/nvidia/fp16_gemm/task.yml | 1 - 3 files changed, 10 insertions(+), 15 deletions(-) diff --git a/problems/nvidia/fp16_gemm/reference.py b/problems/nvidia/fp16_gemm/reference.py index dae1c9dc..8eca7b76 100644 --- a/problems/nvidia/fp16_gemm/reference.py +++ b/problems/nvidia/fp16_gemm/reference.py @@ -8,7 +8,8 @@ def ref_kernel( )->output_t: a, b, c = data # call torch matmul operation - c[...] = a @ b + # b is N x K in column-major order, need to transpose b before matmul + c[...] = a @ b.t() return c diff --git a/problems/nvidia/fp16_gemm/submission.py b/problems/nvidia/fp16_gemm/submission.py index a1b8d06e..89fd655e 100644 --- a/problems/nvidia/fp16_gemm/submission.py +++ b/problems/nvidia/fp16_gemm/submission.py @@ -15,8 +15,8 @@ # Kernel configuration io_dtype = cutlass.Float16 acc_dtype = cutlass.Float32 -mma_inst_shape_mnk = (128, 256, 16) -mma_tiler_mnk = (128, 256, 64) +mma_inst_shape_mnk = (128, 128, 16) +mma_tiler_mnk = (128, 128, 64) threads_per_cta = 128 # Pipeline stage configuration @@ -158,9 +158,9 @@ def kernel( # (EpiTile, NumTiles) gC_epi = cute.zipped_divide(tCgC, epi_tiler) - # Every thread loads 32x128 bits + # Every thread loads 32x64 bits tmem_atom = cute.make_copy_atom( - tcgen05.Ld32x32bOp(tcgen05.Repetition.x64), + tcgen05.Ld32x32bOp(tcgen05.Repetition.x32), cutlass.Float32, ) tmem_tiled_copy = tcgen05.make_tmem_copy(tmem_atom, tCtAcc_epi[None, 0]) @@ -320,28 +320,23 @@ def my_kernel( def custom_kernel(data: input_t) -> output_t: # Get input tensors - a, b, ref = data + a, b, c = data - # Clone ref to avoid modifying the original tensor - c = ref.clone() - - k = a.shape[1] - n = b.shape[1] # Convert torch tensors to CuTe tensors via dlpack protocol a_tensor = ( from_dlpack(a, assumed_align=32) .mark_layout_dynamic(leading_dim=1) - .mark_compact_shape_dynamic(mode=1, divisibility=k) + .mark_compact_shape_dynamic(mode=1, divisibility=32) ) b_tensor = ( from_dlpack(b, assumed_align=32) .mark_layout_dynamic(leading_dim=1) - .mark_compact_shape_dynamic(mode=1, divisibility=k) + .mark_compact_shape_dynamic(mode=1, divisibility=32) ) c_tensor = ( from_dlpack(c, assumed_align=32) .mark_layout_dynamic(leading_dim=1) - .mark_compact_shape_dynamic(mode=1, divisibility=n) + .mark_compact_shape_dynamic(mode=1, divisibility=32) ) my_kernel(a_tensor, b_tensor, c_tensor) torch.cuda.synchronize() diff --git a/problems/nvidia/fp16_gemm/task.yml b/problems/nvidia/fp16_gemm/task.yml index 4546a469..60fedd3d 100644 --- a/problems/nvidia/fp16_gemm/task.yml +++ 
b/problems/nvidia/fp16_gemm/task.yml @@ -46,7 +46,6 @@ tests: - {"m": 128, "n": 256, "k": 64, "seed": 1111} - {"m": 128, "n": 1536, "k": 7168, "seed": 1111} - {"m": 128, "n": 3072, "k": 1536, "seed": 1111} - - {"m": 256, "n": 576, "k": 7168, "seed": 1111} - {"m": 256, "n": 7168, "k": 256, "seed": 1111} - {"m": 256, "n": 7168, "k": 2048, "seed": 1111} - {"m": 2384, "n": 4608, "k": 7168, "seed": 1111} From 066527024a951ad2cad698eee6be16ee769356d1 Mon Sep 17 00:00:00 2001 From: Vicki Wang Date: Tue, 7 Oct 2025 19:03:42 -0700 Subject: [PATCH 10/13] split test to compile and run. --- problems/nvidia/fp16_gemm/eval.py | 80 ++++++++++++++++++++++--- problems/nvidia/fp16_gemm/submission.py | 59 +++++++++++++++++- 2 files changed, 130 insertions(+), 9 deletions(-) diff --git a/problems/nvidia/fp16_gemm/eval.py b/problems/nvidia/fp16_gemm/eval.py index 481c3640..4169b801 100644 --- a/problems/nvidia/fp16_gemm/eval.py +++ b/problems/nvidia/fp16_gemm/eval.py @@ -177,6 +177,16 @@ def run_testing( @param tests: A list of TestCase objects representing the test cases to be executed. @return: An integer representing the exit status: 0 if all tests pass, otherwise 112. """ + # Step 1: Compile kernel once before running tests + logger.log("compile", "start") + compile_success, compile_error = pool.apply(_compile_kernel_once) + if not compile_success: + logger.log("compile", "fail") + logger.log("compile.error", compile_error) + return 112 + logger.log("compile", "pass") + + # Step 2: Run all tests with compiled kernel passed = True logger.log("test-count", len(tests)) for idx, test in enumerate(tests): @@ -199,18 +209,51 @@ def run_testing( return 112 +def _compile_kernel_once(): + """ + Compile the kernel once before any benchmarking. + This ensures compilation time is not included in benchmark results. + """ + from submission import compile_kernel + + # Use a minimal test case for compilation + compile_data = generate_input(m=128, n=128, k=64, seed=42) + a, b, c = compile_data + + try: + # Trigger compilation (will be cached) + compile_kernel(a, b, c) + torch.cuda.synchronize() + return True, None + except OpError as E: + return False, f"Compilation failed: {E}" + except Exception as E: + return False, f"Compilation failed: {E}" + + def _run_single_benchmark( test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float ) -> Stats | Any: """ Runs one benchmark. Do not call directly. """ - from submission import custom_kernel + from submission import custom_kernel, compile_kernel durations = [] # generate input data once data = generate_input(**test.args) check_copy = _clone_data(data) + + # Ensure kernel is compiled before any timing (compilation is cached) + try: + a, b, c = data + compile_kernel(a, b, c) + torch.cuda.synchronize() + except OpError as E: + return f"Compilation failed: {E}" + except Exception as E: + return f"Compilation failed: {E}" + # first, one obligatory correctness check try: output = custom_kernel(_clone_data(data)) @@ -221,7 +264,7 @@ def _run_single_benchmark( return message # now, do multiple timing runs without further correctness testing - # there is an upper bound of 100 runs, and a lower bound of 3 runs; + # there is an upper bound of 200 runs, and a lower bound of 3 runs; # otherwise, we repeat until we either measure at least 10 full seconds, # or the relative error of the mean is below 1%. @@ -301,14 +344,24 @@ def run_benchmarking( @param tests: A list of TestCase objects representing the test cases to be benchmarked. 
@return: An integer representing the exit status: 0 if all benchmarks pass, otherwise 112. """ - # warm up - run_single_benchmark(pool, tests[0], False, 100, 10e7) + # Step 1: Compile kernel once (outside of timing) + logger.log("compile", "start") + compile_success, compile_error = pool.apply(_compile_kernel_once) + if not compile_success: + logger.log("compile", "fail") + logger.log("compile.error", compile_error) + return 112 + logger.log("compile", "pass") + + # Step 2: Warm up with compiled kernel + run_single_benchmark(pool, tests[0], False, 200, 10e7) + # Step 3: Run benchmarks (compilation time excluded) passed = True logger.log("benchmark-count", len(tests)) for idx, test in enumerate(tests): logger.log(f"benchmark.{idx}.spec", test.spec) - result = run_single_benchmark(pool, test, False, 100, 10e9) + result = run_single_benchmark(pool, test, False, 200, 10e9) if isinstance(result, Stats): for field in dataclasses.fields(Stats): logger.log(f"benchmark.{idx}.{field.name}", getattr(result, field.name)) @@ -408,12 +461,23 @@ def build_test_string(tests: list[dict]): return run_benchmarking(logger, pool, tests) if mode == "leaderboard": - # warmup - run_single_benchmark(pool, tests[0], False, 100, 1e7) + # Step 1: Compile kernel once (outside of timing) + logger.log("compile", "start") + compile_success, compile_error = pool.apply(_compile_kernel_once) + if not compile_success: + logger.log("compile", "fail") + logger.log("compile.error", compile_error) + return 112 + logger.log("compile", "pass") + + # Step 2: Warmup with compiled kernel + run_single_benchmark(pool, tests[0], False, 200, 1e7) + + # Step 3: Run leaderboard benchmarks (compilation time excluded) logger.log("benchmark-count", len(tests)) passed = True for i in range(len(tests)): - result = run_single_benchmark(pool, tests[i], True, 100, 30e9) + result = run_single_benchmark(pool, tests[i], True, 200, 30e9) logger.log(f"benchmark.{i}.spec", tests[i].spec) if isinstance(result, Stats): for field in dataclasses.fields(Stats): diff --git a/problems/nvidia/fp16_gemm/submission.py b/problems/nvidia/fp16_gemm/submission.py index 89fd655e..65634e50 100644 --- a/problems/nvidia/fp16_gemm/submission.py +++ b/problems/nvidia/fp16_gemm/submission.py @@ -318,10 +318,65 @@ def my_kernel( ) +# Global cache for compiled kernel +_compiled_kernel_cache = None + + +def compile_kernel(a, b, c): + """ + Compile the kernel once and cache it. + This should be called before any timing measurements. + + Args: + a, b, c: Sample tensors with the expected shapes and types + + Returns: + The compiled kernel function + """ + global _compiled_kernel_cache + + if _compiled_kernel_cache is not None: + return _compiled_kernel_cache + + # Convert torch tensors to CuTe tensors via dlpack protocol + a_tensor = ( + from_dlpack(a, assumed_align=32) + .mark_layout_dynamic(leading_dim=1) + .mark_compact_shape_dynamic(mode=1, divisibility=32) + ) + b_tensor = ( + from_dlpack(b, assumed_align=32) + .mark_layout_dynamic(leading_dim=1) + .mark_compact_shape_dynamic(mode=1, divisibility=32) + ) + c_tensor = ( + from_dlpack(c, assumed_align=32) + .mark_layout_dynamic(leading_dim=1) + .mark_compact_shape_dynamic(mode=1, divisibility=32) + ) + + # Compile the kernel + _compiled_kernel_cache = cute.compile(my_kernel, a_tensor, b_tensor, c_tensor) + + return _compiled_kernel_cache + + def custom_kernel(data: input_t) -> output_t: + """ + Execute the kernel. If not already compiled, compile it first. 
+ + Args: + data: Tuple of (a, b, c) tensors + + Returns: + Output tensor c + """ # Get input tensors a, b, c = data + # Ensure kernel is compiled (will use cached version if available) + compiled_func = compile_kernel(a, b, c) + # Convert torch tensors to CuTe tensors via dlpack protocol a_tensor = ( from_dlpack(a, assumed_align=32) @@ -338,6 +393,8 @@ def custom_kernel(data: input_t) -> output_t: .mark_layout_dynamic(leading_dim=1) .mark_compact_shape_dynamic(mode=1, divisibility=32) ) - my_kernel(a_tensor, b_tensor, c_tensor) + + # Execute the compiled kernel + compiled_func(a_tensor, b_tensor, c_tensor) torch.cuda.synchronize() return c From a1d9a295f2ccb99cd645efc9fd65fe761947bbf7 Mon Sep 17 00:00:00 2001 From: Vicki Wang Date: Wed, 8 Oct 2025 22:08:54 -0700 Subject: [PATCH 11/13] fix a perf issue. --- problems/nvidia/fp16_gemm/submission.py | 1 - 1 file changed, 1 deletion(-) diff --git a/problems/nvidia/fp16_gemm/submission.py b/problems/nvidia/fp16_gemm/submission.py index 65634e50..9e000be1 100644 --- a/problems/nvidia/fp16_gemm/submission.py +++ b/problems/nvidia/fp16_gemm/submission.py @@ -396,5 +396,4 @@ def custom_kernel(data: input_t) -> output_t: # Execute the compiled kernel compiled_func(a_tensor, b_tensor, c_tensor) - torch.cuda.synchronize() return c From 72790bf496069404a36114c1fc04e2912d5f960f Mon Sep 17 00:00:00 2001 From: Vicki Wang Date: Thu, 16 Oct 2025 01:58:30 -0700 Subject: [PATCH 12/13] enhance comment --- problems/nvidia/fp16_gemm/task.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/problems/nvidia/fp16_gemm/task.yml b/problems/nvidia/fp16_gemm/task.yml index 60fedd3d..0d7a4a3d 100644 --- a/problems/nvidia/fp16_gemm/task.yml +++ b/problems/nvidia/fp16_gemm/task.yml @@ -29,7 +29,7 @@ description: | For the grand price, your kernel will be evaluated against the speed of light analysis and the solution closest to the speed of light will be awarded the grand price. ``` - The speed of light analysis is: + The speed of light analysis is (using 1.5Ghz clock): M N K time[us] 7168 128 16384 17.38 4096 128 7168 4.34 From 4f49b452689c91e6d4d8ef7fee10103c81fb702f Mon Sep 17 00:00:00 2001 From: Vicki Wang Date: Wed, 22 Oct 2025 20:26:08 -0700 Subject: [PATCH 13/13] improve speed of light analysis results. --- problems/nvidia/fp16_gemm/task.yml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/problems/nvidia/fp16_gemm/task.yml b/problems/nvidia/fp16_gemm/task.yml index 0d7a4a3d..04a5fa50 100644 --- a/problems/nvidia/fp16_gemm/task.yml +++ b/problems/nvidia/fp16_gemm/task.yml @@ -29,11 +29,12 @@ description: | For the grand price, your kernel will be evaluated against the speed of light analysis and the solution closest to the speed of light will be awarded the grand price. ``` - The speed of light analysis is (using 1.5Ghz clock): - M N K time[us] - 7168 128 16384 17.38 - 4096 128 7168 4.34 - 7168 128 2048 2.17 + The speed of light analysis is based on the max(math throughput, memory throughput) of B200 and tested under 1.5Ghz clock: + | M | N | K | time[us] | + |---|---|---|----------| + | 7168 | 128 | 16384 | 31.369 | + | 4096 | 128 | 7168 | 8.021 | + | 7168 | 128 | 2048 | 4.130 | ``` config:
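
The table above is described as max(math throughput, memory throughput) on B200 under a 1.5 GHz clock. As a rough cross-check, here is a minimal roofline-style sketch of that bound; the peak-throughput constants are placeholders assumed for illustration only, not official B200 figures, so substitute the measured FP16 tensor-core and HBM peaks for the clock in question.

```
# Roofline-style estimate of one GEMM's "speed of light" time.
# PLACEHOLDER peaks: assumed values for illustration, not official B200 specs.
def speed_of_light_us(m, n, k, peak_fp16_tflops=1000.0, peak_hbm_tbps=8.0):
    flops = 2 * m * n * k
    bytes_moved = 2 * (m * k + n * k + m * n)      # fp16 A and B read, fp16 C written
    t_math = flops / (peak_fp16_tflops * 1e12)     # seconds if math-limited
    t_mem = bytes_moved / (peak_hbm_tbps * 1e12)   # seconds if memory-limited
    return max(t_math, t_mem) * 1e6                # microseconds

for m, n, k in [(7168, 128, 16384), (4096, 128, 7168), (7168, 128, 2048)]:
    print(f"{m} {n} {k}: {speed_of_light_us(m, n, k):.3f} us")
```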
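The eval changes in this series replace wall-clock timing with CUDA events and flush L2 before each run. A minimal standalone sketch of that measurement pattern, assuming the `clear_l2_cache` helper from `problems/nvidia/utils.py` is importable the same way `eval.py` imports it:

```
import torch
from utils import clear_l2_cache   # helper added in problems/nvidia/utils.py

def time_once_ns(fn, *args):
    """Time a single kernel call the way the updated eval.py does."""
    torch.cuda.synchronize()
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    clear_l2_cache()                            # flush L2 so each run starts cold
    start.record()
    out = fn(*args)
    end.record()
    torch.cuda.synchronize()
    return start.elapsed_time(end) * 1e6, out   # elapsed_time() returns ms; convert to ns
```

CUDA events bracket only device work, so host-side Python overhead no longer inflates the measurement the way `time.perf_counter_ns()` did.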
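The compile/run split introduced for `submission.py` keeps JIT time out of the timed region. A hedged usage sketch of that flow, mirroring `_compile_kernel_once` in `eval.py` (the small warm-up shape below is the one `eval.py` uses; any shape satisfying the tile and divisibility constraints would do):

```
import torch
from reference import generate_input
from submission import compile_kernel, custom_kernel

# Warm the compile cache on a small problem so later calls reuse the compiled kernel.
a, b, c = generate_input(m=128, n=128, k=64, seed=42)
compile_kernel(a, b, c)
torch.cuda.synchronize()

out = custom_kernel((a, b, c))   # executes the cached kernel; no JIT in the timed path
```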