From e5caca340b805011aedb6e575368288f6f4af79e Mon Sep 17 00:00:00 2001
From: Willy-Chan
Date: Mon, 1 Dec 2025 16:23:04 -0800
Subject: [PATCH 01/14] Added TK backend and mounting for modal support. Need
 to copy thunderkittens repo to the root!

---
 .gitignore                                   | Bin 171 -> 197 bytes
 scripts/eval_from_generations.py             |  42 ++++--
 scripts/generate_and_eval_single_sample.py   |   6 +-
 .../generate_and_eval_single_sample_modal.py |  45 ++++--
 scripts/generate_samples.py                  |   4 +-
 .../model_new_ex_add_thunderkittens.py       | 142 ++++++++++++++++++
 src/prompts/prompts.toml                     |   5 +
 7 files changed, 218 insertions(+), 26 deletions(-)
 create mode 100644 src/prompts/model_new_ex_add_thunderkittens.py

diff --git a/.gitignore b/.gitignore
index abb125d6aabe372688772976daa7ee024353eaf8..bc05f65e5cb6bf7ff3a0357d0df20910f2bb149b 100644
GIT binary patch
delta 33
ocmZ3@c$9I%YAHRJ;P~K@{GwE@kc`s2l++^c%#xDSykdPW0MA4V_y7O^

delta 6
NcmX@gxSDaoY5)l%0

 template struct.
+# 3. Create an alias like: using warp = kittens::group<1>;
+# 4. Then call: warp::load(...), warp::zero(...), etc.
+#
+elementwise_add_cuda_source = """
+// IMPORTANT: Define KITTENS_HOPPER before including ThunderKittens headers for H100/Hopper GPUs
+// This enables FP8 types and Hopper-specific features
+#define KITTENS_HOPPER
+
+#include <torch/extension.h>
+#include <cuda_runtime.h>
+
+// Include ThunderKittens headers
+#include "kittens.cuh"
+
+// ThunderKittens namespace and group aliases
+// Operations are accessed through these group types, NOT as free functions
+using namespace kittens;
+using warp = kittens::group<1>; // For single-warp operations (32 threads)
+// For multi-warp operations, use: using warpgroup = kittens::group<4>;
+
+// Constants for tile dimensions
+constexpr int TILE_DIM = 16;
+
+// ThunderKittens elementwise add kernel using shared memory tiles
+// This example demonstrates the ThunderKittens API pattern
+__global__ void tk_elementwise_add_kernel(const float* __restrict__ a_ptr,
+                                          const float* __restrict__ b_ptr,
+                                          float* __restrict__ out_ptr,
+                                          int rows, int cols) {
+    // For simple element-wise ops, we use a straightforward approach
+    // ThunderKittens shines for matrix ops with tiles, but here we show basic pattern
+
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    int total = rows * cols;
+
+    // Grid-stride loop for simple element-wise addition
+    for (int i = idx; i < total; i += blockDim.x * gridDim.x) {
+        out_ptr[i] = a_ptr[i] + b_ptr[i];
+    }
+}
+
+// Alternative: ThunderKittens tiled version for larger matrices
+// Shows proper usage of ThunderKittens tile types and group operations
+// Uncomment and adapt for matrix operations:
+/*
+__global__ void tk_matmul_kernel(const bf16* A, const bf16* B, bf16* C,
+                                 int M, int N, int K) {
+    // Define aliases for the group - THIS IS REQUIRED for ThunderKittens ops
+    using warpgroup = kittens::group<4>; // 4 warps = 128 threads
+
+    // ThunderKittens register tiles for accumulation
+    rt_fl<16, 16> acc; // 16x16 float register tile
+
+    // Shared memory tiles
+    extern __shared__ alignment_dummy __shm[];
+    st_bf<16, 16> (&a_smem)[2] = *reinterpret_cast<st_bf<16, 16>(*)[2]>(__shm);
+    st_bf<16, 16> (&b_smem)[2] = *reinterpret_cast<st_bf<16, 16>(*)[2]>(__shm + sizeof(st_bf<16,16>)*2);
+
+    // Initialize accumulator to zero - NOTE: use warpgroup:: prefix!
+    warpgroup::zero(acc);
+
+    // Main loop would go here with:
+    // warpgroup::load(a_smem[...], ...);      // Load from global to shared
+    // warpgroup::mma_AB(acc, a_tile, b_tile); // Matrix multiply-accumulate
+    // warpgroup::store(C_ptr, acc, ...);      // Store result
+}
+*/
+
+torch::Tensor elementwise_add_cuda(torch::Tensor a, torch::Tensor b) {
+    TORCH_CHECK(a.is_cuda(), "Input tensor a must be on CUDA");
+    TORCH_CHECK(b.is_cuda(), "Input tensor b must be on CUDA");
+    TORCH_CHECK(a.sizes() == b.sizes(), "Input tensors must have the same shape");
+
+    auto out = torch::empty_like(a);
+    int rows = a.size(0);
+    int cols = a.numel() / rows;
+
+    const int block_size = 256;
+    const int num_blocks = (a.numel() + block_size - 1) / block_size;
+
+    tk_elementwise_add_kernel<<<num_blocks, block_size>>>(
+        a.data_ptr<float>(),
+        b.data_ptr<float>(),
+        out.data_ptr<float>(),
+        rows, cols
+    );
+
+    return out;
+}
+"""
+
+# Compile the ThunderKittens kernel inline
+elementwise_add = load_inline(
+    name="elementwise_add_tk",
+    cpp_sources=elementwise_add_cpp_source,
+    cuda_sources=elementwise_add_cuda_source,
+    functions=["elementwise_add_cuda"],
+    verbose=True,
+    extra_include_paths=[
+        TK_PATH,
+        os.path.join(TK_PATH, "include"),
+    ],
+    extra_cflags=["-std=c++20", "-O3"],
+    extra_cuda_cflags=[
+        "-std=c++20",
+        "-O3",
+        "--expt-relaxed-constexpr",
+        "--expt-extended-lambda",
+        "-Xcompiler", "-fPIC",
+        "-DNDEBUG",
+        "-DKITTENS_HOPPER",
+    ],
+)
+
+
+class ModelNew(nn.Module):
+    def __init__(self) -> None:
+        super().__init__()
+        self.elementwise_add = elementwise_add
+
+    def forward(self, a, b):
+        return self.elementwise_add.elementwise_add_cuda(a, b)
diff --git a/src/prompts/prompts.toml b/src/prompts/prompts.toml
index bcf4e4ed..1d0c1fed 100644
--- a/src/prompts/prompts.toml
+++ b/src/prompts/prompts.toml
@@ -49,6 +49,11 @@ backend_display = "TileLang kernels"
 one_shot_new_arch = "src/prompts/model_new_ex_add_tilelang.py"
 # No few_shot_examples - will use one-shot when few_shot option is selected
 
+[backends.thunderkittens]
+backend_display = "ThunderKittens kernels"
+one_shot_new_arch = "src/prompts/model_new_ex_add_thunderkittens.py"
+# No few_shot_examples - will use one-shot when few_shot option is selected
+
 # -------------------------------------------------------------------------
 # Precision: Precision-specific configuration
 # -------------------------------------------------------------------------

From 56c977a4e78fc43557717fcd6a1bb75e44fdabc8 Mon Sep 17 00:00:00 2001
From: Willy-Chan
Date: Sat, 6 Dec 2025 14:48:40 -0800
Subject: [PATCH 02/14] run and check works with TK

---
 scripts/run_and_check.py | 40 ++++++++++++++++++++++++++++++--------
 1 file changed, 32 insertions(+), 8 deletions(-)

diff --git a/scripts/run_and_check.py b/scripts/run_and_check.py
index 316b96ee..04a61d57 100644
--- a/scripts/run_and_check.py
+++ b/scripts/run_and_check.py
@@ -26,20 +26,44 @@
 REPO_TOP_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
 KERNEL_BENCH_PATH = os.path.join(REPO_TOP_PATH, "KernelBench")
+THUNDERKITTENS_LOCAL_PATH = os.path.join(REPO_TOP_PATH, "ThunderKittens")
+SRC_PATH = os.path.join(REPO_TOP_PATH, "src")
 
 cuda_version = "12.8.0"
 flavor = "devel"
 operating_sys = "ubuntu22.04"
 tag = f"{cuda_version}-{flavor}-{operating_sys}"
 
-image = (
-    modal.Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.10")
-    .apt_install("git", "gcc-10", "g++-10", "clang")
-    .pip_install_from_requirements(os.path.join(REPO_TOP_PATH, "requirements.txt"))
-    .add_local_dir(KERNEL_BENCH_PATH, remote_path="/root/KernelBench")
.add_local_python_source("src") - .add_local_python_source("scripts") -) +# ThunderKittens support - use TK image if directory exists locally +if os.path.isdir(THUNDERKITTENS_LOCAL_PATH): + # ThunderKittens image with TK environment and mounting + image = ( + modal.Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.10") + .apt_install("git", "gcc-10", "g++-10", "clang") + .pip_install_from_requirements(os.path.join(REPO_TOP_PATH, "requirements.txt")) + .env({ + "THUNDERKITTENS_ROOT": "/root/ThunderKittens", + "THUNDERKITTENS_PATH": "/root/ThunderKittens", + "TORCH_CUDA_ARCH_LIST": "9.0", + "CXX": "g++-10", + "CC": "gcc-10", + }) + .add_local_dir(THUNDERKITTENS_LOCAL_PATH, remote_path="/root/ThunderKittens", copy=True) + .add_local_dir(KERNEL_BENCH_PATH, remote_path="/root/KernelBench") + .add_local_dir(SRC_PATH, remote_path="/root/src") + .add_local_python_source("src") + .add_local_python_source("scripts") + ) +else: + # Standard image without ThunderKittens + image = ( + modal.Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.10") + .apt_install("git", "gcc-10", "g++-10", "clang") + .pip_install_from_requirements(os.path.join(REPO_TOP_PATH, "requirements.txt")) + .add_local_dir(KERNEL_BENCH_PATH, remote_path="/root/KernelBench") + .add_local_python_source("src") + .add_local_python_source("scripts") + ) """ Run a pair of KernelBench format (problem, solution) to check if solution is correct and compute speedup From 4783431fd4330e8ca32e63e0b3ccdc7c8f339b5b Mon Sep 17 00:00:00 2001 From: Willy-Chan Date: Sat, 6 Dec 2025 20:25:42 -0800 Subject: [PATCH 03/14] run and check working with old .cu version lol --- scripts/run_and_check.py | 429 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 422 insertions(+), 7 deletions(-) diff --git a/scripts/run_and_check.py b/scripts/run_and_check.py index 04a61d57..24b2b09d 100644 --- a/scripts/run_and_check.py +++ b/scripts/run_and_check.py @@ -41,6 +41,7 @@ modal.Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.10") .apt_install("git", "gcc-10", "g++-10", "clang") .pip_install_from_requirements(os.path.join(REPO_TOP_PATH, "requirements.txt")) + .pip_install("pybind11") # Ensure pybind11 is available for ThunderKittens compilation .env({ "THUNDERKITTENS_ROOT": "/root/ThunderKittens", "THUNDERKITTENS_PATH": "/root/ThunderKittens", @@ -60,6 +61,7 @@ modal.Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.10") .apt_install("git", "gcc-10", "g++-10", "clang") .pip_install_from_requirements(os.path.join(REPO_TOP_PATH, "requirements.txt")) + .pip_install("pybind11") # Ensure pybind11 is available .add_local_dir(KERNEL_BENCH_PATH, remote_path="/root/KernelBench") .add_local_python_source("src") .add_local_python_source("scripts") @@ -70,8 +72,7 @@ You will need two files 1. Reference: PyTorch reference (module Model) implementation with init and input shapes -2. Solution: PyTorch solution (module ModelNew) with inline CUDA Code -Please see examples in src/prompts +2. Solution: PyTorch solution (module ModelNew) with inline CUDA Code OR separate .cu/.py files The Reference could be either 1. a local file: specify the path to the file @@ -90,12 +91,160 @@ 4. PyTorch reference is a kernelbench problem (modal eval on cloud GPU) python3 scripts/run_and_check.py ref_origin=kernelbench level= problem_id= kernel_src_path= eval_mode=modal gpu=L40S + +5. 
+5. ThunderKittens separate .cu and .py files (like original framework)
+python3 scripts/run_and_check.py ref_origin=kernelbench level=1 problem_id=1 cuda_src_path=results/eval_logs/Archive/1_1.cu kernel_src_path=results/eval_logs/Archive/1_1.py eval_mode=modal gpu=H100
 
 ====================================================
 """
 
 torch.set_printoptions(precision=4, threshold=10)
 
+
+def compile_thunderkittens_cuda(cuda_src_path: str, module_name: str = "tk_kernels",
+                                build_dir: str = None, verbose: bool = False) -> str:
+    """
+    Compile a ThunderKittens .cu file into a Python module.
+
+    Args:
+        cuda_src_path: Path to the .cu file
+        module_name: Name of the compiled module (default: tk_kernels)
+        build_dir: Build directory for compiled artifacts
+        verbose: Whether to print compilation output
+
+    Returns:
+        Path to the directory containing the compiled module
+    """
+    import subprocess
+    import sys
+    import tempfile
+
+    # Find ThunderKittens
+    tk_path = os.environ.get("THUNDERKITTENS_PATH") or os.environ.get("THUNDERKITTENS_ROOT")
+    if not tk_path:
+        # Try common locations
+        candidates = [
+            "/root/ThunderKittens",
+            os.path.join(REPO_TOP_PATH, "ThunderKittens"),
+            os.path.expanduser("~/ThunderKittens")
+        ]
+        for path in candidates:
+            if os.path.exists(os.path.join(path, "include", "kittens.cuh")):
+                tk_path = path
+                break
+
+    if not tk_path or not os.path.exists(tk_path):
+        raise RuntimeError(f"ThunderKittens not found. Set THUNDERKITTENS_PATH environment variable.")
+
+    print(f"[INFO] Using ThunderKittens at: {tk_path}")
+
+    # Read the CUDA source
+    with open(cuda_src_path, 'r') as f:
+        cuda_source = f.read()
+
+    # Create build directory
+    if build_dir is None:
+        build_dir = tempfile.mkdtemp(prefix="tk_build_")
+    os.makedirs(build_dir, exist_ok=True)
+
+    # Write the CUDA source to the build directory
+    cu_file = os.path.join(build_dir, f"{module_name}.cu")
+    with open(cu_file, 'w') as f:
+        f.write(cuda_source)
+
+    # Create setup.py for compilation
+    # Note: torch.utils.cpp_extension automatically includes pybind11 headers
+    # We don't need to import pybind11 - CUDAExtension handles it
+    setup_py = f'''
+import os
+from setuptools import setup
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+
+TK_PATH = "{tk_path}"
+
+setup(
+    name="{module_name}",
+    ext_modules=[
+        CUDAExtension(
+            name="{module_name}",
+            sources=["{cu_file}"],
+            include_dirs=[
+                TK_PATH,
+                os.path.join(TK_PATH, "include"),
+            ],
+            extra_compile_args={{
+                "cxx": ["-std=c++20", "-O3", "-fPIC"],
+                "nvcc": [
+                    "-std=c++20", "-O3",
+                    "-arch=sm_90a",
+                    "-DNDEBUG",
+                    "-DKITTENS_HOPPER",
+                    "--expt-relaxed-constexpr",
+                    "--expt-extended-lambda",
+                    "-Xcompiler", "-fPIC",
+                    "-diag-suppress=20012",
+                ],
+            }},
+            extra_link_args=["-lcuda"],
+            language="c++",
+        )
+    ],
+    cmdclass={{"build_ext": BuildExtension}},
+)
+'''
+
+    setup_file = os.path.join(build_dir, "setup.py")
+    with open(setup_file, 'w') as f:
+        f.write(setup_py)
+
+    # Compile the extension
+    print(f"[INFO] Compiling {cuda_src_path} as module '{module_name}'...")
+
+    env = os.environ.copy()
+    env["TORCH_CUDA_ARCH_LIST"] = "9.0"
+
+    try:
+        result = subprocess.run(
+            [sys.executable, "setup.py", "build_ext", "--inplace"],
+            cwd=build_dir,
+            capture_output=not verbose,
+            text=True,
+            env=env
+        )
+
+        if result.returncode != 0:
+            print(f"[ERROR] Compilation failed:")
+            if result.stdout:
+                print(result.stdout)
+            if result.stderr:
+                print(result.stderr)
+            raise RuntimeError(f"Failed to compile {cuda_src_path}")
+
+        if verbose and result.stdout:
+            print(result.stdout)
+
+    except Exception as e:
+        raise RuntimeError(f"Failed to compile {cuda_src_path}: {e}")
+
+    print(f"[INFO] Successfully compiled {module_name} to {build_dir}")
+    return build_dir
+
+
+def prepare_kernel_src_with_cuda(kernel_py_src: str, cuda_module_path: str, module_name: str = "tk_kernels") -> str:
+    """
+    Prepare the Python kernel source to use the pre-compiled CUDA module.
+    Adds the module path to sys.path so import works.
+    """
+    import_hook = f'''
+import sys
+import os
+# Add compiled CUDA module to path
+_tk_module_path = "{cuda_module_path}"
+if _tk_module_path not in sys.path:
+    sys.path.insert(0, _tk_module_path)
+'''
+    return import_hook + "\n" + kernel_py_src
+
 
 class ScriptConfig(Config):
     def __init__(self):
@@ -109,7 +258,14 @@ def __init__(self):
         self.level = ""
         self.problem_id = ""
         # Solution src definition
-        self.kernel_src_path = ""
+        self.kernel_src_path = ""  # .py file with ModelNew
+        # Optional: separate CUDA source file (ThunderKittens style)
+        # If provided, this .cu file will be compiled as tk_kernels module
+        # and the .py file (kernel_src_path) can import it
+        self.cuda_src_path = ""  # .cu file with PYBIND11_MODULE(tk_kernels, ...)
+
+        # Module name for the compiled CUDA kernel (default: tk_kernels)
+        self.cuda_module_name = "tk_kernels"
 
         # Evaluation mode
         self.eval_mode = "local"  # either "local" or "modal"
@@ -129,8 +285,8 @@ def __init__(self):
         self.clear_cache = False  # TODO
 
         # Replace with your NVIDIA GPU architecture, e.g. ["Hopper"]
-        self.gpu_arch = ["Ada"]
-        self.precision = "fp32"
+        self.gpu_arch = ["Hopper"]
+        self.precision = "fp16"
         self.backend = "cuda"
 
     def __repr__(self):
@@ -187,12 +343,202 @@ def evaluate_single_sample_src(ref_arch_src: str, kernel_src: str, configs: dict
     eval_result
     return eval_result
 
 
+# Helper function for compiling CUDA on Modal using nvcc directly (like the Makefile)
+def _compile_cuda_on_modal(cuda_src: str, module_name: str, gpu_arch: list):
+    """Compile CUDA source on Modal using nvcc directly (matching the Makefile approach)"""
+    import subprocess
+    import sys
+    import tempfile
+    from src.utils import set_gpu_arch
+
+    set_gpu_arch(gpu_arch)
+
+    # Find ThunderKittens
+    tk_path = os.environ.get("THUNDERKITTENS_PATH") or os.environ.get("THUNDERKITTENS_ROOT") or "/root/ThunderKittens"
+    if not os.path.exists(os.path.join(tk_path, "include", "kittens.cuh")):
+        raise RuntimeError(f"ThunderKittens not found at {tk_path}")
+
+    print(f"[Modal] Using ThunderKittens at: {tk_path}")
+
+    # Create build directory
+    build_dir = tempfile.mkdtemp(prefix="tk_modal_build_")
+    os.makedirs(build_dir, exist_ok=True)
+
+    # Write the CUDA source
+    cu_file = os.path.join(build_dir, f"{module_name}.cu")
+    with open(cu_file, 'w') as f:
+        f.write(cuda_src)
+
+    # Get pybind11 includes - try command line first, then find in site-packages
+    pybind11_includes = ""
+    try:
+        pybind11_result = subprocess.run(
+            [sys.executable, "-m", "pybind11", "--includes"],
+            capture_output=True,
+            text=True,
+            check=True
+        )
+        pybind11_includes = pybind11_result.stdout.strip()
+    except:
+        # Fallback: find pybind11 in site-packages
+        import site
+        import glob
+        for site_pkg in site.getsitepackages():
+            pybind11_paths = glob.glob(os.path.join(site_pkg, "pybind11", "include"))
+            if pybind11_paths:
+                pybind11_includes = f"-I{pybind11_paths[0]}"
+                break
+
+    # If still not found, try common locations
+    if not pybind11_includes:
+        common_paths = [
+            "/usr/local/include/pybind11",
+            "/usr/include/pybind11",
+            os.path.expanduser("~/.local/include/pybind11"),
+        ]
+        for path in common_paths:
+            if os.path.exists(path):
+                pybind11_includes = f"-I{path}"
+                break
+
+    if not pybind11_includes:
+        print("[Modal WARNING] pybind11 includes not found, compilation may fail")
+
+    # Get Python config - try python3-config first, then python-config
+    python_ldflags = ""
+    try:
+        python_config_result = subprocess.run(
+            ["python3-config", "--ldflags"],
+            capture_output=True,
+            text=True,
+            check=True
+        )
+        python_ldflags = python_config_result.stdout.strip()
+    except:
+        try:
+            python_config_result = subprocess.run(
+                ["python-config", "--ldflags"],
+                capture_output=True,
+                text=True,
+                check=True
+            )
+            python_ldflags = python_config_result.stdout.strip()
+        except:
+            # Fallback - try to construct from sysconfig
+            import sysconfig
+            python_ldflags = f"-L{sysconfig.get_config_var('LIBDIR')} -lpython{sys.version_info.major}.{sys.version_info.minor}"
+
+    # Get Python extension suffix
+    try:
+        ext_suffix_result = subprocess.run(
+            ["python3-config", "--extension-suffix"],
+            capture_output=True,
+            text=True,
+            check=True
+        )
+        ext_suffix = ext_suffix_result.stdout.strip()
+    except:
+        try:
+            ext_suffix_result = subprocess.run(
+                ["python-config", "--extension-suffix"],
+                capture_output=True,
+                text=True,
+                check=True
+            )
+            ext_suffix = ext_suffix_result.stdout.strip()
+        except:
+            # Fallback
+            import sysconfig
+            ext_suffix = sysconfig.get_config_var('EXT_SUFFIX') or '.so'
+
+    # Build nvcc command matching the Makefile
+    output_so = os.path.join(build_dir, f"{module_name}{ext_suffix}")
+
+    # Parse pybind11 includes (they come as "-I/path1 -I/path2")
+    pybind11_include_list = pybind11_includes.split() if pybind11_includes else []
+
+    # Parse python ldflags (they come as "-L/path -lpython3.10 ...")
+    python_ldflags_list = python_ldflags.split() if python_ldflags else []
+
+    nvcc_flags = [
+        "-DNDEBUG",
+        "-Xcompiler", "-fPIE",
+        "--expt-extended-lambda",
+        "--expt-relaxed-constexpr",
+        "-Xcompiler", "-Wno-psabi",
+        "-Xcompiler", "-fno-strict-aliasing",
+        "--use_fast_math",
+        "-forward-unknown-to-host-compiler",
+        "-O3",
+        "-Xnvlink=--verbose",
+        "-Xptxas=--verbose",
+        "-Xptxas=--warn-on-spills",
+        "-std=c++20",
+        "-x", "cu",
+        "-lrt", "-lpthread", "-ldl", "-lcuda", "-lcudadevrt", "-lcudart_static", "-lcublas",
+        f"-I{tk_path}/include",
+    ]
+
+    # Add prototype include if it exists
+    if os.path.exists(os.path.join(tk_path, "prototype")):
+        nvcc_flags.append(f"-I{tk_path}/prototype")
+
+    nvcc_flags.extend(pybind11_include_list)
+    nvcc_flags.extend(python_ldflags_list)
+    nvcc_flags.extend([
+        "-shared",
+        "-fPIC",
+        f"-lpython{sys.version_info.major}.{sys.version_info.minor}",
+        "-DKITTENS_HOPPER",
+        "-arch=sm_90a",
+        cu_file,
+        "-o", output_so
+    ])
+
+    # Filter out empty strings
+    nvcc_flags = [f for f in nvcc_flags if f]
+
+    print(f"[Modal] Compiling {module_name} with nvcc...")
+    print(f"[Modal] Build directory: {build_dir}")
+    print(f"[Modal] CUDA file: {cu_file}")
+    print(f"[Modal] Output: {output_so}")
+
+    # Run nvcc
+    result = subprocess.run(
+        ["nvcc"] + nvcc_flags,
+        cwd=build_dir,
+        capture_output=True,
+        text=True
+    )
+
+    # Always print output for debugging
+    if result.stdout:
+        print(f"[Modal] Compilation stdout:\n{result.stdout}")
+    if result.stderr:
+        print(f"[Modal] Compilation stderr:\n{result.stderr}")
+
+    if result.returncode != 0:
+        print(f"[Modal ERROR] Compilation failed with return code {result.returncode}")
+        print(f"[Modal ERROR] Full stdout:\n{result.stdout}")
+        print(f"[Modal ERROR] Full stderr:\n{result.stderr}")
+        raise RuntimeError(f"Failed to compile CUDA module: {result.stderr[:500] if result.stderr else 'Unknown error'}")
+
+    # Verify the .so file was created
+    if not os.path.exists(output_so):
+        raise RuntimeError(f"Compilation succeeded but .so file not found: {output_so}")
+
+    print(f"[Modal] Successfully compiled {module_name}")
+    print(f"[Modal] Generated .so file: {output_so}")
+    return build_dir
+
+
 # Modal evaluation class
 @app.cls(image=image, scaledown_window=5)
 class EvalFunc:
     @modal.method()
-    def evaluate_single_sample_src_modal(self, ref_arch_src: str, kernel_src: str, configs: dict, gpu_arch: list):
+    def evaluate_single_sample_src_modal(self, ref_arch_src: str, kernel_src: str, configs: dict, gpu_arch: list,
+                                         cuda_src: str = None, cuda_module_name: str = "tk_kernels"):
         """Evaluate a single sample source code against a reference source code on Modal"""
         from src.utils import set_gpu_arch
         from src.eval import eval_kernel_against_ref, get_torch_dtype_from_string
 
         set_gpu_arch(gpu_arch)
         device = torch.device("cuda:0")
 
+        # If CUDA source provided, compile it first
+        if cuda_src:
+            cuda_module_path = _compile_cuda_on_modal(cuda_src, cuda_module_name, gpu_arch)
+
+            # Modify kernel_src to import the compiled module
+            import_hook = f'''
+import sys
+import os
+_tk_module_path = "{cuda_module_path}"
+if _tk_module_path not in sys.path:
+    sys.path.insert(0, _tk_module_path)
+'''
+            kernel_src = import_hook + "\n" + kernel_src
+            print(f"[Modal] Modified kernel source to use compiled module at {cuda_module_path}")
+
         num_correct_trials = configs["num_correct_trials"]
         num_perf_trials = configs["num_perf_trials"]
         verbose = configs["verbose"]
@@ -282,6 +643,50 @@ def main(config: ScriptConfig):
         raise ValueError("Invalid ref_origin")
 
     kernel_src = read_file(config.kernel_src_path)
+
+    # Handle separate .cu file if provided (ThunderKittens style)
+    # For modal mode, compilation happens on the remote - skip local compilation
+    cuda_module_path = None
+    if config.cuda_src_path and config.eval_mode == "local":
+        print(f"[INFO] Separate CUDA source provided: {config.cuda_src_path}")
+
+        # Create a unique build directory based on the cuda source hash
+        cuda_src_content = read_file(config.cuda_src_path)
+        cuda_hash = str(hash(cuda_src_content))
+        cuda_build_dir = os.path.join(
+            config.build_dir_prefix if config.build_dir_prefix else os.path.join(REPO_TOP_PATH, "cache"),
+            "tk_cuda_build",
+            cuda_hash
+        )
+
+        # Compile the CUDA module if not in cache
+        so_file = os.path.join(cuda_build_dir, f"{config.cuda_module_name}*.so")
+        import glob
+        existing_so = glob.glob(so_file)
+
+        if existing_so and not config.clear_cache:
+            print(f"[INFO] Using cached compiled module from {cuda_build_dir}")
+            cuda_module_path = cuda_build_dir
+        else:
+            if config.clear_cache and os.path.exists(cuda_build_dir):
+                print(f"[INFO] Clearing CUDA cache: {cuda_build_dir}")
+                shutil.rmtree(cuda_build_dir, ignore_errors=True)
+
+            cuda_module_path = compile_thunderkittens_cuda(
+                cuda_src_path=config.cuda_src_path,
+                module_name=config.cuda_module_name,
+                build_dir=cuda_build_dir,
+                verbose=config.verbose
+            )
+
+        # Modify kernel_src to import the compiled module
+        kernel_src = prepare_kernel_src_with_cuda(kernel_src, cuda_module_path, config.cuda_module_name)
+
+        if config.verbose:
+            print(f"[DEBUG] Modified kernel source with CUDA module path: {cuda_module_path}")
+    elif config.cuda_src_path and config.eval_mode == "modal":
+        print(f"[INFO] Separate CUDA source provided: {config.cuda_src_path}")
print(f"[INFO] CUDA compilation will happen on Modal (remote GPU)") # Start Evaluation assert config.eval_mode in ["local", "modal"], "eval_mode must be either 'local' or 'modal'" @@ -325,6 +730,14 @@ def main(config: ScriptConfig): # Modal evaluation (remote execution) gpu_arch = gpu_arch_mapping.get(config.gpu, config.gpu_arch) print(f"[INFO] Using GPU: {config.gpu} with architecture: {gpu_arch}") + + # Read CUDA source if provided (will be compiled on Modal) + cuda_src = None + if config.cuda_src_path: + print(f"[INFO] Will compile CUDA source on Modal: {config.cuda_src_path}") + cuda_src = read_file(config.cuda_src_path) + # For Modal, we use the original kernel_src (without local path modifications) + kernel_src = read_file(config.kernel_src_path) with app.run(): print("[INFO] Evaluating kernel against reference code (MODAL)") @@ -335,7 +748,9 @@ def main(config: ScriptConfig): ref_arch_src=ref_arch_src, kernel_src=kernel_src, configs=config.to_dict(), - gpu_arch=gpu_arch + gpu_arch=gpu_arch, + cuda_src=cuda_src, + cuda_module_name=config.cuda_module_name ) kernel_exec_time = kernel_eval_result.runtime From 1241d3a57ab18d6be859a75ceaa743bb9b55d824 Mon Sep 17 00:00:00 2001 From: Willy-Chan Date: Sat, 6 Dec 2025 20:33:51 -0800 Subject: [PATCH 04/14] eval_from_generations seems to use new .cu and .py framework --- scripts/eval_from_generations.py | 412 ++++++++++++++++++++++++++++++- 1 file changed, 402 insertions(+), 10 deletions(-) diff --git a/scripts/eval_from_generations.py b/scripts/eval_from_generations.py index 9c2ae3c4..020b5d70 100644 --- a/scripts/eval_from_generations.py +++ b/scripts/eval_from_generations.py @@ -70,6 +70,7 @@ modal.Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.10") .apt_install("git", "gcc-10", "g++-10", "clang") .pip_install_from_requirements(os.path.join(REPO_TOP_DIR, "requirements.txt")) + .pip_install("pybind11") # Ensure pybind11 is available for ThunderKittens compilation .env({ "THUNDERKITTENS_ROOT": "/root/ThunderKittens", "THUNDERKITTENS_PATH": "/root/ThunderKittens", @@ -87,6 +88,7 @@ modal.Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.10") .apt_install("git", "gcc-10", "g++-10", "clang") .pip_install_from_requirements(os.path.join(REPO_TOP_DIR, "requirements.txt")) + .pip_install("pybind11") # Ensure pybind11 is available .add_local_dir(KERNEL_BENCH_PATH, remote_path="/root/KernelBench") .add_local_dir(SRC_PATH, remote_path="/root/src") ) @@ -166,6 +168,195 @@ class WorkArgs: device: torch.device +# Helper function for compiling CUDA on Modal using nvcc directly (like the Makefile) +def _compile_cuda_on_modal(cuda_src: str, module_name: str, gpu_arch: list): + """Compile CUDA source on Modal using nvcc directly (matching the Makefile approach)""" + import subprocess + import sys + import tempfile + from src.utils import set_gpu_arch + + set_gpu_arch(gpu_arch) + + # Find ThunderKittens + tk_path = os.environ.get("THUNDERKITTENS_PATH") or os.environ.get("THUNDERKITTENS_ROOT") or "/root/ThunderKittens" + if not os.path.exists(os.path.join(tk_path, "include", "kittens.cuh")): + raise RuntimeError(f"ThunderKittens not found at {tk_path}") + + print(f"[Modal] Using ThunderKittens at: {tk_path}") + + # Create build directory + build_dir = tempfile.mkdtemp(prefix="tk_modal_build_") + os.makedirs(build_dir, exist_ok=True) + + # Write the CUDA source + cu_file = os.path.join(build_dir, f"{module_name}.cu") + with open(cu_file, 'w') as f: + f.write(cuda_src) + + # Get pybind11 includes - try command line first, then find in site-packages 
+    pybind11_includes = ""
+    try:
+        pybind11_result = subprocess.run(
+            [sys.executable, "-m", "pybind11", "--includes"],
+            capture_output=True,
+            text=True,
+            check=True
+        )
+        pybind11_includes = pybind11_result.stdout.strip()
+    except:
+        # Fallback: find pybind11 in site-packages
+        import site
+        import glob
+        for site_pkg in site.getsitepackages():
+            pybind11_paths = glob.glob(os.path.join(site_pkg, "pybind11", "include"))
+            if pybind11_paths:
+                pybind11_includes = f"-I{pybind11_paths[0]}"
+                break
+
+    # If still not found, try common locations
+    if not pybind11_includes:
+        common_paths = [
+            "/usr/local/include/pybind11",
+            "/usr/include/pybind11",
+            os.path.expanduser("~/.local/include/pybind11"),
+        ]
+        for path in common_paths:
+            if os.path.exists(path):
+                pybind11_includes = f"-I{path}"
+                break
+
+    if not pybind11_includes:
+        print("[Modal WARNING] pybind11 includes not found, compilation may fail")
+
+    # Get Python config - try python3-config first, then python-config
+    python_ldflags = ""
+    try:
+        python_config_result = subprocess.run(
+            ["python3-config", "--ldflags"],
+            capture_output=True,
+            text=True,
+            check=True
+        )
+        python_ldflags = python_config_result.stdout.strip()
+    except:
+        try:
+            python_config_result = subprocess.run(
+                ["python-config", "--ldflags"],
+                capture_output=True,
+                text=True,
+                check=True
+            )
+            python_ldflags = python_config_result.stdout.strip()
+        except:
+            # Fallback - try to construct from sysconfig
+            import sysconfig
+            python_ldflags = f"-L{sysconfig.get_config_var('LIBDIR')} -lpython{sys.version_info.major}.{sys.version_info.minor}"
+
+    # Get Python extension suffix
+    try:
+        ext_suffix_result = subprocess.run(
+            ["python3-config", "--extension-suffix"],
+            capture_output=True,
+            text=True,
+            check=True
+        )
+        ext_suffix = ext_suffix_result.stdout.strip()
+    except:
+        try:
+            ext_suffix_result = subprocess.run(
+                ["python-config", "--extension-suffix"],
+                capture_output=True,
+                text=True,
+                check=True
+            )
+            ext_suffix = ext_suffix_result.stdout.strip()
+        except:
+            # Fallback
+            import sysconfig
+            ext_suffix = sysconfig.get_config_var('EXT_SUFFIX') or '.so'
+
+    # Build nvcc command matching the Makefile
+    output_so = os.path.join(build_dir, f"{module_name}{ext_suffix}")
+
+    # Parse pybind11 includes (they come as "-I/path1 -I/path2")
+    pybind11_include_list = pybind11_includes.split() if pybind11_includes else []
+
+    # Parse python ldflags (they come as "-L/path -lpython3.10 ...")
+    python_ldflags_list = python_ldflags.split() if python_ldflags else []
+
+    nvcc_flags = [
+        "-DNDEBUG",
+        "-Xcompiler", "-fPIE",
+        "--expt-extended-lambda",
+        "--expt-relaxed-constexpr",
+        "-Xcompiler", "-Wno-psabi",
+        "-Xcompiler", "-fno-strict-aliasing",
+        "--use_fast_math",
+        "-forward-unknown-to-host-compiler",
+        "-O3",
+        "-Xnvlink=--verbose",
+        "-Xptxas=--verbose",
+        "-Xptxas=--warn-on-spills",
+        "-std=c++20",
+        "-x", "cu",
+        "-lrt", "-lpthread", "-ldl", "-lcuda", "-lcudadevrt", "-lcudart_static", "-lcublas",
+        f"-I{tk_path}/include",
+    ]
+
+    # Add prototype include if it exists
+    if os.path.exists(os.path.join(tk_path, "prototype")):
+        nvcc_flags.append(f"-I{tk_path}/prototype")
+
+    nvcc_flags.extend(pybind11_include_list)
+    nvcc_flags.extend(python_ldflags_list)
+    nvcc_flags.extend([
+        "-shared",
+        "-fPIC",
+        f"-lpython{sys.version_info.major}.{sys.version_info.minor}",
+        "-DKITTENS_HOPPER",
+        "-arch=sm_90a",
+        cu_file,
+        "-o", output_so
+    ])
+
+    # Filter out empty strings
+    nvcc_flags = [f for f in nvcc_flags if f]
+
+    print(f"[Modal] Compiling {module_name} with nvcc...")
print(f"[Modal] Build directory: {build_dir}") + print(f"[Modal] CUDA file: {cu_file}") + print(f"[Modal] Output: {output_so}") + + # Run nvcc + result = subprocess.run( + ["nvcc"] + nvcc_flags, + cwd=build_dir, + capture_output=True, + text=True + ) + + # Always print output for debugging + if result.stdout: + print(f"[Modal] Compilation stdout:\n{result.stdout}") + if result.stderr: + print(f"[Modal] Compilation stderr:\n{result.stderr}") + + if result.returncode != 0: + print(f"[Modal ERROR] Compilation failed with return code {result.returncode}") + print(f"[Modal ERROR] Full stdout:\n{result.stdout}") + print(f"[Modal ERROR] Full stderr:\n{result.stderr}") + raise RuntimeError(f"Failed to compile CUDA module: {result.stderr[:500] if result.stderr else 'Unknown error'}") + + # Verify the .so file was created + if not os.path.exists(output_so): + raise RuntimeError(f"Compilation succeeded but .so file not found: {output_so}") + + print(f"[Modal] Successfully compiled {module_name}") + print(f"[Modal] Generated .so file: {output_so}") + return build_dir + + # Modal Evaluation Class # GPU must be specified here for all instances # Retries are configured at the class level to handle GPU attachment failures @@ -192,10 +383,15 @@ def evaluate_single_sample_modal( verbose: bool = False, backend: str = "cuda", precision: str = "fp32", + cuda_src: str = None, + cuda_module_name: str = "tk_kernels", ): """ Evaluate a single sample on Modal GPU with automatic retries for GPU attachment failures and proper GPU corruption handling via stop_fetching_inputs() + + If cuda_src is provided, it will be compiled first and the kernel_src will be modified + to import the compiled module. """ from src.eval import eval_kernel_against_ref, get_torch_dtype_from_string from src.utils import set_gpu_arch @@ -222,6 +418,21 @@ def evaluate_single_sample_modal( set_gpu_arch(gpu_arch) + # If CUDA source provided, compile it first + if cuda_src: + cuda_module_path = _compile_cuda_on_modal(cuda_src, cuda_module_name, gpu_arch) + + # Modify kernel_src to import the compiled module + import_hook = f''' +import sys +import os +_tk_module_path = "{cuda_module_path}" +if _tk_module_path not in sys.path: + sys.path.insert(0, _tk_module_path) +''' + kernel_src = import_hook + "\n" + kernel_src + print(f"[Modal] Modified kernel source to use compiled module at {cuda_module_path}") + gpu_corrupted = False try: result = eval_kernel_against_ref( @@ -291,20 +502,173 @@ def fetch_ref_arch_from_problem_id( return ref_arch_src +def compile_thunderkittens_cuda(cuda_src_path: str, module_name: str = "tk_kernels", + build_dir: str = None, verbose: bool = False) -> str: + """ + Compile a ThunderKittens .cu file into a Python module (for local evaluation). 
+
+    Args:
+        cuda_src_path: Path to the .cu file
+        module_name: Name of the compiled module (default: tk_kernels)
+        build_dir: Build directory for compiled artifacts
+        verbose: Whether to print compilation output
+
+    Returns:
+        Path to the directory containing the compiled module
+    """
+    import subprocess
+    import sys
+    import tempfile
+
+    # Find ThunderKittens
+    tk_path = os.environ.get("THUNDERKITTENS_PATH") or os.environ.get("THUNDERKITTENS_ROOT")
+    if not tk_path:
+        # Try common locations
+        candidates = [
+            os.path.join(REPO_TOP_DIR, "ThunderKittens"),
+            os.path.expanduser("~/ThunderKittens")
+        ]
+        for path in candidates:
+            if os.path.exists(os.path.join(path, "include", "kittens.cuh")):
+                tk_path = path
+                break
+
+    if not tk_path or not os.path.exists(tk_path):
+        raise RuntimeError(f"ThunderKittens not found. Set THUNDERKITTENS_PATH environment variable.")
+
+    print(f"[INFO] Using ThunderKittens at: {tk_path}")
+
+    # Read the CUDA source
+    with open(cuda_src_path, 'r') as f:
+        cuda_source = f.read()
+
+    # Create build directory
+    if build_dir is None:
+        build_dir = tempfile.mkdtemp(prefix="tk_build_")
+    os.makedirs(build_dir, exist_ok=True)
+
+    # Write the CUDA source to the build directory
+    cu_file = os.path.join(build_dir, f"{module_name}.cu")
+    with open(cu_file, 'w') as f:
+        f.write(cuda_source)
+
+    # Create setup.py for compilation
+    setup_py = f'''
+import os
+from setuptools import setup
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+
+TK_PATH = "{tk_path}"
+
+setup(
+    name="{module_name}",
+    ext_modules=[
+        CUDAExtension(
+            name="{module_name}",
+            sources=["{cu_file}"],
+            include_dirs=[
+                TK_PATH,
+                os.path.join(TK_PATH, "include"),
+            ],
+            extra_compile_args={{
+                "cxx": ["-std=c++20", "-O3", "-fPIC"],
+                "nvcc": [
+                    "-std=c++20", "-O3",
+                    "-arch=sm_90a",
+                    "-DNDEBUG",
+                    "-DKITTENS_HOPPER",
+                    "--expt-relaxed-constexpr",
+                    "--expt-extended-lambda",
+                    "-Xcompiler", "-fPIC",
+                    "-diag-suppress=20012",
+                ],
+            }},
+            extra_link_args=["-lcuda"],
+            language="c++",
+        )
+    ],
+    cmdclass={{"build_ext": BuildExtension}},
+)
+'''
+
+    setup_file = os.path.join(build_dir, "setup.py")
+    with open(setup_file, 'w') as f:
+        f.write(setup_py)
+
+    # Compile the extension
+    print(f"[INFO] Compiling {cuda_src_path} as module '{module_name}'...")
+
+    env = os.environ.copy()
+    env["TORCH_CUDA_ARCH_LIST"] = "9.0"
+
+    try:
+        result = subprocess.run(
+            [sys.executable, "setup.py", "build_ext", "--inplace"],
+            cwd=build_dir,
+            capture_output=not verbose,
+            text=True,
+            env=env
+        )
+
+        if result.returncode != 0:
+            print(f"[ERROR] Compilation failed:")
+            if result.stdout:
+                print(result.stdout)
+            if result.stderr:
+                print(result.stderr)
+            raise RuntimeError(f"Failed to compile {cuda_src_path}")
+
+        if verbose and result.stdout:
+            print(result.stdout)
+
+    except Exception as e:
+        raise RuntimeError(f"Failed to compile {cuda_src_path}: {e}")
+
+    print(f"[INFO] Successfully compiled {module_name} to {build_dir}")
+    return build_dir
+
+
+def prepare_kernel_src_with_cuda(kernel_py_src: str, cuda_module_path: str, module_name: str = "tk_kernels") -> str:
+    """
+    Prepare the Python kernel source to use the pre-compiled CUDA module.
+    Adds the module path to sys.path so import works.
+    """
+    import_hook = f'''
+import sys
+import os
+# Add compiled CUDA module to path
+_tk_module_path = "{cuda_module_path}"
+if _tk_module_path not in sys.path:
+    sys.path.insert(0, _tk_module_path)
+'''
+    return import_hook + "\n" + kernel_py_src
+
+
 def fetch_kernel_from_disk(
     run_dir: str, level: int, problem_id: int, sample_id: int
-) -> str | None:
+) -> tuple[str | None, str | None]:
     """
-    Fetch kernel file from disk (stored in runs/{run_name})
+    Fetch kernel files from disk (stored in runs/{run_name})
+    Returns: (kernel_py_src, cuda_src_path) tuple
+    - kernel_py_src: Python kernel source code (or None if not found)
+    - cuda_src_path: Path to .cu file if it exists (or None)
     """
     kernel_path = os.path.join(
         run_dir, f"level_{level}_problem_{problem_id}_sample_{sample_id}_kernel.py"
     )
+    cuda_path = os.path.join(
+        run_dir, f"level_{level}_problem_{problem_id}_sample_{sample_id}_kernel.cu"
+    )
 
+    kernel_py_src = None
     if os.path.exists(kernel_path):
-        return read_file(kernel_path)
-    else:
-        return None
+        kernel_py_src = read_file(kernel_path)
+
+    cuda_src_path = None
+    if os.path.exists(cuda_path):
+        cuda_src_path = cuda_path
+
+    return (kernel_py_src, cuda_src_path)
 
 
 def evaluate_single_sample(
@@ -325,11 +689,30 @@
     # fetch kernel from disk
     # Add database support in the future
-    kernel_src = fetch_kernel_from_disk(run_dir, configs.level, problem_id, sample_id)
+    kernel_py_src, cuda_src_path = fetch_kernel_from_disk(run_dir, configs.level, problem_id, sample_id)
 
     assert (
-        kernel_src is not None
+        kernel_py_src is not None
     ), f"Kernel not found for problem {problem_id} sample {sample_id}"
+
+    # For local evaluation, if CUDA source exists, compile it first
+    kernel_src = kernel_py_src
+    if cuda_src_path:
+        # Create build directory
+        cuda_build_dir = os.path.join(
+            configs.kernel_eval_build_dir, configs.run_name, f"{problem_id}", f"{sample_id}", "cuda_build"
+        )
+
+        # Compile CUDA module
+        cuda_module_path = compile_thunderkittens_cuda(
+            cuda_src_path=cuda_src_path,
+            module_name="tk_kernels",
+            build_dir=cuda_build_dir,
+            verbose=configs.verbose
+        )
+
+        # Modify kernel_src to import the compiled module
+        kernel_src = prepare_kernel_src_with_cuda(kernel_src, cuda_module_path, "tk_kernels")
 
     build_dir = os.path.join(
         configs.kernel_eval_build_dir, configs.run_name, f"{problem_id}", f"{sample_id}"
@@ -482,17 +865,24 @@ def batch_eval_modal(
         ref_arch_src = fetch_ref_arch_from_problem_id(
             curr_level_dataset, problem_id, config.dataset_src
         )
-        kernel_src = fetch_kernel_from_disk(run_dir, config.level, problem_id, sample_id)
+        kernel_py_src, cuda_src_path = fetch_kernel_from_disk(run_dir, config.level, problem_id, sample_id)
 
-        if kernel_src is None:
+        if kernel_py_src is None:
             print(f"[WARNING] Kernel not found for problem {problem_id} sample {sample_id}")
             work_items.append(None)
         else:
+            # Read CUDA source if it exists
+            cuda_src = None
+            if cuda_src_path:
+                cuda_src = read_file(cuda_src_path)
+                print(f"[INFO] Found CUDA source for problem {problem_id} sample {sample_id}: {cuda_src_path}")
+
             work_items.append({
                 'problem_id': problem_id,
                 'sample_id': sample_id,
                 'ref_arch_src': ref_arch_src,
-                'kernel_src': kernel_src,
+                'kernel_src': kernel_py_src,
+                'cuda_src': cuda_src,
             })
 
     # Submit all evaluations in parallel using Modal
@@ -521,6 +911,8 @@
                 verbose=config.verbose,
                 backend=config.backend,
                 precision=config.precision,
+                cuda_src=item.get('cuda_src'),
+                cuda_module_name="tk_kernels",
             )
             futures.append(future)
 
From 10cd1af48706d328f85722d201efed9a1424da8f Mon Sep 17 00:00:00 2001
From: Willy-Chan
Date: Sun, 7 Dec 2025 14:47:48 -0800
Subject: [PATCH 05/14] working generated samples

---
 scripts/generate_samples.py                  | 155 ++++++++--
 src/prompt_constructor_toml.py               |   8 +-
 .../model_new_ex_add_thunderkittens.py       | 152 +--------
 src/prompts/prompts.toml                     | 290 ++++++++++++++++++
 src/utils.py                                 | 153 +++++++++
 5 files changed, 602 insertions(+), 156 deletions(-)

diff --git a/scripts/generate_samples.py b/scripts/generate_samples.py
index 82f2c6c6..80de44c9 100644
--- a/scripts/generate_samples.py
+++ b/scripts/generate_samples.py
@@ -14,6 +14,7 @@
 from src.utils import (
     create_inference_server_from_presets,
     extract_first_code,
+    extract_cuda_and_python_code,
     maybe_multithread,
     read_file,
     set_gpu_arch,
@@ -157,23 +158,112 @@ def generate_sample_single(
         f.write(custom_prompt)
 
     # Query server with constructed prompt
-    custom_kernel = inference_server(custom_prompt)
-    custom_kernel = extract_first_code(custom_kernel, ["python", "cpp"])
-    # check LLM is able to generate custom CUDA code
-    assert custom_kernel is not None, "Custom CUDA code generation failed"
-
-    if config.verbose:
-        print(
-            f"Generated sample {work.sample_id} for problem {problem_number}: {problem_name}"
+    custom_kernel_response = inference_server(custom_prompt)
+
+    # For ThunderKittens, extract both CUDA and Python code blocks
+    if config.backend == "thunderkittens":
+        # Save raw response for debugging
+        raw_response_path = os.path.join(
+            run_dir,
+            f"level_{config.level}_problem_{work.problem_id}_sample_{work.sample_id}_raw_response.txt",
+        )
+        with open(raw_response_path, "w") as f:
+            f.write(custom_kernel_response)
+
+        # Try to extract both code blocks
+        cuda_code, python_code = extract_cuda_and_python_code(custom_kernel_response)
+
+        # Fallback: if extraction failed, try to extract a single code block and split it
+        if cuda_code is None or python_code is None:
+            print(f"[WARNING] Failed to extract both code blocks for problem {work.problem_id} sample {work.sample_id}")
+            print(f"  - CUDA code found: {cuda_code is not None}")
+            print(f"  - Python code found: {python_code is not None}")
+
+            # Try fallback: extract first code block and see if we can split it
+            single_code = extract_first_code(custom_kernel_response, ["python", "cpp", "cuda", "cu"])
+            if single_code:
+                # Try to split by looking for PYBIND11_MODULE or other markers
+                if "PYBIND11_MODULE" in single_code:
+                    # This looks like CUDA code, try to find Python code separately
+                    if python_code is None:
+                        # Try to extract Python code from remaining response
+                        python_code = extract_first_code(custom_kernel_response.replace(single_code, ""), ["python"])
+                    if cuda_code is None:
+                        cuda_code = single_code
+                elif "import torch" in single_code or "class ModelNew" in single_code:
+                    # This looks like Python code
+                    if python_code is None:
+                        python_code = single_code
+                    # Try to find CUDA code
+                    if cuda_code is None:
+                        # Look for other code blocks
+                        remaining = custom_kernel_response.replace(f"```python\n{single_code}\n```", "")
+                        cuda_code = extract_first_code(remaining, ["cpp", "cuda", "cu"])
+                else:
+                    # Unknown format, try to use as CUDA if we don't have it
+                    if cuda_code is None:
+                        cuda_code = single_code
+
+        # Write out both files even if empty (for debugging)
+        # Store CUDA file (.cu)
+        cuda_path = os.path.join(
+            run_dir,
+            f"level_{config.level}_problem_{work.problem_id}_sample_{work.sample_id}_kernel.cu",
+        )
+        with open(cuda_path, "w") as f:
+            if cuda_code:
+                f.write(cuda_code)
+            else:
f.write(f"# CUDA code extraction failed for problem {work.problem_id} sample {work.sample_id}\n") + f.write(f"# Raw response saved to: {os.path.basename(raw_response_path)}\n") + + # Store Python file (.py) + kernel_path = os.path.join( + run_dir, + f"level_{config.level}_problem_{work.problem_id}_sample_{work.sample_id}_kernel.py", ) + with open(kernel_path, "w") as f: + if python_code: + f.write(python_code) + else: + f.write(f"# Python code extraction failed for problem {work.problem_id} sample {work.sample_id}\n") + f.write(f"# Raw response saved to: {os.path.basename(raw_response_path)}\n") + + if config.verbose: + print( + f"Generated sample {work.sample_id} for problem {problem_number}: {problem_name}" + ) + if cuda_code: + print(f" - CUDA code: {len(cuda_code)} characters") + else: + print(f" - CUDA code: NOT FOUND") + if python_code: + print(f" - Python code: {len(python_code)} characters") + else: + print(f" - Python code: NOT FOUND") + print(f" - Raw response saved to: {os.path.basename(raw_response_path)}") + + # Warn if extraction failed but don't fail + if cuda_code is None or python_code is None: + print(f"[WARNING] Partial extraction for problem {work.problem_id} sample {work.sample_id}. Check raw_response.txt file.") + else: + # For other backends, extract single code block (Python or inline CUDA) + custom_kernel = extract_first_code(custom_kernel_response, ["python", "cpp"]) + # check LLM is able to generate custom code + assert custom_kernel is not None, "Custom code generation failed" + + if config.verbose: + print( + f"Generated sample {work.sample_id} for problem {problem_number}: {problem_name}" + ) - # Store to local file - kernel_path = os.path.join( - run_dir, - f"level_{config.level}_problem_{work.problem_id}_sample_{work.sample_id}_kernel.py", - ) - with open(kernel_path, "w") as f: - f.write(custom_kernel) + # Store to local file + kernel_path = os.path.join( + run_dir, + f"level_{config.level}_problem_{work.problem_id}_sample_{work.sample_id}_kernel.py", + ) + with open(kernel_path, "w") as f: + f.write(custom_kernel) return True @@ -193,15 +283,26 @@ def generate_sample_launcher( def check_kernel_exists( - run_dir: str, level: int, problem_id: int, sample_id: int + run_dir: str, level: int, problem_id: int, sample_id: int, backend: str = "cuda" ) -> bool: """ - Check if a kernel for a given problem and sample ID already exists in the run directory + Check if a kernel for a given problem and sample ID already exists in the run directory. + For ThunderKittens, checks for both .cu and .py files. + For other backends, only checks for .py file. 
""" kernel_path = os.path.join( run_dir, f"level_{level}_problem_{problem_id}_sample_{sample_id}_kernel.py" ) - return os.path.exists(kernel_path) + + if backend == "thunderkittens": + # For ThunderKittens, both .cu and .py files must exist + cuda_path = os.path.join( + run_dir, f"level_{level}_problem_{problem_id}_sample_{sample_id}_kernel.cu" + ) + return os.path.exists(kernel_path) and os.path.exists(cuda_path) + else: + # For other backends, only .py file is needed + return os.path.exists(kernel_path) @pydra.main(base=GenerationConfig) @@ -307,7 +408,7 @@ def main(config: GenerationConfig): ): # end index is inclusive for sample_id in range(config.num_samples): total_problems += 1 - if not check_kernel_exists(run_dir, config.level, problem_id, sample_id): + if not check_kernel_exists(run_dir, config.level, problem_id, sample_id, config.backend): problems_to_run.append( WorkArgs(problem_id=int(problem_id), sample_id=sample_id) ) @@ -358,3 +459,19 @@ def main(config: GenerationConfig): if __name__ == "__main__": main() + + + +# python scripts/generate_samples.py \ +# run_name=new_tk \ +# runs_dir=/Users/willychan/Desktop/projects/dsl-monkeys/runs \ +# dataset_src=local \ +# level=1 \ +# subset="(50,50)" \ +# num_samples=1 \ +# num_workers=50 \ +# server_type=google \ +# model_name=gemini/gemini-3-pro-preview \ +# temperature=1.0 \ +# max_tokens=60000 \ +# backend=thunderkittens \ No newline at end of file diff --git a/src/prompt_constructor_toml.py b/src/prompt_constructor_toml.py index fc074494..5051e17c 100644 --- a/src/prompt_constructor_toml.py +++ b/src/prompt_constructor_toml.py @@ -199,7 +199,13 @@ def render_prompt_by_option( # Fill in shared templates with backend-specific terms problem_statement = shared.get("problem_statement", "").format(backend_display=backend_display) - instruction = shared.get("instruction", "").format(backend_display=backend_display) + + # Use ThunderKittens-specific instruction if backend is thunderkittens + if backend.lower() == "thunderkittens": + # ThunderKittens instruction doesn't use format placeholders, so don't format it + instruction = shared.get("instruction_thunderkittens", shared.get("instruction", "")) + else: + instruction = shared.get("instruction", "").format(backend_display=backend_display) # Add backend-specific content to context context = { diff --git a/src/prompts/model_new_ex_add_thunderkittens.py b/src/prompts/model_new_ex_add_thunderkittens.py index af6ff0fb..00a41c1b 100644 --- a/src/prompts/model_new_ex_add_thunderkittens.py +++ b/src/prompts/model_new_ex_add_thunderkittens.py @@ -1,142 +1,22 @@ import torch import torch.nn as nn -from torch.utils.cpp_extension import load_inline -import os - -# ThunderKittens header-only library path (set via environment variable) -# Default to /root/ThunderKittens for Modal containers, or use THUNDERKITTENS_PATH env var -TK_PATH = os.environ.get("THUNDERKITTENS_PATH", os.environ.get("THUNDERKITTENS_ROOT", "/root/ThunderKittens")) - -# C++ source: function declaration for binding -elementwise_add_cpp_source = """ -torch::Tensor elementwise_add_cuda(torch::Tensor a, torch::Tensor b); -""" - -# CUDA source: ThunderKittens kernel implementation -# -# IMPORTANT ThunderKittens API notes: -# 1. Define KITTENS_HOPPER before including kittens.cuh for H100/Hopper GPUs -# 2. Operations like load, store, zero, mma_AB are NOT free functions! -# They are static member functions inside kittens::group template struct. -# 3. Create an alias like: using warp = kittens::group<1>; -# 4. 
-# 4. Then call: warp::load(...), warp::zero(...), etc.
-#
-elementwise_add_cuda_source = """
-// IMPORTANT: Define KITTENS_HOPPER before including ThunderKittens headers for H100/Hopper GPUs
-// This enables FP8 types and Hopper-specific features
-#define KITTENS_HOPPER
-
-#include <torch/extension.h>
-#include <cuda_runtime.h>
-
-// Include ThunderKittens headers
-#include "kittens.cuh"
-
-// ThunderKittens namespace and group aliases
-// Operations are accessed through these group types, NOT as free functions
-using namespace kittens;
-using warp = kittens::group<1>; // For single-warp operations (32 threads)
-// For multi-warp operations, use: using warpgroup = kittens::group<4>;
-
-// Constants for tile dimensions
-constexpr int TILE_DIM = 16;
-
-// ThunderKittens elementwise add kernel using shared memory tiles
-// This example demonstrates the ThunderKittens API pattern
-__global__ void tk_elementwise_add_kernel(const float* __restrict__ a_ptr,
-                                          const float* __restrict__ b_ptr,
-                                          float* __restrict__ out_ptr,
-                                          int rows, int cols) {
-    // For simple element-wise ops, we use a straightforward approach
-    // ThunderKittens shines for matrix ops with tiles, but here we show basic pattern
-
-    int idx = blockIdx.x * blockDim.x + threadIdx.x;
-    int total = rows * cols;
-
-    // Grid-stride loop for simple element-wise addition
-    for (int i = idx; i < total; i += blockDim.x * gridDim.x) {
-        out_ptr[i] = a_ptr[i] + b_ptr[i];
-    }
-}
-
-// Alternative: ThunderKittens tiled version for larger matrices
-// Shows proper usage of ThunderKittens tile types and group operations
-// Uncomment and adapt for matrix operations:
-/*
-__global__ void tk_matmul_kernel(const bf16* A, const bf16* B, bf16* C,
-                                 int M, int N, int K) {
-    // Define aliases for the group - THIS IS REQUIRED for ThunderKittens ops
-    using warpgroup = kittens::group<4>; // 4 warps = 128 threads
-
-    // ThunderKittens register tiles for accumulation
-    rt_fl<16, 16> acc; // 16x16 float register tile
-
-    // Shared memory tiles
-    extern __shared__ alignment_dummy __shm[];
-    st_bf<16, 16> (&a_smem)[2] = *reinterpret_cast<st_bf<16, 16>(*)[2]>(__shm);
-    st_bf<16, 16> (&b_smem)[2] = *reinterpret_cast<st_bf<16, 16>(*)[2]>(__shm + sizeof(st_bf<16,16>)*2);
-
-    // Initialize accumulator to zero - NOTE: use warpgroup:: prefix!
-    warpgroup::zero(acc);
-
-    // Main loop would go here with:
-    // warpgroup::load(a_smem[...], ...);      // Load from global to shared
-    // warpgroup::mma_AB(acc, a_tile, b_tile); // Matrix multiply-accumulate
-    // warpgroup::store(C_ptr, acc, ...);      // Store result
-}
-*/
-
-torch::Tensor elementwise_add_cuda(torch::Tensor a, torch::Tensor b) {
-    TORCH_CHECK(a.is_cuda(), "Input tensor a must be on CUDA");
-    TORCH_CHECK(b.is_cuda(), "Input tensor b must be on CUDA");
-    TORCH_CHECK(a.sizes() == b.sizes(), "Input tensors must have the same shape");
-
-    auto out = torch::empty_like(a);
-    int rows = a.size(0);
-    int cols = a.numel() / rows;
-
-    const int block_size = 256;
-    const int num_blocks = (a.numel() + block_size - 1) / block_size;
-
-    tk_elementwise_add_kernel<<<num_blocks, block_size>>>(
-        a.data_ptr<float>(),
-        b.data_ptr<float>(),
-        out.data_ptr<float>(),
-        rows, cols
-    );
-
-    return out;
-}
-"""
-
-# Compile the ThunderKittens kernel inline
-elementwise_add = load_inline(
-    name="elementwise_add_tk",
-    cpp_sources=elementwise_add_cpp_source,
-    cuda_sources=elementwise_add_cuda_source,
-    functions=["elementwise_add_cuda"],
-    verbose=True,
-    extra_include_paths=[
-        TK_PATH,
-        os.path.join(TK_PATH, "include"),
-    ],
-    extra_cflags=["-std=c++20", "-O3"],
-    extra_cuda_cflags=[
-        "-std=c++20",
-        "-O3",
-        "--expt-relaxed-constexpr",
-        "--expt-extended-lambda",
-        "-Xcompiler", "-fPIC",
-        "-DNDEBUG",
-        "-DKITTENS_HOPPER",
-    ],
-)
-
+import tk_kernels
 
 class ModelNew(nn.Module):
-    def __init__(self) -> None:
+    """
+    ThunderKittens-accelerated elementwise addition (C = A + B)
+    """
+
+    def __init__(self):
         super().__init__()
-        self.elementwise_add = elementwise_add
 
-    def forward(self, a, b):
-        return self.elementwise_add.elementwise_add_cuda(a, b)
+    def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
+        M, N = A.shape
+        assert A.shape == B.shape, "Input tensors must have the same shape"
+
+        C = torch.zeros((M, N), device=A.device, dtype=torch.float32).contiguous()
+
+        # Call into TK pybind wrapper
+        tk_kernels.dispatch_add(A, B, C, int(M), int(N))
+
+        return C
diff --git a/src/prompts/prompts.toml b/src/prompts/prompts.toml
index 1d0c1fed..6bcfcd88 100644
--- a/src/prompts/prompts.toml
+++ b/src/prompts/prompts.toml
@@ -17,6 +17,296 @@
 instruction = """
 Optimize the architecture named Model with custom {backend_display}! Name your optimized output architecture ModelNew. Output the new code in codeblocks. Please generate real code, NOT pseudocode, make sure the code compiles and is fully functional. Just output the new model code, no other text, and NO testing code!
 """
 
+instruction_thunderkittens = """
+You are writing CUDA kernels using ThunderKittens (TK). You MUST use the ThunderKittens API (kittens::…) to implement the kernel.
+
+=====================
+OUTPUT CONTRACT
+=====================
+
+Produce **exactly two code blocks** and nothing else:
+
+1) A **C++/CUDA** file that builds a **pybind11** extension module named **tk_kernels** exporting **one kernel** and **one dispatcher**.
+
+2) A **Python** file defining **torch.nn.Module** named **ModelNew** that imports **tk_kernels** and calls the **dispatcher** inside **forward**.
+
+No prints, tests, timing, seeding, or commentary in either file.
+
+=====================
+GOLDEN RULES (must follow)
+=====================
+
+- Always call TK APIs with the **kittens::** prefix. For ops, use **kittens::warp::** or **kittens::warpgroup::**; never bare names.
+
+- Do **NOT** use `using namespace kittens`.
+
+- For global layouts, **never** put `0` in template dims.
+  Use `-1` for runtime dims.
+
+- Favor pipelining: Global → **TMA async** → Shared → Warp loads → Compute → Async store to Global.
+
+- Use **semaphores** and `wait`/`arrive` to overlap producer/consumer work.
+
+- For matvec: prefer broadcast + elementwise + reduction. For matmul: use **MMA**.
+
+- Zero accumulators before use (`kittens::warp::zero`), avoid unnecessary syncs, and pick the lightest scope.
+
+- Register tiles do compute (**rt_***), shared tiles stage data (**st_***), {{r,s}}v_* for vectors.
+
+- Tensor-core MMAs: **bf16** inputs with **float** accumulators. Tile dims ≤ 64x64; chunk larger problems.
+
+- Shared tile **width must match** register tile width for load/store group ops.
+
+- Grid/block config must match warp layout (e.g., 4 warps = 2x2 macro-tile = 32x32).
+
+- You MUST use ONLY the APIs outlined in this prompt, or used in the given examples. DO NOT ASSUME OR MAKE UP ANY OTHER APIs.
+
+=====================
+MENTAL MODEL / WORKFLOW
+=====================
+
+1) Define shared-memory allocator and tiles.
+
+2) Define register tiles/vectors for compute.
+
+3) Load from **global** to **shared** (TMA async) using tile indices {{b, h, r_tile, c_tile}}.
+
+4) Warp-load from **shared** to **registers**.
+
+5) Do compute (elementwise / reductions / MMA).
+
+6) Store back: registers → shared, then async store to global.
+
+7) Use semaphores to overlap stages; prefer warp scope syncs.
+
+=====================
+API CHEATSHEET (fully qualified)
+=====================
+
+Types & Layout
+
+- Tiles/vectors (bf16 shown; analogous float/half types exist):
+  - `kittens::rt_bf` : register tile (compute)
+  - `kittens::st_bf` : shared tile (staging)
+  - `kittens::rv_bf` : register vector
+  - `kittens::sv_bf` : shared vector
+
+- Global layouts (runtime dims = -1):
+  - `kittens::gl`
+  - Example: `kittens::gl<bf16, -1, -1, -1, -1, kittens::st_bf<16, 16>> W;`
+  - Members: `T* raw_ptr;`
+    `template<int axis> size_t shape() const;`
+    `template<int axis> size_t stride() const;`
+  - Indexing uses **tile indices**: `gW[kittens::coord<>{{b, h, r_tile, c_tile}}]`
+  - For batched ops use **3D indexing** `{{batch, row, col}}` (not `{{batch, 0, row, col}}`), as in the sketch below.
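+
+Putting the layout pieces together, a minimal globals struct might look like this (an illustrative sketch, not a fixed recipe — the dtype, tile size, and field names are assumptions):
+
+```
+struct micro_globals {{
+    // bf16 matrix with all four logical dims resolved at runtime (-1),
+    // staged through 16x16 shared tiles
+    kittens::gl<bf16, -1, -1, -1, -1, kittens::st_bf<16, 16>> W;
+    int M, N;
+}};
+// Inside a kernel, fetch the tile at (r_tile, c_tile) of batch b, head h:
+//   g.W[kittens::coord<>{{b, h, r_tile, c_tile}}]
+```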
+ +Warp-scope Memory Ops + +- Shared ↔ Registers: + - `kittens::warp::load(kittens::rt_bf& dst, const kittens::st_bf& src)` + - `kittens::warp::store(kittens::st_bf& dst, const kittens::rt_bf& src)` + - `kittens::warp::load(kittens::rv_bf& dst, const kittens::sv_bf& src)` + - `kittens::warp::store(kittens::sv_bf& dst, const kittens::rv_bf& src)` + +- Direct small reads from global: + - `kittens::warp::load(kittens::rt_bf& dst, const kittens::gl<...>& g, kittens::coord<>{{...}})` + - `kittens::warp::load(kittens::rv_bf& dst, const kittens::gl<...>& g, kittens::coord<>{{offset}})` + +TMA (Async Global↔Shared) + +- Declare/issue: + - `kittens::tma::expect(kittens::semaphore& sem, kittens::st_bf& tile_or_sv)` + - `kittens::tma::load_async(kittens::st_bf& dst_smem, kittens::gl<...>& src_gmem, kittens::coord<>{{...}}, kittens::semaphore& sem)` + - `kittens::tma::store_async(kittens::gl<...>& dst_gmem, const kittens::sv_bf& src_smem, kittens::coord<>{{...}})` + - `kittens::tma::store_add_async(...)` + - `kittens::tma::store_async_wait()` // ensure visibility + - `kittens::tma::store_async_read_wait()` // ensure read-side hazard clear + +- Pattern: + - Producer: `tma::expect(...); tma::load_async(...);` + - Consumer: `wait(sem, threshold); kittens::warp::load(...)` from shared + - After consuming: `kittens::warp::arrive(done_sem, count);` + +Sync & Semaphores + +- `kittens::warp::sync()` +- `kittens::group::sync(barrier_id)` // N warps +- `kittens::warpgroup::sync()` +- `kittens::init_semaphore(kittens::semaphore&, int initial)` +- `kittens::wait(kittens::semaphore&, int threshold)` +- `kittens::warp::arrive(kittens::semaphore&, int delta=1)` +- Rare fence: `asm volatile("fence.acq_rel.gpu;");` + +Math + +- Set/fill: + - `kittens::warp::zero(x), one(x), pos_infty(x), neg_infty(x)` + - For setting to a specific scalar value: `kittens::warp::mul(tile, tile, scalar_value); // after zeroing` + +- Elementwise (tile/vector or scalar rhs): + - `kittens::warp::add(dst, a, b)`, `sub`, `mul`, `div` + - Scalars OK: `kittens::warp::mul(tile, tile, scalar)` + +- Broadcast/layout: + - `kittens::warp::broadcast_col(rt, row_vec)` + - `kittens::warp::broadcast_row(rt, col_vec)` + - `kittens::warp::transpose_inplace(rt)` → returns ref + - `kittens::warp::swap_layout_inplace(rt)` → switch row/col view + +- Apply lambdas: + - `kittens::warp::apply(rv_dst, rv_src, Lambda)` + - `kittens::warp::apply(rt_dst, rt_src, Lambda)` + +Reductions + +- Tile → vector: + - `kittens::warp::row_sum(col_vec, rt)` // sum across columns per row + - `kittens::warp::col_sum(row_vec, rt)` + - `row_max`, `col_max` (also on shared tiles) + +- Vector → scalar: + - `auto s = kittens::warp::sum(const kittens::rv_bf&)` + +MMA (Tensor Cores) + +- Use bf16 inputs with float accumulators: + - Inputs: `kittens::rt_bf` + - Accum: `kittens::rt_fl` + +- Warp MMAs: + - `kittens::warp::mma_AB(C, A, B, C)` + - `kittens::warp::mma_ABt(C, A, B, C)` + - `kittens::warp::mma_AtB(C, A, B, C)` + +- Warpgroup MMAs: + - `kittens::warpgroup::mma_AB(C, A, B)` and friends + - `kittens::warpgroup::mma_async_wait()` + +- Example TMA/compute/store: +``` +// loader +kittens::tma::expect(inp_sem, weight_smem); +kittens::tma::load_async(weight_smem, g.W, kittens::coord<>{{layer, col_block, tile_id}}, inp_sem); +// consumer +kittens::wait(inp_sem, 0); +kittens::warp::load(Wt, weight_smem); +// ... compute ... 
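+// (illustrative compute step only, using the cheatsheet ops above;
+//  `x_rt` and `out_vec` are assumed registers this fragment does not declare)
+// kittens::warp::mul(Wt, Wt, x_rt);            // elementwise multiply
+// kittens::warp::row_sum(out_vec, Wt);         // reduce each tile row into a vector
+// kittens::warp::store(out_smem_vec, out_vec); // stage in shared for the TMA store below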
+kittens::tma::store_async(g.O, out_smem_vec, kittens::coord<>{{out_block}});
+kittens::tma::store_async_wait();
+```
+
+=====================
+C++/CUDA FILE REQUIREMENTS
+=====================
+
+- Includes:
+  - `#include "kittens.cuh"`
+  - `#include "pyutils/pyutils.cuh"`
+  - No `using namespace kittens;`
+
+- Launch config:
+  - e.g., `#define NUM_WORKERS (1)`
+  - `#define NUM_THREADS (NUM_WORKERS * kittens::WARP_THREADS)`
+  - Tile dimensions are multiples of 16.
+
+- micro_globals:
+  - Contains all inputs/outputs as TK global layouts + scalar params.
+  - Each tensor as: `kittens::gl<float, -1, -1, -1, -1>`
+    with runtime 4D shape (unused logical dims may be indexed with zeros).
+
+- Kernel signature:
+  - `__global__ __launch_bounds__(NUM_THREADS, 1) void micro_tk(const __grid_constant__ micro_globals g)`
+
+- Shared allocator (must use alignment_dummy):
+```
+extern __shared__ kittens::alignment_dummy __shm[];
+kittens::shared_allocator al((int*)&__shm[0]);
+```
+
+- Allocate tiles:
+  - Shared: `auto& x_s = al.allocate<kittens::st_bf<16, 16>>();`
+  - Registers: `kittens::rt_fl<16, 16> x_rt;`
+  - **Match shared/register widths** for group load/store.
+
+- Dispatcher:
+  - `void dispatch_micro(micro_globals g)`:
+    - Optionally `cudaFuncSetAttribute(micro_tk, cudaFuncAttributeMaxDynamicSharedMemorySize, mem_size);`
+    - Launch: `micro_tk<<<grid, block, mem_size>>>(g);`
+    - `cudaDeviceSynchronize();`
+
+- PyBind11 binding (member pointers; order matches struct fields):
+```
+PYBIND11_MODULE(tk_kernels, m) {{
+    kittens::py::bind_kernel(m, "micro_tk",
+        &micro_globals::A, &micro_globals::B, &micro_globals::C, &micro_globals::M, &micro_globals::N);
+    kittens::py::bind_function(m, "dispatch_micro",
+        &micro_globals::A, &micro_globals::B, &micro_globals::C, &micro_globals::M, &micro_globals::N);
+}}
+```
+
+=====================
+PYTHON FILE REQUIREMENTS
+=====================
+
+- `import tk_kernels` and standard PyTorch imports at top.
+
+- Define:
+  - `class ModelNew(torch.nn.Module):`
+  - `def forward(self, ...):`
+    - Ensure inputs on CUDA (`.cuda()` as needed).
+    - Allocate outputs on CUDA with correct `dtype`/shape.
+    - Call `tk_kernels.dispatch_micro(...)` with args in the **exact** order as in the PyBind signature.
+    - Return the output tensor.
+
+- No printing, checks, seeding, timing, or tests.
+
+=====================
+COMMON PITFALLS / FIXES (strict)
+=====================
+
+1) Never declare `using dtype = fp16;` (causes compile errors).
+
+2) Never use `0` in global layout template dims—use `-1` for runtime dims.
+
+3) For tensor cores: use `kittens::rt_bf<>` inputs with `kittens::rt_fl<>` accumulators.
+
+4) Always call ops via `kittens::warp::…` (or `kittens::warpgroup::…`), never unqualified.
+
+5) Use `kittens::alignment_dummy __shm[]` for shared memory (not `int __shm[]`).
+
+6) For batched ops, index `{{batch, row, col}}` (3D), not `{{batch, 0, row, col}}`.
+
+7) Use `__host__` (not `KITTENS_HOST_DEVICE`) for host functions.
+
+8) For scalar→half convert, use `__float2half()` (not `kittens::to_half()`).
+
+9) In pybind, pass **member pointers** (`&Class::member`), not string names.
+
+10) Use a fixed integer (e.g., `100000`) for `dynamic_shared_memory()` sizing; avoid template-size expressions.
+
+11) For scalar broadcasting to tiles: `kittens::warp::zero(tile); kittens::warp::add(tile, tile, scalar);`
+
+12) Allocate shared tiles with `auto& x_s = al.allocate<...>();` (not `*al.allocate`).
+
+13) For matmul use `kittens::warp::mma_AB(accum, A, B, accum)` (not elementwise mul+add).
+
+14) Global layout coordinates are **tile indices** `{{b, h, r_tile, c_tile}}` (not element offsets).
+ +15) Match shared/register **widths** to avoid "Group load/store requires tile widths to match". + +16) Align grid/block with actual warp count (e.g., 4 warps = 2×2 layout = 32×32). + +===================== +CHECKLIST BEFORE YOU FINISH +===================== + +- Exactly two code blocks produced: (1) C++/CUDA pybind module **tk_kernels** with `micro_tk` and `dispatch_micro`; (2) Python `ModelNew` calling the dispatcher in `forward`. + +- All TK calls are fully qualified with `kittens::…`. + +- No extra text, prints, tests, or timing code. + +""" + # Shared example architecture (same for all backends) few_shot_example_arch = "src/prompts/model_ex_add.py" diff --git a/src/utils.py b/src/utils.py index f4fac580..8a0650fd 100644 --- a/src/utils.py +++ b/src/utils.py @@ -371,6 +371,159 @@ def extract_code_blocks(text, code_language_types: list[str]) -> str: return " \n ".join(combined_code) if combined_code else "" + +def extract_cuda_and_python_code(output_string: str) -> tuple[str | None, str | None]: + """ + Extract both CUDA (C++) and Python code blocks from model output. + Handles two cases: + 1. Separate code blocks (```cpp ... ``` and ```python ... ```) + 2. CUDA code embedded as string in Python + + Returns: (cuda_code, python_code) tuple + - cuda_code: The CUDA/C++ code block (or None if not found) + - python_code: The Python code block, converted to use separate module import (or None if not found) + """ + if output_string is None: + return (None, None) + + trimmed = output_string.strip() + + # First, try to find separate code blocks + pattern = r'```(\w+)?\n(.*?)```' + matches = re.findall(pattern, trimmed, re.DOTALL) + + cuda_code = None + python_code = None + + for lang_type, code in matches: + code = code.strip() + lang_type = lang_type.lower() if lang_type else "" + + # Check for CUDA/C++ code blocks + if lang_type in ['cpp', 'cuda', 'c++', 'cu'] or (not lang_type and 'PYBIND11_MODULE' in code): + if cuda_code is None: # Take the first CUDA block found + cuda_code = code + + # Check for Python code blocks + elif lang_type == 'python' or (not lang_type and ('import torch' in code or 'class ModelNew' in code)): + if python_code is None: # Take the first Python block found + python_code = code + + # If we didn't find separate blocks, try to extract from embedded string pattern + if cuda_code is None or python_code is None: + # Look for Python code block that might contain embedded CUDA + python_block = None + for lang_type, code in matches: + code = code.strip() + lang_type = lang_type.lower() if lang_type else "" + if lang_type == 'python' or ('import torch' in code or 'class ModelNew' in code or 'load_inline' in code): + python_block = code + break + + # If no code blocks found, check the raw text + if python_block is None and len(matches) == 0: + # Try to find Python code in raw text + if 'import torch' in trimmed or 'class ModelNew' in trimmed or 'load_inline' in trimmed: + python_block = trimmed + + if python_block: + # Try to extract CUDA code from string variables (e.g., conv2d_cuda_source = """...""") + # Pattern: variable_name = """...CUDA code...""" + cuda_string_pattern = r'(\w+_cuda_source|\w+_cpp_source|\w+_source)\s*=\s*"""(.*?)"""' + cuda_matches = re.findall(cuda_string_pattern, python_block, re.DOTALL) + + if cuda_matches: + # Take the first match (usually the CUDA source) + var_name, cuda_content = cuda_matches[0] + if cuda_code is None: + cuda_code = cuda_content.strip() + + # Convert Python code to use separate module import + if python_code is None: + 
python_code = convert_python_from_inline_to_module(python_block, var_name)
+            else:
+                # If no embedded CUDA found but we have Python, use it as-is
+                if python_code is None:
+                    python_code = python_block
+
+    return (cuda_code, python_code)
+
+
+def convert_python_from_inline_to_module(python_code: str, cuda_var_name: str) -> str:
+    """
+    Convert Python code that uses load_inline to use separate module import pattern.
+    Removes load_inline calls and related setup, replaces with tk_kernels import.
+    """
+    # Remove the CUDA/C++ source string variable definitions
+    # Pattern: variable_name = """...""" (multi-line; covers *_cuda_source and *_cpp_source)
+    cuda_source_pattern = r'\w+_(cuda|cpp)_source\s*=\s*""".*?"""'
+    converted_code = re.sub(cuda_source_pattern, '', python_code, flags=re.DOTALL)
+
+    # Remove load_inline calls; a regex cannot reliably span the multi-line call,
+    # so scan line by line and track parenthesis depth instead
+    lines = converted_code.split('\n')
+    new_lines = []
+    skip_load_inline = False
+    paren_depth = 0
+
+    for line in lines:
+        if 'load_inline' in line:
+            skip_load_inline = True
+            paren_depth = line.count('(') - line.count(')')
+            if paren_depth <= 0:
+                skip_load_inline = False
+            continue
+
+        if skip_load_inline:
+            paren_depth += line.count('(') - line.count(')')
+            if paren_depth <= 0:
+                skip_load_inline = False
+            continue
+
+        new_lines.append(line)
+
+    converted_code = '\n'.join(new_lines)
+
+    # Remove imports related to load_inline
+    converted_code = re.sub(r'from torch\.utils\.cpp_extension import.*?load_inline[^\n]*\n', '', converted_code)
+    converted_code = re.sub(r'import.*?load_inline[^\n]*\n', '', converted_code)
+
+    # Remove TK_PATH setup (usually only needed for load_inline)
+    converted_code = re.sub(r'# ThunderKittens header-only library path\s*\nTK_PATH\s*=.*?\n', '', converted_code, flags=re.MULTILINE)
+    converted_code = re.sub(r'TK_PATH\s*=.*?\n', '', converted_code)
+
+    # Remove C++ source declaration comments
+    converted_code = re.sub(r'# C\+\+ source declaration\s*\n', '', converted_code)
+    converted_code = re.sub(r'# CUDA source with.*?\n', '', converted_code)
+    converted_code = re.sub(r'# Compile kernel\s*\n', '', converted_code)
+
+    # Add import for tk_kernels (after torch imports)
+    if 'import torch' in converted_code and 'import tk_kernels' not in converted_code:
+        # Insert after the last import statement
+        import_section = re.search(r'(import torch[^\n]*\n(?:import torch[^\n]*\n)*)', converted_code)
+        if import_section:
+            converted_code = converted_code.replace(import_section.group(1), import_section.group(1) + 'import tk_kernels\n')
+        else:
+            # Fallback: add after first import
+            converted_code = re.sub(r'(import torch[^\n]*\n)', r'\1import tk_kernels\n', converted_code, count=1)
+
+    # Replace references to the load_inline result variable
+    # Pattern: self.conv2d_op = conv2d_tk -> remove or replace
+    # Pattern: conv2d_tk.conv2d_cuda(...) -> tk_kernels.dispatch_*(...)
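+    # e.g. (hypothetical names) `out = conv2d_tk.conv2d_cuda(x)` would need to become
+    # `tk_kernels.dispatch_conv2d(x, out, ...)` with an explicitly allocated output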
+ # This is tricky without knowing the exact function names, so we'll leave a comment + converted_code = re.sub(r'self\.\w+_op\s*=\s*\w+_tk\s*\n', '', converted_code) + + # Clean up extra blank lines + converted_code = re.sub(r'\n\n\n+', '\n\n', converted_code) + + return converted_code.strip() + ################################################################################ # Scale up experiments in parallel ################################################################################ From f8ce4f1bf474d6704548002cd3d0d5b860bf242d Mon Sep 17 00:00:00 2001 From: Willy-Chan Date: Sun, 7 Dec 2025 15:27:50 -0800 Subject: [PATCH 06/14] single sample modal working --- .../generate_and_eval_single_sample_modal.py | 251 +++++++++++++++++- 1 file changed, 242 insertions(+), 9 deletions(-) diff --git a/scripts/generate_and_eval_single_sample_modal.py b/scripts/generate_and_eval_single_sample_modal.py index 77016d49..48b40a31 100644 --- a/scripts/generate_and_eval_single_sample_modal.py +++ b/scripts/generate_and_eval_single_sample_modal.py @@ -13,10 +13,10 @@ from datasets import load_dataset -#from src.dataset import construct_kernelbench_dataset +from src.dataset import construct_kernelbench_dataset from src.eval import eval_kernel_against_ref from src.prompt_constructor_toml import get_prompt_for_backend, get_custom_prompt -from src.utils import extract_first_code, query_server, set_gpu_arch, read_file, create_inference_server_from_presets +from src.utils import extract_first_code, extract_cuda_and_python_code, query_server, set_gpu_arch, read_file, create_inference_server_from_presets app = modal.App("eval_single_sample") @@ -26,6 +26,7 @@ """ REPO_TOP_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +KERNEL_BENCH_PATH = os.path.join(REPO_TOP_DIR, "KernelBench") torch.set_printoptions(precision=4, threshold=10) @@ -106,6 +107,7 @@ def __repr__(self): modal.Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.10") .apt_install("git", "gcc-10", "g++-10", "clang") .pip_install_from_requirements(os.path.join(REPO_TOP_DIR, "requirements.txt")) + .pip_install("pybind11") # Ensure pybind11 is available for ThunderKittens compilation .env({ "THUNDERKITTENS_ROOT": "/root/ThunderKittens", "THUNDERKITTENS_PATH": "/root/ThunderKittens", @@ -114,6 +116,7 @@ def __repr__(self): "CC": "gcc-10", }) .add_local_dir(THUNDERKITTENS_LOCAL_PATH, remote_path="/root/ThunderKittens", copy=True) + .add_local_dir(KERNEL_BENCH_PATH, remote_path="/root/KernelBench") .add_local_dir(SRC_PATH, remote_path="/root/src") ) else: @@ -122,14 +125,205 @@ def __repr__(self): modal.Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.10") .apt_install("git", "gcc-10", "g++-10", "clang") .pip_install_from_requirements(os.path.join(REPO_TOP_DIR, "requirements.txt")) + .pip_install("pybind11") # Ensure pybind11 is available + .add_local_dir(KERNEL_BENCH_PATH, remote_path="/root/KernelBench") .add_local_dir(SRC_PATH, remote_path="/root/src") ) +# Helper function for compiling CUDA on Modal using nvcc directly (like the Makefile) +def _compile_cuda_on_modal(cuda_src: str, module_name: str, gpu_arch: list): + """Compile CUDA source on Modal using nvcc directly (matching the Makefile approach)""" + import subprocess + import sys + import tempfile + from src.utils import set_gpu_arch + + set_gpu_arch(gpu_arch) + + # Find ThunderKittens + tk_path = os.environ.get("THUNDERKITTENS_PATH") or os.environ.get("THUNDERKITTENS_ROOT") or "/root/ThunderKittens" + if not os.path.exists(os.path.join(tk_path, "include", 
"kittens.cuh")): + raise RuntimeError(f"ThunderKittens not found at {tk_path}") + + print(f"[Modal] Using ThunderKittens at: {tk_path}") + + # Create build directory + build_dir = tempfile.mkdtemp(prefix="tk_modal_build_") + os.makedirs(build_dir, exist_ok=True) + + # Write the CUDA source + cu_file = os.path.join(build_dir, f"{module_name}.cu") + with open(cu_file, 'w') as f: + f.write(cuda_src) + + # Get pybind11 includes - try command line first, then find in site-packages + pybind11_includes = "" + try: + pybind11_result = subprocess.run( + [sys.executable, "-m", "pybind11", "--includes"], + capture_output=True, + text=True, + check=True + ) + pybind11_includes = pybind11_result.stdout.strip() + except: + # Fallback: find pybind11 in site-packages + import site + import glob + for site_pkg in site.getsitepackages(): + pybind11_paths = glob.glob(os.path.join(site_pkg, "pybind11", "include")) + if pybind11_paths: + pybind11_includes = f"-I{pybind11_paths[0]}" + break + + # If still not found, try common locations + if not pybind11_includes: + common_paths = [ + "/usr/local/include/pybind11", + "/usr/include/pybind11", + os.path.expanduser("~/.local/include/pybind11"), + ] + for path in common_paths: + if os.path.exists(path): + pybind11_includes = f"-I{path}" + break + + if not pybind11_includes: + print("[Modal WARNING] pybind11 includes not found, compilation may fail") + + # Get Python config - try python3-config first, then python-config + python_ldflags = "" + try: + python_config_result = subprocess.run( + ["python3-config", "--ldflags"], + capture_output=True, + text=True, + check=True + ) + python_ldflags = python_config_result.stdout.strip() + except: + try: + python_config_result = subprocess.run( + ["python-config", "--ldflags"], + capture_output=True, + text=True, + check=True + ) + python_ldflags = python_config_result.stdout.strip() + except: + # Fallback - try to construct from sysconfig + import sysconfig + python_ldflags = f"-L{sysconfig.get_config_var('LIBDIR')} -lpython{sys.version_info.major}.{sys.version_info.minor}" + + # Get Python extension suffix + try: + ext_suffix_result = subprocess.run( + ["python3-config", "--extension-suffix"], + capture_output=True, + text=True, + check=True + ) + ext_suffix = ext_suffix_result.stdout.strip() + except: + try: + ext_suffix_result = subprocess.run( + ["python-config", "--extension-suffix"], + capture_output=True, + text=True, + check=True + ) + ext_suffix = ext_suffix_result.stdout.strip() + except: + # Fallback + import sysconfig + ext_suffix = sysconfig.get_config_var('EXT_SUFFIX') or '.so' + + # Build nvcc command matching the Makefile + output_so = os.path.join(build_dir, f"{module_name}{ext_suffix}") + + # Parse pybind11 includes (they come as "-I/path1 -I/path2") + pybind11_include_list = pybind11_includes.split() if pybind11_includes else [] + + # Parse python ldflags (they come as "-L/path -lpython3.10 ...") + python_ldflags_list = python_ldflags.split() if python_ldflags else [] + + nvcc_flags = [ + "-DNDEBUG", + "-Xcompiler", "-fPIE", + "--expt-extended-lambda", + "--expt-relaxed-constexpr", + "-Xcompiler", "-Wno-psabi", + "-Xcompiler", "-fno-strict-aliasing", + "--use_fast_math", + "-forward-unknown-to-host-compiler", + "-O3", + "-Xnvlink=--verbose", + "-Xptxas=--verbose", + "-Xptxas=--warn-on-spills", + "-std=c++20", + "-x", "cu", + "-lrt", "-lpthread", "-ldl", "-lcuda", "-lcudadevrt", "-lcudart_static", "-lcublas", + f"-I{tk_path}/include", + ] + + # Add prototype include if it exists + if 
os.path.exists(os.path.join(tk_path, "prototype")): + nvcc_flags.append(f"-I{tk_path}/prototype") + + nvcc_flags.extend(pybind11_include_list) + nvcc_flags.extend(python_ldflags_list) + nvcc_flags.extend([ + "-shared", + "-fPIC", + f"-lpython{sys.version_info.major}.{sys.version_info.minor}", + "-DKITTENS_HOPPER", + "-arch=sm_90a", + cu_file, + "-o", output_so + ]) + + # Filter out empty strings + nvcc_flags = [f for f in nvcc_flags if f] + + print(f"[Modal] Compiling {module_name} with nvcc...") + print(f"[Modal] Build directory: {build_dir}") + print(f"[Modal] CUDA file: {cu_file}") + print(f"[Modal] Output: {output_so}") + + # Run nvcc + result = subprocess.run( + ["nvcc"] + nvcc_flags, + cwd=build_dir, + capture_output=True, + text=True + ) + + # Always print output for debugging + if result.stdout: + print(f"[Modal] Compilation stdout:\n{result.stdout}") + if result.stderr: + print(f"[Modal] Compilation stderr:\n{result.stderr}") + + if result.returncode != 0: + print(f"[Modal ERROR] Compilation failed with return code {result.returncode}") + print(f"[Modal ERROR] Full stdout:\n{result.stdout}") + print(f"[Modal ERROR] Full stderr:\n{result.stderr}") + raise RuntimeError(f"Failed to compile CUDA module: {result.stderr[:500] if result.stderr else 'Unknown error'}") + + # Verify the .so file was created + if not os.path.exists(output_so): + raise RuntimeError(f"Compilation succeeded but .so file not found: {output_so}") + + print(f"[Modal] Successfully compiled {module_name}") + print(f"[Modal] Generated .so file: {output_so}") + return build_dir + + @app.cls(image=image) class EvalFunc: @modal.method() - def eval_single_sample_modal(self, ref_arch_src, custom_kernel, verbose, gpu_arch, backend, precision): + def eval_single_sample_modal(self, ref_arch_src, custom_kernel, verbose, gpu_arch, backend, precision, cuda_src=None, cuda_module_name="tk_kernels"): # 3. 
Evaluate Kernel # NOTE: no need to wrap around process here as only a single sample # see batch eval for examples of process isolation @@ -137,7 +331,26 @@ def eval_single_sample_modal(self, ref_arch_src, custom_kernel, verbose, gpu_arc from src.eval import get_torch_dtype_from_string # Use utility function to set the GPU architecture in the modal environment from src.utils import set_gpu_arch as modal_set_gpu_arch + import sys + import os + modal_set_gpu_arch(gpu_arch) + + # If CUDA source provided, compile it first (for ThunderKittens) + if cuda_src: + cuda_module_path = _compile_cuda_on_modal(cuda_src, cuda_module_name, gpu_arch) + + # Modify kernel_src to import the compiled module + import_hook = f''' +import sys +import os +_tk_module_path = "{cuda_module_path}" +if _tk_module_path not in sys.path: + sys.path.insert(0, _tk_module_path) +''' + custom_kernel = import_hook + "\n" + custom_kernel + print(f"[Modal] Modified kernel source to use compiled module at {cuda_module_path}") + return eval_kernel_against_ref( ref_arch_src, custom_kernel, verbose=verbose, measure_performance=True, num_correct_trials=5, num_perf_trials=100, backend=backend, precision=get_torch_dtype_from_string(precision) @@ -171,6 +384,8 @@ def main(config: EvalConfig): if config.dataset_src == "huggingface": dataset = load_dataset(config.dataset_name) curr_level_dataset = dataset[f"level_{config.level}"] + elif config.dataset_src == "local": + curr_level_dataset = construct_kernelbench_dataset(config.level) if config.log: os.makedirs(config.logdir, exist_ok=True) @@ -285,19 +500,37 @@ def main(config: EvalConfig): f.write(custom_prompt) # Query server with constructed prompt - custom_kernel = inference_server(custom_prompt) - custom_kernel = extract_first_code(custom_kernel, ["python", "cpp"]) - # check LLM is able to generate custom kernel code - assert custom_kernel is not None, f"Custom {config.backend} kernel code generation failed" + custom_kernel_response = inference_server(custom_prompt) + + # For ThunderKittens, extract both CUDA and Python code + cuda_src = None + if backend == "thunderkittens": + cuda_code, python_code = extract_cuda_and_python_code(custom_kernel_response) + if cuda_code is None or python_code is None: + # Fallback to single code extraction + print("[WARNING] Could not extract separate CUDA and Python code blocks, falling back to single extraction") + custom_kernel = extract_first_code(custom_kernel_response, ["python", "cpp"]) + assert custom_kernel is not None, f"Custom {config.backend} kernel code generation failed" + else: + custom_kernel = python_code + cuda_src = cuda_code + print(f"[INFO] Extracted CUDA code ({len(cuda_src)} chars) and Python code ({len(custom_kernel)} chars)") + else: + custom_kernel = extract_first_code(custom_kernel_response, ["python", "cpp"]) + # check LLM is able to generate custom kernel code + assert custom_kernel is not None, f"Custom {config.backend} kernel code generation failed" - # this should be optional + # Log generated files if config.log: + if cuda_src: + with open(os.path.join(config.logdir, f"generated_kernel_level_{config.level}_problem_{config.problem_id}.cu"), "w") as f: + f.write(cuda_src) with open(os.path.join(config.logdir, f"generated_kernel_level_{config.level}_problem_{config.problem_id}.py"), "w") as f: f.write(custom_kernel) with app.run(): kernel_exec_result = EvalFunc.with_options(gpu=config.gpu)().eval_single_sample_modal.remote( - ref_arch_src, custom_kernel, config.verbose, gpu_arch_mapping[config.gpu], config.backend, 
config.precision + ref_arch_src, custom_kernel, config.verbose, gpu_arch_mapping[config.gpu], config.backend, config.precision, cuda_src=cuda_src, cuda_module_name="tk_kernels" ) print(f"Evaluation result for level {config.level} problem {config.problem_id}:\n{kernel_exec_result}") From 8a3d32c3147166d84e36f910725b3f07e9986c31 Mon Sep 17 00:00:00 2001 From: Willy-Chan Date: Wed, 10 Dec 2025 19:48:14 -0800 Subject: [PATCH 07/14] working with new version blackwell --- scripts/run_and_check.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/run_and_check.py b/scripts/run_and_check.py index 24b2b09d..ede7a0b8 100644 --- a/scripts/run_and_check.py +++ b/scripts/run_and_check.py @@ -41,7 +41,7 @@ modal.Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.10") .apt_install("git", "gcc-10", "g++-10", "clang") .pip_install_from_requirements(os.path.join(REPO_TOP_PATH, "requirements.txt")) - .pip_install("pybind11") # Ensure pybind11 is available for ThunderKittens compilation + .pip_install("pybind11") # pybind11 needed for ThunderKittens compilation .env({ "THUNDERKITTENS_ROOT": "/root/ThunderKittens", "THUNDERKITTENS_PATH": "/root/ThunderKittens", @@ -179,6 +179,7 @@ def compile_thunderkittens_cuda(cuda_src_path: str, module_name: str = "tk_kerne "-arch=sm_90a", "-DNDEBUG", "-DKITTENS_HOPPER", + "-DKITTENS_BLACKWELL", "--expt-relaxed-constexpr", "--expt-extended-lambda", "-Xcompiler", "-fPIC", @@ -490,6 +491,7 @@ def _compile_cuda_on_modal(cuda_src: str, module_name: str, gpu_arch: list): "-fPIC", f"-lpython{sys.version_info.major}.{sys.version_info.minor}", "-DKITTENS_HOPPER", + "-DKITTENS_BLACKWELL", "-arch=sm_90a", cu_file, "-o", output_so From e5a1a803451cf6136ac57fe161601fce9a8f23fd Mon Sep 17 00:00:00 2001 From: Willy-Chan Date: Wed, 10 Dec 2025 19:57:35 -0800 Subject: [PATCH 08/14] factored out tk compiling part --- scripts/generate_and_eval_single_sample.py | 60 +++- scripts/run_and_check.py | 352 +----------------- scripts/tk_compile.py | 400 +++++++++++++++++++++ 3 files changed, 461 insertions(+), 351 deletions(-) create mode 100644 scripts/tk_compile.py diff --git a/scripts/generate_and_eval_single_sample.py b/scripts/generate_and_eval_single_sample.py index 92c42ef8..8edb2441 100644 --- a/scripts/generate_and_eval_single_sample.py +++ b/scripts/generate_and_eval_single_sample.py @@ -13,6 +13,7 @@ from src.utils import ( create_inference_server_from_presets, extract_first_code, + extract_cuda_and_python_code, query_server, read_file, set_gpu_arch, @@ -250,22 +251,63 @@ def main(config: EvalConfig): f.write(custom_prompt) # Query server with constructed prompt - custom_kernel = inference_server(custom_prompt) - custom_kernel = extract_first_code(custom_kernel, ["python", "cpp"]) - - # check LLM is able to generate custom kernel code - assert ( - custom_kernel is not None - ), f"Custom {config.backend} kernel code generation failed" - - # this should be optional + custom_kernel_response = inference_server(custom_prompt) + + # For ThunderKittens, extract both CUDA and Python code + cuda_src = None + if backend == "thunderkittens": + cuda_code, python_code = extract_cuda_and_python_code(custom_kernel_response) + if cuda_code is None or python_code is None: + # Fallback to single code extraction + print("[WARNING] Could not extract separate CUDA and Python code blocks, falling back to single extraction") + custom_kernel = extract_first_code(custom_kernel_response, ["python", "cpp"]) + assert custom_kernel is not None, f"Custom {config.backend} kernel code 
generation failed" + else: + custom_kernel = python_code + cuda_src = cuda_code + print(f"[INFO] Extracted CUDA code ({len(cuda_src)} chars) and Python code ({len(custom_kernel)} chars)") + else: + custom_kernel = extract_first_code(custom_kernel_response, ["python", "cpp"]) + # check LLM is able to generate custom kernel code + assert ( + custom_kernel is not None + ), f"Custom {config.backend} kernel code generation failed" + + # Log generated files if config.log: + if cuda_src: + with open(os.path.join(config.logdir, f"generated_kernel_level_{config.level}_problem_{config.problem_id}.cu"), "w") as f: + f.write(cuda_src) with open(os.path.join(config.logdir, f"generated_kernel_level_{config.level}_problem_{config.problem_id}.py"), "w") as f: f.write(custom_kernel) # 3. Evaluate Kernel # NOTE: no need to wrap around process here as only a single sample # see batch eval for examples of process isolation + + # For ThunderKittens with separate CUDA file, compile it first + if backend == "thunderkittens" and cuda_src: + from scripts.eval_from_generations import compile_thunderkittens_cuda, prepare_kernel_src_with_cuda + import tempfile + + # Create temporary CUDA file + temp_cuda_file = os.path.join(config.logdir, f"temp_kernel_level_{config.level}_problem_{config.problem_id}.cu") + os.makedirs(os.path.dirname(temp_cuda_file), exist_ok=True) + with open(temp_cuda_file, 'w') as f: + f.write(cuda_src) + + # Compile CUDA module + cuda_build_dir = os.path.join(config.logdir, f"cuda_build_level_{config.level}_problem_{config.problem_id}") + cuda_module_path = compile_thunderkittens_cuda( + cuda_src_path=temp_cuda_file, + module_name="tk_kernels", + build_dir=cuda_build_dir, + verbose=config.verbose + ) + + # Modify kernel_src to import the compiled module + custom_kernel = prepare_kernel_src_with_cuda(custom_kernel, cuda_module_path, "tk_kernels") + kernel_exec_result = eval_kernel_against_ref( ref_arch_src, custom_kernel, diff --git a/scripts/run_and_check.py b/scripts/run_and_check.py index ede7a0b8..3907c5de 100644 --- a/scripts/run_and_check.py +++ b/scripts/run_and_check.py @@ -10,6 +10,11 @@ from src import utils as kernel_utils from scripts.generate_baseline_time import measure_program_time from src.utils import read_file +from scripts.tk_compile import ( + compile_thunderkittens_cuda, + compile_cuda_on_modal, + prepare_kernel_src_with_cuda +) # Modal setup app = modal.App("run_and_check") @@ -101,150 +106,6 @@ torch.set_printoptions(precision=4, threshold=10) -def compile_thunderkittens_cuda(cuda_src_path: str, module_name: str = "tk_kernels", - build_dir: str = None, verbose: bool = False) -> str: - """ - Compile a ThunderKittens .cu file into a Python module. 
- - Args: - cuda_src_path: Path to the .cu file - module_name: Name of the compiled module (default: tk_kernels) - build_dir: Build directory for compiled artifacts - verbose: Whether to print compilation output - - Returns: - Path to the directory containing the compiled module - """ - import subprocess - import sys - import tempfile - - # Find ThunderKittens - tk_path = os.environ.get("THUNDERKITTENS_PATH") or os.environ.get("THUNDERKITTENS_ROOT") - if not tk_path: - # Try common locations - candidates = [ - "/root/ThunderKittens", - os.path.join(REPO_TOP_PATH, "ThunderKittens"), - os.path.expanduser("~/ThunderKittens") - ] - for path in candidates: - if os.path.exists(os.path.join(path, "include", "kittens.cuh")): - tk_path = path - break - - if not tk_path or not os.path.exists(tk_path): - raise RuntimeError(f"ThunderKittens not found. Set THUNDERKITTENS_PATH environment variable.") - - print(f"[INFO] Using ThunderKittens at: {tk_path}") - - # Read the CUDA source - with open(cuda_src_path, 'r') as f: - cuda_source = f.read() - - # Create build directory - if build_dir is None: - build_dir = tempfile.mkdtemp(prefix="tk_build_") - os.makedirs(build_dir, exist_ok=True) - - # Write the CUDA source to the build directory - cu_file = os.path.join(build_dir, f"{module_name}.cu") - with open(cu_file, 'w') as f: - f.write(cuda_source) - - # Create setup.py for compilation - # Note: torch.utils.cpp_extension automatically includes pybind11 headers - # We don't need to import pybind11 - CUDAExtension handles it - setup_py = f''' -import os -from setuptools import setup -from torch.utils.cpp_extension import BuildExtension, CUDAExtension - -TK_PATH = "{tk_path}" - -setup( - name="{module_name}", - ext_modules=[ - CUDAExtension( - name="{module_name}", - sources=["{cu_file}"], - include_dirs=[ - TK_PATH, - os.path.join(TK_PATH, "include"), - ], - extra_compile_args={{ - "cxx": ["-std=c++20", "-O3", "-fPIC"], - "nvcc": [ - "-std=c++20", "-O3", - "-arch=sm_90a", - "-DNDEBUG", - "-DKITTENS_HOPPER", - "-DKITTENS_BLACKWELL", - "--expt-relaxed-constexpr", - "--expt-extended-lambda", - "-Xcompiler", "-fPIC", - "-diag-suppress=20012", - ], - }}, - extra_link_args=["-lcuda"], - language="c++", - ) - ], - cmdclass={{"build_ext": BuildExtension}}, -) -''' - - setup_file = os.path.join(build_dir, "setup.py") - with open(setup_file, 'w') as f: - f.write(setup_py) - - # Compile the extension - print(f"[INFO] Compiling {cuda_src_path} as module '{module_name}'...") - - env = os.environ.copy() - env["TORCH_CUDA_ARCH_LIST"] = "9.0" - - try: - result = subprocess.run( - [sys.executable, "setup.py", "build_ext", "--inplace"], - cwd=build_dir, - capture_output=not verbose, - text=True, - env=env - ) - - if result.returncode != 0: - print(f"[ERROR] Compilation failed:") - if result.stdout: - print(result.stdout) - if result.stderr: - print(result.stderr) - raise RuntimeError(f"Failed to compile {cuda_src_path}") - - if verbose and result.stdout: - print(result.stdout) - - except Exception as e: - raise RuntimeError(f"Failed to compile {cuda_src_path}: {e}") - - print(f"[INFO] Successfully compiled {module_name} to {build_dir}") - return build_dir - - -def prepare_kernel_src_with_cuda(kernel_py_src: str, cuda_module_path: str, module_name: str = "tk_kernels") -> str: - """ - Prepare the Python kernel source to use the pre-compiled CUDA module. - Adds the module path to sys.path so import works. 
- """ - import_hook = f''' -import sys -import os -# Add compiled CUDA module to path -_tk_module_path = "{cuda_module_path}" -if _tk_module_path not in sys.path: - sys.path.insert(0, _tk_module_path) -''' - return import_hook + "\n" + kernel_py_src class ScriptConfig(Config): def __init__(self): @@ -344,194 +205,6 @@ def evaluate_single_sample_src(ref_arch_src: str, kernel_src: str, configs: dict return eval_result -# Helper function for compiling CUDA on Modal using nvcc directly (like the Makefile) -def _compile_cuda_on_modal(cuda_src: str, module_name: str, gpu_arch: list): - """Compile CUDA source on Modal using nvcc directly (matching the Makefile approach)""" - import subprocess - import sys - import tempfile - from src.utils import set_gpu_arch - - set_gpu_arch(gpu_arch) - - # Find ThunderKittens - tk_path = os.environ.get("THUNDERKITTENS_PATH") or os.environ.get("THUNDERKITTENS_ROOT") or "/root/ThunderKittens" - if not os.path.exists(os.path.join(tk_path, "include", "kittens.cuh")): - raise RuntimeError(f"ThunderKittens not found at {tk_path}") - - print(f"[Modal] Using ThunderKittens at: {tk_path}") - - # Create build directory - build_dir = tempfile.mkdtemp(prefix="tk_modal_build_") - os.makedirs(build_dir, exist_ok=True) - - # Write the CUDA source - cu_file = os.path.join(build_dir, f"{module_name}.cu") - with open(cu_file, 'w') as f: - f.write(cuda_src) - - # Get pybind11 includes - try command line first, then find in site-packages - pybind11_includes = "" - try: - pybind11_result = subprocess.run( - [sys.executable, "-m", "pybind11", "--includes"], - capture_output=True, - text=True, - check=True - ) - pybind11_includes = pybind11_result.stdout.strip() - except: - # Fallback: find pybind11 in site-packages - import site - import glob - for site_pkg in site.getsitepackages(): - pybind11_paths = glob.glob(os.path.join(site_pkg, "pybind11", "include")) - if pybind11_paths: - pybind11_includes = f"-I{pybind11_paths[0]}" - break - - # If still not found, try common locations - if not pybind11_includes: - common_paths = [ - "/usr/local/include/pybind11", - "/usr/include/pybind11", - os.path.expanduser("~/.local/include/pybind11"), - ] - for path in common_paths: - if os.path.exists(path): - pybind11_includes = f"-I{path}" - break - - if not pybind11_includes: - print("[Modal WARNING] pybind11 includes not found, compilation may fail") - - # Get Python config - try python3-config first, then python-config - python_ldflags = "" - try: - python_config_result = subprocess.run( - ["python3-config", "--ldflags"], - capture_output=True, - text=True, - check=True - ) - python_ldflags = python_config_result.stdout.strip() - except: - try: - python_config_result = subprocess.run( - ["python-config", "--ldflags"], - capture_output=True, - text=True, - check=True - ) - python_ldflags = python_config_result.stdout.strip() - except: - # Fallback - try to construct from sysconfig - import sysconfig - python_ldflags = f"-L{sysconfig.get_config_var('LIBDIR')} -lpython{sys.version_info.major}.{sys.version_info.minor}" - - # Get Python extension suffix - try: - ext_suffix_result = subprocess.run( - ["python3-config", "--extension-suffix"], - capture_output=True, - text=True, - check=True - ) - ext_suffix = ext_suffix_result.stdout.strip() - except: - try: - ext_suffix_result = subprocess.run( - ["python-config", "--extension-suffix"], - capture_output=True, - text=True, - check=True - ) - ext_suffix = ext_suffix_result.stdout.strip() - except: - # Fallback - import sysconfig - ext_suffix = 
sysconfig.get_config_var('EXT_SUFFIX') or '.so' - - # Build nvcc command matching the Makefile - output_so = os.path.join(build_dir, f"{module_name}{ext_suffix}") - - # Parse pybind11 includes (they come as "-I/path1 -I/path2") - pybind11_include_list = pybind11_includes.split() if pybind11_includes else [] - - # Parse python ldflags (they come as "-L/path -lpython3.10 ...") - python_ldflags_list = python_ldflags.split() if python_ldflags else [] - - nvcc_flags = [ - "-DNDEBUG", - "-Xcompiler", "-fPIE", - "--expt-extended-lambda", - "--expt-relaxed-constexpr", - "-Xcompiler", "-Wno-psabi", - "-Xcompiler", "-fno-strict-aliasing", - "--use_fast_math", - "-forward-unknown-to-host-compiler", - "-O3", - "-Xnvlink=--verbose", - "-Xptxas=--verbose", - "-Xptxas=--warn-on-spills", - "-std=c++20", - "-x", "cu", - "-lrt", "-lpthread", "-ldl", "-lcuda", "-lcudadevrt", "-lcudart_static", "-lcublas", - f"-I{tk_path}/include", - ] - - # Add prototype include if it exists - if os.path.exists(os.path.join(tk_path, "prototype")): - nvcc_flags.append(f"-I{tk_path}/prototype") - - nvcc_flags.extend(pybind11_include_list) - nvcc_flags.extend(python_ldflags_list) - nvcc_flags.extend([ - "-shared", - "-fPIC", - f"-lpython{sys.version_info.major}.{sys.version_info.minor}", - "-DKITTENS_HOPPER", - "-DKITTENS_BLACKWELL", - "-arch=sm_90a", - cu_file, - "-o", output_so - ]) - - # Filter out empty strings - nvcc_flags = [f for f in nvcc_flags if f] - - print(f"[Modal] Compiling {module_name} with nvcc...") - print(f"[Modal] Build directory: {build_dir}") - print(f"[Modal] CUDA file: {cu_file}") - print(f"[Modal] Output: {output_so}") - - # Run nvcc - result = subprocess.run( - ["nvcc"] + nvcc_flags, - cwd=build_dir, - capture_output=True, - text=True - ) - - # Always print output for debugging - if result.stdout: - print(f"[Modal] Compilation stdout:\n{result.stdout}") - if result.stderr: - print(f"[Modal] Compilation stderr:\n{result.stderr}") - - if result.returncode != 0: - print(f"[Modal ERROR] Compilation failed with return code {result.returncode}") - print(f"[Modal ERROR] Full stdout:\n{result.stdout}") - print(f"[Modal ERROR] Full stderr:\n{result.stderr}") - raise RuntimeError(f"Failed to compile CUDA module: {result.stderr[:500] if result.stderr else 'Unknown error'}") - - # Verify the .so file was created - if not os.path.exists(output_so): - raise RuntimeError(f"Compilation succeeded but .so file not found: {output_so}") - - print(f"[Modal] Successfully compiled {module_name}") - print(f"[Modal] Generated .so file: {output_so}") - return build_dir # Modal evaluation class @@ -544,23 +217,17 @@ def evaluate_single_sample_src_modal(self, ref_arch_src: str, kernel_src: str, c """Evaluate a single sample source code against a reference source code on Modal""" from src.utils import set_gpu_arch from src.eval import eval_kernel_against_ref, get_torch_dtype_from_string + from scripts.tk_compile import compile_cuda_on_modal, prepare_kernel_src_with_cuda set_gpu_arch(gpu_arch) device = torch.device("cuda:0") # If CUDA source provided, compile it first if cuda_src: - cuda_module_path = _compile_cuda_on_modal(cuda_src, cuda_module_name, gpu_arch) + cuda_module_path = compile_cuda_on_modal(cuda_src, cuda_module_name, gpu_arch) # Modify kernel_src to import the compiled module - import_hook = f''' -import sys -import os -_tk_module_path = "{cuda_module_path}" -if _tk_module_path not in sys.path: - sys.path.insert(0, _tk_module_path) -''' - kernel_src = import_hook + "\n" + kernel_src + kernel_src = 
prepare_kernel_src_with_cuda(kernel_src, cuda_module_path, cuda_module_name) print(f"[Modal] Modified kernel source to use compiled module at {cuda_module_path}") num_correct_trials = configs["num_correct_trials"] @@ -678,7 +345,8 @@ def main(config: ScriptConfig): cuda_src_path=config.cuda_src_path, module_name=config.cuda_module_name, build_dir=cuda_build_dir, - verbose=config.verbose + verbose=config.verbose, + repo_top_path=REPO_TOP_PATH ) # Modify kernel_src to import the compiled module diff --git a/scripts/tk_compile.py b/scripts/tk_compile.py new file mode 100644 index 00000000..ddb57394 --- /dev/null +++ b/scripts/tk_compile.py @@ -0,0 +1,400 @@ +""" +ThunderKittens CUDA compilation utilities. + +This module provides functions for compiling ThunderKittens CUDA kernels +into Python modules, both locally and on Modal. +""" + +import os +import subprocess +import sys +import tempfile +import glob +import site +import sysconfig +from typing import Optional + + +def find_thunderkittens_path(repo_top_path: Optional[str] = None) -> str: + """ + Find the ThunderKittens installation path. + + Args: + repo_top_path: Optional path to the repository root for local searches + + Returns: + Path to ThunderKittens directory + + Raises: + RuntimeError: If ThunderKittens is not found + """ + # Try environment variables first + tk_path = os.environ.get("THUNDERKITTENS_PATH") or os.environ.get("THUNDERKITTENS_ROOT") + + if not tk_path: + # Try common locations + candidates = [] + + # Add repo-relative path if provided + if repo_top_path: + candidates.append(os.path.join(repo_top_path, "ThunderKittens")) + + # Add standard locations + candidates.extend([ + "/root/ThunderKittens", + os.path.expanduser("~/ThunderKittens") + ]) + + for path in candidates: + if os.path.exists(os.path.join(path, "include", "kittens.cuh")): + tk_path = path + break + + if not tk_path or not os.path.exists(tk_path): + raise RuntimeError( + "ThunderKittens not found. Set THUNDERKITTENS_PATH or THUNDERKITTENS_ROOT " + "environment variable, or ensure ThunderKittens is in a standard location." + ) + + return tk_path + + +def compile_thunderkittens_cuda( + cuda_src_path: str, + module_name: str = "tk_kernels", + build_dir: Optional[str] = None, + verbose: bool = False, + repo_top_path: Optional[str] = None +) -> str: + """ + Compile a ThunderKittens .cu file into a Python module (local compilation). 
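+    Builds through a generated setup.py using torch.utils.cpp_extension.CUDAExtension,
+    so a working nvcc and a CUDA-enabled PyTorch install are required on the host.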
+ + Args: + cuda_src_path: Path to the .cu file + module_name: Name of the compiled module (default: tk_kernels) + build_dir: Build directory for compiled artifacts (default: temp directory) + verbose: Whether to print compilation output + repo_top_path: Optional path to repository root for finding ThunderKittens + + Returns: + Path to the directory containing the compiled module + """ + # Find ThunderKittens + tk_path = find_thunderkittens_path(repo_top_path) + print(f"[INFO] Using ThunderKittens at: {tk_path}") + + # Read the CUDA source + with open(cuda_src_path, 'r') as f: + cuda_source = f.read() + + # Create build directory + if build_dir is None: + build_dir = tempfile.mkdtemp(prefix="tk_build_") + os.makedirs(build_dir, exist_ok=True) + + # Write the CUDA source to the build directory + cu_file = os.path.join(build_dir, f"{module_name}.cu") + with open(cu_file, 'w') as f: + f.write(cuda_source) + + # Create setup.py for compilation + # Note: torch.utils.cpp_extension automatically includes pybind11 headers + # We don't need to import pybind11 - CUDAExtension handles it + setup_py = f''' +import os +from setuptools import setup +from torch.utils.cpp_extension import BuildExtension, CUDAExtension + +TK_PATH = "{tk_path}" + +setup( + name="{module_name}", + ext_modules=[ + CUDAExtension( + name="{module_name}", + sources=["{cu_file}"], + include_dirs=[ + TK_PATH, + os.path.join(TK_PATH, "include"), + ], + extra_compile_args={{ + "cxx": ["-std=c++20", "-O3", "-fPIC"], + "nvcc": [ + "-std=c++20", "-O3", + "-arch=sm_90a", + "-DNDEBUG", + "-DKITTENS_HOPPER", + "-DKITTENS_BLACKWELL", + "--expt-relaxed-constexpr", + "--expt-extended-lambda", + "-Xcompiler", "-fPIC", + "-diag-suppress=20012", + ], + }}, + extra_link_args=["-lcuda"], + language="c++", + ) + ], + cmdclass={{"build_ext": BuildExtension}}, +) +''' + + setup_file = os.path.join(build_dir, "setup.py") + with open(setup_file, 'w') as f: + f.write(setup_py) + + # Compile the extension + print(f"[INFO] Compiling {cuda_src_path} as module '{module_name}'...") + + env = os.environ.copy() + env["TORCH_CUDA_ARCH_LIST"] = "9.0" + + try: + result = subprocess.run( + [sys.executable, "setup.py", "build_ext", "--inplace"], + cwd=build_dir, + capture_output=not verbose, + text=True, + env=env + ) + + if result.returncode != 0: + print(f"[ERROR] Compilation failed:") + if result.stdout: + print(result.stdout) + if result.stderr: + print(result.stderr) + raise RuntimeError(f"Failed to compile {cuda_src_path}") + + if verbose and result.stdout: + print(result.stdout) + + except Exception as e: + raise RuntimeError(f"Failed to compile {cuda_src_path}: {e}") + + print(f"[INFO] Successfully compiled {module_name} to {build_dir}") + return build_dir + + +def compile_cuda_on_modal( + cuda_src: str, + module_name: str, + gpu_arch: list, + repo_top_path: Optional[str] = None +) -> str: + """ + Compile CUDA source on Modal using nvcc directly (matching the Makefile approach). 
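+    Unlike the local setup.py path, this shells out to nvcc and resolves the
+    pybind11 and Python include/link flags by hand inside the Modal image.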
+ + Args: + cuda_src: CUDA source code as a string + module_name: Name of the compiled module + gpu_arch: List of GPU architectures (e.g., ["Hopper"]) + repo_top_path: Optional path to repository root for finding ThunderKittens + + Returns: + Path to the directory containing the compiled module + """ + from src.utils import set_gpu_arch + + set_gpu_arch(gpu_arch) + + # Find ThunderKittens + tk_path = find_thunderkittens_path(repo_top_path) + print(f"[Modal] Using ThunderKittens at: {tk_path}") + + # Create build directory + build_dir = tempfile.mkdtemp(prefix="tk_modal_build_") + os.makedirs(build_dir, exist_ok=True) + + # Write the CUDA source + cu_file = os.path.join(build_dir, f"{module_name}.cu") + with open(cu_file, 'w') as f: + f.write(cuda_src) + + # Get pybind11 includes - try command line first, then find in site-packages + pybind11_includes = "" + try: + pybind11_result = subprocess.run( + [sys.executable, "-m", "pybind11", "--includes"], + capture_output=True, + text=True, + check=True + ) + pybind11_includes = pybind11_result.stdout.strip() + except: + # Fallback: find pybind11 in site-packages + for site_pkg in site.getsitepackages(): + pybind11_paths = glob.glob(os.path.join(site_pkg, "pybind11", "include")) + if pybind11_paths: + pybind11_includes = f"-I{pybind11_paths[0]}" + break + + # If still not found, try common locations + if not pybind11_includes: + common_paths = [ + "/usr/local/include/pybind11", + "/usr/include/pybind11", + os.path.expanduser("~/.local/include/pybind11"), + ] + for path in common_paths: + if os.path.exists(path): + pybind11_includes = f"-I{path}" + break + + if not pybind11_includes: + print("[Modal WARNING] pybind11 includes not found, compilation may fail") + + # Get Python config - try python3-config first, then python-config + python_ldflags = "" + try: + python_config_result = subprocess.run( + ["python3-config", "--ldflags"], + capture_output=True, + text=True, + check=True + ) + python_ldflags = python_config_result.stdout.strip() + except: + try: + python_config_result = subprocess.run( + ["python-config", "--ldflags"], + capture_output=True, + text=True, + check=True + ) + python_ldflags = python_config_result.stdout.strip() + except: + # Fallback - try to construct from sysconfig + python_ldflags = f"-L{sysconfig.get_config_var('LIBDIR')} -lpython{sys.version_info.major}.{sys.version_info.minor}" + + # Get Python extension suffix + try: + ext_suffix_result = subprocess.run( + ["python3-config", "--extension-suffix"], + capture_output=True, + text=True, + check=True + ) + ext_suffix = ext_suffix_result.stdout.strip() + except: + try: + ext_suffix_result = subprocess.run( + ["python-config", "--extension-suffix"], + capture_output=True, + text=True, + check=True + ) + ext_suffix = ext_suffix_result.stdout.strip() + except: + # Fallback + ext_suffix = sysconfig.get_config_var('EXT_SUFFIX') or '.so' + + # Build nvcc command matching the Makefile + output_so = os.path.join(build_dir, f"{module_name}{ext_suffix}") + + # Parse pybind11 includes (they come as "-I/path1 -I/path2") + pybind11_include_list = pybind11_includes.split() if pybind11_includes else [] + + # Parse python ldflags (they come as "-L/path -lpython3.10 ...") + python_ldflags_list = python_ldflags.split() if python_ldflags else [] + + nvcc_flags = [ + "-DNDEBUG", + "-Xcompiler", "-fPIE", + "--expt-extended-lambda", + "--expt-relaxed-constexpr", + "-Xcompiler", "-Wno-psabi", + "-Xcompiler", "-fno-strict-aliasing", + "--use_fast_math", + "-forward-unknown-to-host-compiler", + 
"-O3", + "-Xnvlink=--verbose", + "-Xptxas=--verbose", + "-Xptxas=--warn-on-spills", + "-std=c++20", + "-x", "cu", + "-lrt", "-lpthread", "-ldl", "-lcuda", "-lcudadevrt", "-lcudart_static", "-lcublas", + f"-I{tk_path}/include", + ] + + # Add prototype include if it exists + if os.path.exists(os.path.join(tk_path, "prototype")): + nvcc_flags.append(f"-I{tk_path}/prototype") + + nvcc_flags.extend(pybind11_include_list) + nvcc_flags.extend(python_ldflags_list) + nvcc_flags.extend([ + "-shared", + "-fPIC", + f"-lpython{sys.version_info.major}.{sys.version_info.minor}", + "-DKITTENS_HOPPER", + "-DKITTENS_BLACKWELL", + "-arch=sm_90a", + cu_file, + "-o", output_so + ]) + + # Filter out empty strings + nvcc_flags = [f for f in nvcc_flags if f] + + print(f"[Modal] Compiling {module_name} with nvcc...") + print(f"[Modal] Build directory: {build_dir}") + print(f"[Modal] CUDA file: {cu_file}") + print(f"[Modal] Output: {output_so}") + + # Run nvcc + result = subprocess.run( + ["nvcc"] + nvcc_flags, + cwd=build_dir, + capture_output=True, + text=True + ) + + # Always print output for debugging + if result.stdout: + print(f"[Modal] Compilation stdout:\n{result.stdout}") + if result.stderr: + print(f"[Modal] Compilation stderr:\n{result.stderr}") + + if result.returncode != 0: + print(f"[Modal ERROR] Compilation failed with return code {result.returncode}") + print(f"[Modal ERROR] Full stdout:\n{result.stdout}") + print(f"[Modal ERROR] Full stderr:\n{result.stderr}") + raise RuntimeError(f"Failed to compile CUDA module: {result.stderr[:500] if result.stderr else 'Unknown error'}") + + # Verify the .so file was created + if not os.path.exists(output_so): + raise RuntimeError(f"Compilation succeeded but .so file not found: {output_so}") + + print(f"[Modal] Successfully compiled {module_name}") + print(f"[Modal] Generated .so file: {output_so}") + return build_dir + + +def prepare_kernel_src_with_cuda( + kernel_py_src: str, + cuda_module_path: str, + module_name: str = "tk_kernels" +) -> str: + """ + Prepare the Python kernel source to use the pre-compiled CUDA module. + Adds the module path to sys.path so import works. 
+ + Args: + kernel_py_src: Original Python kernel source code + cuda_module_path: Path to the directory containing the compiled module + module_name: Name of the compiled module (default: tk_kernels) + + Returns: + Modified Python source code with import hook + """ + import_hook = f''' +import sys +import os +# Add compiled CUDA module to path +_tk_module_path = "{cuda_module_path}" +if _tk_module_path not in sys.path: + sys.path.insert(0, _tk_module_path) +''' + return import_hook + "\n" + kernel_py_src + From 1f61abc7cb2018699a343a2bfcd5ec18c3a0fe72 Mon Sep 17 00:00:00 2001 From: Willy-Chan Date: Wed, 10 Dec 2025 20:14:24 -0800 Subject: [PATCH 09/14] factored out tk compiling parts --- scripts/eval_from_generations.py | 351 ++----------------------------- 1 file changed, 14 insertions(+), 337 deletions(-) diff --git a/scripts/eval_from_generations.py b/scripts/eval_from_generations.py index 020b5d70..da288623 100644 --- a/scripts/eval_from_generations.py +++ b/scripts/eval_from_generations.py @@ -33,6 +33,13 @@ # Modal support import modal +# ThunderKittens compilation utilities +from scripts.tk_compile import ( + compile_thunderkittens_cuda, + compile_cuda_on_modal, + prepare_kernel_src_with_cuda +) + """ Batch Evaluation from Existing Generations @@ -81,6 +88,7 @@ .add_local_dir(THUNDERKITTENS_LOCAL_PATH, remote_path="/root/ThunderKittens", copy=True) .add_local_dir(KERNEL_BENCH_PATH, remote_path="/root/KernelBench") .add_local_dir(SRC_PATH, remote_path="/root/src") + .add_local_python_source("scripts") ) else: # Standard image @@ -91,6 +99,7 @@ .pip_install("pybind11") # Ensure pybind11 is available .add_local_dir(KERNEL_BENCH_PATH, remote_path="/root/KernelBench") .add_local_dir(SRC_PATH, remote_path="/root/src") + .add_local_python_source("scripts") ) @@ -168,193 +177,6 @@ class WorkArgs: device: torch.device -# Helper function for compiling CUDA on Modal using nvcc directly (like the Makefile) -def _compile_cuda_on_modal(cuda_src: str, module_name: str, gpu_arch: list): - """Compile CUDA source on Modal using nvcc directly (matching the Makefile approach)""" - import subprocess - import sys - import tempfile - from src.utils import set_gpu_arch - - set_gpu_arch(gpu_arch) - - # Find ThunderKittens - tk_path = os.environ.get("THUNDERKITTENS_PATH") or os.environ.get("THUNDERKITTENS_ROOT") or "/root/ThunderKittens" - if not os.path.exists(os.path.join(tk_path, "include", "kittens.cuh")): - raise RuntimeError(f"ThunderKittens not found at {tk_path}") - - print(f"[Modal] Using ThunderKittens at: {tk_path}") - - # Create build directory - build_dir = tempfile.mkdtemp(prefix="tk_modal_build_") - os.makedirs(build_dir, exist_ok=True) - - # Write the CUDA source - cu_file = os.path.join(build_dir, f"{module_name}.cu") - with open(cu_file, 'w') as f: - f.write(cuda_src) - - # Get pybind11 includes - try command line first, then find in site-packages - pybind11_includes = "" - try: - pybind11_result = subprocess.run( - [sys.executable, "-m", "pybind11", "--includes"], - capture_output=True, - text=True, - check=True - ) - pybind11_includes = pybind11_result.stdout.strip() - except: - # Fallback: find pybind11 in site-packages - import site - import glob - for site_pkg in site.getsitepackages(): - pybind11_paths = glob.glob(os.path.join(site_pkg, "pybind11", "include")) - if pybind11_paths: - pybind11_includes = f"-I{pybind11_paths[0]}" - break - - # If still not found, try common locations - if not pybind11_includes: - common_paths = [ - "/usr/local/include/pybind11", - 
"/usr/include/pybind11", - os.path.expanduser("~/.local/include/pybind11"), - ] - for path in common_paths: - if os.path.exists(path): - pybind11_includes = f"-I{path}" - break - - if not pybind11_includes: - print("[Modal WARNING] pybind11 includes not found, compilation may fail") - - # Get Python config - try python3-config first, then python-config - python_ldflags = "" - try: - python_config_result = subprocess.run( - ["python3-config", "--ldflags"], - capture_output=True, - text=True, - check=True - ) - python_ldflags = python_config_result.stdout.strip() - except: - try: - python_config_result = subprocess.run( - ["python-config", "--ldflags"], - capture_output=True, - text=True, - check=True - ) - python_ldflags = python_config_result.stdout.strip() - except: - # Fallback - try to construct from sysconfig - import sysconfig - python_ldflags = f"-L{sysconfig.get_config_var('LIBDIR')} -lpython{sys.version_info.major}.{sys.version_info.minor}" - - # Get Python extension suffix - try: - ext_suffix_result = subprocess.run( - ["python3-config", "--extension-suffix"], - capture_output=True, - text=True, - check=True - ) - ext_suffix = ext_suffix_result.stdout.strip() - except: - try: - ext_suffix_result = subprocess.run( - ["python-config", "--extension-suffix"], - capture_output=True, - text=True, - check=True - ) - ext_suffix = ext_suffix_result.stdout.strip() - except: - # Fallback - import sysconfig - ext_suffix = sysconfig.get_config_var('EXT_SUFFIX') or '.so' - - # Build nvcc command matching the Makefile - output_so = os.path.join(build_dir, f"{module_name}{ext_suffix}") - - # Parse pybind11 includes (they come as "-I/path1 -I/path2") - pybind11_include_list = pybind11_includes.split() if pybind11_includes else [] - - # Parse python ldflags (they come as "-L/path -lpython3.10 ...") - python_ldflags_list = python_ldflags.split() if python_ldflags else [] - - nvcc_flags = [ - "-DNDEBUG", - "-Xcompiler", "-fPIE", - "--expt-extended-lambda", - "--expt-relaxed-constexpr", - "-Xcompiler", "-Wno-psabi", - "-Xcompiler", "-fno-strict-aliasing", - "--use_fast_math", - "-forward-unknown-to-host-compiler", - "-O3", - "-Xnvlink=--verbose", - "-Xptxas=--verbose", - "-Xptxas=--warn-on-spills", - "-std=c++20", - "-x", "cu", - "-lrt", "-lpthread", "-ldl", "-lcuda", "-lcudadevrt", "-lcudart_static", "-lcublas", - f"-I{tk_path}/include", - ] - - # Add prototype include if it exists - if os.path.exists(os.path.join(tk_path, "prototype")): - nvcc_flags.append(f"-I{tk_path}/prototype") - - nvcc_flags.extend(pybind11_include_list) - nvcc_flags.extend(python_ldflags_list) - nvcc_flags.extend([ - "-shared", - "-fPIC", - f"-lpython{sys.version_info.major}.{sys.version_info.minor}", - "-DKITTENS_HOPPER", - "-arch=sm_90a", - cu_file, - "-o", output_so - ]) - - # Filter out empty strings - nvcc_flags = [f for f in nvcc_flags if f] - - print(f"[Modal] Compiling {module_name} with nvcc...") - print(f"[Modal] Build directory: {build_dir}") - print(f"[Modal] CUDA file: {cu_file}") - print(f"[Modal] Output: {output_so}") - - # Run nvcc - result = subprocess.run( - ["nvcc"] + nvcc_flags, - cwd=build_dir, - capture_output=True, - text=True - ) - - # Always print output for debugging - if result.stdout: - print(f"[Modal] Compilation stdout:\n{result.stdout}") - if result.stderr: - print(f"[Modal] Compilation stderr:\n{result.stderr}") - - if result.returncode != 0: - print(f"[Modal ERROR] Compilation failed with return code {result.returncode}") - print(f"[Modal ERROR] Full stdout:\n{result.stdout}") - print(f"[Modal 
ERROR] Full stderr:\n{result.stderr}") - raise RuntimeError(f"Failed to compile CUDA module: {result.stderr[:500] if result.stderr else 'Unknown error'}") - - # Verify the .so file was created - if not os.path.exists(output_so): - raise RuntimeError(f"Compilation succeeded but .so file not found: {output_so}") - - print(f"[Modal] Successfully compiled {module_name}") - print(f"[Modal] Generated .so file: {output_so}") - return build_dir # Modal Evaluation Class @@ -395,6 +217,7 @@ def evaluate_single_sample_modal( """ from src.eval import eval_kernel_against_ref, get_torch_dtype_from_string from src.utils import set_gpu_arch + from scripts.tk_compile import compile_cuda_on_modal, prepare_kernel_src_with_cuda import torch import time import modal.experimental @@ -420,17 +243,10 @@ def evaluate_single_sample_modal( # If CUDA source provided, compile it first if cuda_src: - cuda_module_path = _compile_cuda_on_modal(cuda_src, cuda_module_name, gpu_arch) + cuda_module_path = compile_cuda_on_modal(cuda_src, cuda_module_name, gpu_arch) # Modify kernel_src to import the compiled module - import_hook = f''' -import sys -import os -_tk_module_path = "{cuda_module_path}" -if _tk_module_path not in sys.path: - sys.path.insert(0, _tk_module_path) -''' - kernel_src = import_hook + "\n" + kernel_src + kernel_src = prepare_kernel_src_with_cuda(kernel_src, cuda_module_path, cuda_module_name) print(f"[Modal] Modified kernel source to use compiled module at {cuda_module_path}") gpu_corrupted = False @@ -502,146 +318,6 @@ def fetch_ref_arch_from_problem_id( return ref_arch_src -def compile_thunderkittens_cuda(cuda_src_path: str, module_name: str = "tk_kernels", - build_dir: str = None, verbose: bool = False) -> str: - """ - Compile a ThunderKittens .cu file into a Python module (for local evaluation). - - Args: - cuda_src_path: Path to the .cu file - module_name: Name of the compiled module (default: tk_kernels) - build_dir: Build directory for compiled artifacts - verbose: Whether to print compilation output - - Returns: - Path to the directory containing the compiled module - """ - import subprocess - import sys - import tempfile - - # Find ThunderKittens - tk_path = os.environ.get("THUNDERKITTENS_PATH") or os.environ.get("THUNDERKITTENS_ROOT") - if not tk_path: - # Try common locations - candidates = [ - os.path.join(REPO_TOP_DIR, "ThunderKittens"), - os.path.expanduser("~/ThunderKittens") - ] - for path in candidates: - if os.path.exists(os.path.join(path, "include", "kittens.cuh")): - tk_path = path - break - - if not tk_path or not os.path.exists(tk_path): - raise RuntimeError(f"ThunderKittens not found. 
Set THUNDERKITTENS_PATH environment variable.") - - print(f"[INFO] Using ThunderKittens at: {tk_path}") - - # Read the CUDA source - with open(cuda_src_path, 'r') as f: - cuda_source = f.read() - - # Create build directory - if build_dir is None: - build_dir = tempfile.mkdtemp(prefix="tk_build_") - os.makedirs(build_dir, exist_ok=True) - - # Write the CUDA source to the build directory - cu_file = os.path.join(build_dir, f"{module_name}.cu") - with open(cu_file, 'w') as f: - f.write(cuda_source) - - # Create setup.py for compilation - setup_py = f''' -import os -from setuptools import setup -from torch.utils.cpp_extension import BuildExtension, CUDAExtension - -TK_PATH = "{tk_path}" - -setup( - name="{module_name}", - ext_modules=[ - CUDAExtension( - name="{module_name}", - sources=["{cu_file}"], - include_dirs=[ - TK_PATH, - os.path.join(TK_PATH, "include"), - ], - extra_compile_args={{ - "cxx": ["-std=c++20", "-O3", "-fPIC"], - "nvcc": [ - "-std=c++20", "-O3", - "-arch=sm_90a", - "-DNDEBUG", - "-DKITTENS_HOPPER", - "--expt-relaxed-constexpr", - "--expt-extended-lambda", - "-Xcompiler", "-fPIC", - "-diag-suppress=20012", - ], - }}, - extra_link_args=["-lcuda"], - language="c++", - ) - ], - cmdclass={{"build_ext": BuildExtension}}, -) -''' - - setup_file = os.path.join(build_dir, "setup.py") - with open(setup_file, 'w') as f: - f.write(setup_py) - - # Compile the extension - print(f"[INFO] Compiling {cuda_src_path} as module '{module_name}'...") - - env = os.environ.copy() - env["TORCH_CUDA_ARCH_LIST"] = "9.0" - - try: - result = subprocess.run( - [sys.executable, "setup.py", "build_ext", "--inplace"], - cwd=build_dir, - capture_output=not verbose, - text=True, - env=env - ) - - if result.returncode != 0: - print(f"[ERROR] Compilation failed:") - if result.stdout: - print(result.stdout) - if result.stderr: - print(result.stderr) - raise RuntimeError(f"Failed to compile {cuda_src_path}") - - if verbose and result.stdout: - print(result.stdout) - - except Exception as e: - raise RuntimeError(f"Failed to compile {cuda_src_path}: {e}") - - print(f"[INFO] Successfully compiled {module_name} to {build_dir}") - return build_dir - - -def prepare_kernel_src_with_cuda(kernel_py_src: str, cuda_module_path: str, module_name: str = "tk_kernels") -> str: - """ - Prepare the Python kernel source to use the pre-compiled CUDA module. - Adds the module path to sys.path so import works. 
- """ - import_hook = f''' -import sys -import os -# Add compiled CUDA module to path -_tk_module_path = "{cuda_module_path}" -if _tk_module_path not in sys.path: - sys.path.insert(0, _tk_module_path) -''' - return import_hook + "\n" + kernel_py_src def fetch_kernel_from_disk( @@ -708,7 +384,8 @@ def evaluate_single_sample( cuda_src_path=cuda_src_path, module_name="tk_kernels", build_dir=cuda_build_dir, - verbose=configs.verbose + verbose=configs.verbose, + repo_top_path=REPO_TOP_DIR ) # Modify kernel_src to import the compiled module From 48a7c582c144336ad92876056bd8e52e076f8462 Mon Sep 17 00:00:00 2001 From: Willy-Chan Date: Fri, 12 Dec 2025 12:58:17 -0800 Subject: [PATCH 10/14] eval from generations cosmetic changes --- scripts/eval_from_generations.py | 36 ++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/scripts/eval_from_generations.py b/scripts/eval_from_generations.py index da288623..48fb9d4c 100644 --- a/scripts/eval_from_generations.py +++ b/scripts/eval_from_generations.py @@ -67,7 +67,7 @@ operating_sys = "ubuntu22.04" tag = f"{cuda_version}-{flavor}-{operating_sys}" -# ThunderKittens support - use TK image if directory exists locally +# ThunderKittens support: Current method uses custom TK image if the TK directory exists locally THUNDERKITTENS_LOCAL_PATH = os.path.join(REPO_TOP_DIR, "ThunderKittens") SRC_PATH = os.path.join(REPO_TOP_DIR, "src") @@ -75,7 +75,11 @@ # ThunderKittens image with TK environment and mounting image = ( modal.Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.10") - .apt_install("git", "gcc-10", "g++-10", "clang") + .apt_install("git", + "gcc-10", + "g++-10", + "clang" + ) .pip_install_from_requirements(os.path.join(REPO_TOP_DIR, "requirements.txt")) .pip_install("pybind11") # Ensure pybind11 is available for ThunderKittens compilation .env({ @@ -85,8 +89,15 @@ "CXX": "g++-10", "CC": "gcc-10", }) - .add_local_dir(THUNDERKITTENS_LOCAL_PATH, remote_path="/root/ThunderKittens", copy=True) - .add_local_dir(KERNEL_BENCH_PATH, remote_path="/root/KernelBench") + .add_local_dir( + THUNDERKITTENS_LOCAL_PATH, + remote_path="/root/ThunderKittens", + copy=True + ) + .add_local_dir( + KERNEL_BENCH_PATH, + remote_path="/root/KernelBench" + ) .add_local_dir(SRC_PATH, remote_path="/root/src") .add_local_python_source("scripts") ) @@ -94,12 +105,17 @@ # Standard image image = ( modal.Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.10") - .apt_install("git", "gcc-10", "g++-10", "clang") + .apt_install("git", + "gcc-10", + "g++-10", + "clang" + ) .pip_install_from_requirements(os.path.join(REPO_TOP_DIR, "requirements.txt")) - .pip_install("pybind11") # Ensure pybind11 is available - .add_local_dir(KERNEL_BENCH_PATH, remote_path="/root/KernelBench") - .add_local_dir(SRC_PATH, remote_path="/root/src") - .add_local_python_source("scripts") + .add_local_dir( + KERNEL_BENCH_PATH, + remote_path="/root/KernelBench" + ) + .add_local_python_source("src") ) @@ -177,8 +193,6 @@ class WorkArgs: device: torch.device - - # Modal Evaluation Class # GPU must be specified here for all instances # Retries are configured at the class level to handle GPU attachment failures From ce33e14973d3c32faefbb021652056d47c827b14 Mon Sep 17 00:00:00 2001 From: Willy-Chan Date: Fri, 12 Dec 2025 13:32:22 -0800 Subject: [PATCH 11/14] factored out tk compile part --- .../generate_and_eval_single_sample_modal.py | 198 +----------------- 1 file changed, 9 insertions(+), 189 deletions(-) diff --git a/scripts/generate_and_eval_single_sample_modal.py 
b/scripts/generate_and_eval_single_sample_modal.py index 48b40a31..57412c10 100644 --- a/scripts/generate_and_eval_single_sample_modal.py +++ b/scripts/generate_and_eval_single_sample_modal.py @@ -27,6 +27,7 @@ REPO_TOP_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) KERNEL_BENCH_PATH = os.path.join(REPO_TOP_DIR, "KernelBench") +SCRIPTS_PATH = os.path.join(REPO_TOP_DIR, "scripts") torch.set_printoptions(precision=4, threshold=10) @@ -118,6 +119,7 @@ def __repr__(self): .add_local_dir(THUNDERKITTENS_LOCAL_PATH, remote_path="/root/ThunderKittens", copy=True) .add_local_dir(KERNEL_BENCH_PATH, remote_path="/root/KernelBench") .add_local_dir(SRC_PATH, remote_path="/root/src") + .add_local_dir(SCRIPTS_PATH, remote_path="/root/scripts") ) else: # Standard image @@ -128,196 +130,9 @@ def __repr__(self): .pip_install("pybind11") # Ensure pybind11 is available .add_local_dir(KERNEL_BENCH_PATH, remote_path="/root/KernelBench") .add_local_dir(SRC_PATH, remote_path="/root/src") + .add_local_dir(SCRIPTS_PATH, remote_path="/root/scripts") ) -# Helper function for compiling CUDA on Modal using nvcc directly (like the Makefile) -def _compile_cuda_on_modal(cuda_src: str, module_name: str, gpu_arch: list): - """Compile CUDA source on Modal using nvcc directly (matching the Makefile approach)""" - import subprocess - import sys - import tempfile - from src.utils import set_gpu_arch - - set_gpu_arch(gpu_arch) - - # Find ThunderKittens - tk_path = os.environ.get("THUNDERKITTENS_PATH") or os.environ.get("THUNDERKITTENS_ROOT") or "/root/ThunderKittens" - if not os.path.exists(os.path.join(tk_path, "include", "kittens.cuh")): - raise RuntimeError(f"ThunderKittens not found at {tk_path}") - - print(f"[Modal] Using ThunderKittens at: {tk_path}") - - # Create build directory - build_dir = tempfile.mkdtemp(prefix="tk_modal_build_") - os.makedirs(build_dir, exist_ok=True) - - # Write the CUDA source - cu_file = os.path.join(build_dir, f"{module_name}.cu") - with open(cu_file, 'w') as f: - f.write(cuda_src) - - # Get pybind11 includes - try command line first, then find in site-packages - pybind11_includes = "" - try: - pybind11_result = subprocess.run( - [sys.executable, "-m", "pybind11", "--includes"], - capture_output=True, - text=True, - check=True - ) - pybind11_includes = pybind11_result.stdout.strip() - except: - # Fallback: find pybind11 in site-packages - import site - import glob - for site_pkg in site.getsitepackages(): - pybind11_paths = glob.glob(os.path.join(site_pkg, "pybind11", "include")) - if pybind11_paths: - pybind11_includes = f"-I{pybind11_paths[0]}" - break - - # If still not found, try common locations - if not pybind11_includes: - common_paths = [ - "/usr/local/include/pybind11", - "/usr/include/pybind11", - os.path.expanduser("~/.local/include/pybind11"), - ] - for path in common_paths: - if os.path.exists(path): - pybind11_includes = f"-I{path}" - break - - if not pybind11_includes: - print("[Modal WARNING] pybind11 includes not found, compilation may fail") - - # Get Python config - try python3-config first, then python-config - python_ldflags = "" - try: - python_config_result = subprocess.run( - ["python3-config", "--ldflags"], - capture_output=True, - text=True, - check=True - ) - python_ldflags = python_config_result.stdout.strip() - except: - try: - python_config_result = subprocess.run( - ["python-config", "--ldflags"], - capture_output=True, - text=True, - check=True - ) - python_ldflags = python_config_result.stdout.strip() - except: - # Fallback - try to construct 
from sysconfig - import sysconfig - python_ldflags = f"-L{sysconfig.get_config_var('LIBDIR')} -lpython{sys.version_info.major}.{sys.version_info.minor}" - - # Get Python extension suffix - try: - ext_suffix_result = subprocess.run( - ["python3-config", "--extension-suffix"], - capture_output=True, - text=True, - check=True - ) - ext_suffix = ext_suffix_result.stdout.strip() - except: - try: - ext_suffix_result = subprocess.run( - ["python-config", "--extension-suffix"], - capture_output=True, - text=True, - check=True - ) - ext_suffix = ext_suffix_result.stdout.strip() - except: - # Fallback - import sysconfig - ext_suffix = sysconfig.get_config_var('EXT_SUFFIX') or '.so' - - # Build nvcc command matching the Makefile - output_so = os.path.join(build_dir, f"{module_name}{ext_suffix}") - - # Parse pybind11 includes (they come as "-I/path1 -I/path2") - pybind11_include_list = pybind11_includes.split() if pybind11_includes else [] - - # Parse python ldflags (they come as "-L/path -lpython3.10 ...") - python_ldflags_list = python_ldflags.split() if python_ldflags else [] - - nvcc_flags = [ - "-DNDEBUG", - "-Xcompiler", "-fPIE", - "--expt-extended-lambda", - "--expt-relaxed-constexpr", - "-Xcompiler", "-Wno-psabi", - "-Xcompiler", "-fno-strict-aliasing", - "--use_fast_math", - "-forward-unknown-to-host-compiler", - "-O3", - "-Xnvlink=--verbose", - "-Xptxas=--verbose", - "-Xptxas=--warn-on-spills", - "-std=c++20", - "-x", "cu", - "-lrt", "-lpthread", "-ldl", "-lcuda", "-lcudadevrt", "-lcudart_static", "-lcublas", - f"-I{tk_path}/include", - ] - - # Add prototype include if it exists - if os.path.exists(os.path.join(tk_path, "prototype")): - nvcc_flags.append(f"-I{tk_path}/prototype") - - nvcc_flags.extend(pybind11_include_list) - nvcc_flags.extend(python_ldflags_list) - nvcc_flags.extend([ - "-shared", - "-fPIC", - f"-lpython{sys.version_info.major}.{sys.version_info.minor}", - "-DKITTENS_HOPPER", - "-arch=sm_90a", - cu_file, - "-o", output_so - ]) - - # Filter out empty strings - nvcc_flags = [f for f in nvcc_flags if f] - - print(f"[Modal] Compiling {module_name} with nvcc...") - print(f"[Modal] Build directory: {build_dir}") - print(f"[Modal] CUDA file: {cu_file}") - print(f"[Modal] Output: {output_so}") - - # Run nvcc - result = subprocess.run( - ["nvcc"] + nvcc_flags, - cwd=build_dir, - capture_output=True, - text=True - ) - - # Always print output for debugging - if result.stdout: - print(f"[Modal] Compilation stdout:\n{result.stdout}") - if result.stderr: - print(f"[Modal] Compilation stderr:\n{result.stderr}") - - if result.returncode != 0: - print(f"[Modal ERROR] Compilation failed with return code {result.returncode}") - print(f"[Modal ERROR] Full stdout:\n{result.stdout}") - print(f"[Modal ERROR] Full stderr:\n{result.stderr}") - raise RuntimeError(f"Failed to compile CUDA module: {result.stderr[:500] if result.stderr else 'Unknown error'}") - - # Verify the .so file was created - if not os.path.exists(output_so): - raise RuntimeError(f"Compilation succeeded but .so file not found: {output_so}") - - print(f"[Modal] Successfully compiled {module_name}") - print(f"[Modal] Generated .so file: {output_so}") - return build_dir - @app.cls(image=image) class EvalFunc: @@ -334,11 +149,16 @@ def eval_single_sample_modal(self, ref_arch_src, custom_kernel, verbose, gpu_arc import sys import os + # Add scripts directory to path for importing tk_compile + if "/root/scripts" not in sys.path: + sys.path.insert(0, "/root/scripts") + from tk_compile import compile_cuda_on_modal + 
modal_set_gpu_arch(gpu_arch) # If CUDA source provided, compile it first (for ThunderKittens) if cuda_src: - cuda_module_path = _compile_cuda_on_modal(cuda_src, cuda_module_name, gpu_arch) + cuda_module_path = compile_cuda_on_modal(cuda_src, cuda_module_name, gpu_arch, repo_top_path="/root") # Modify kernel_src to import the compiled module import_hook = f''' From f2eefb92dc8b3ca4e4fc61c3d2fd76bff3978449 Mon Sep 17 00:00:00 2001 From: Willy-Chan Date: Fri, 12 Dec 2025 13:43:05 -0800 Subject: [PATCH 12/14] simplified generate_samples by removing fallbacks --- scripts/generate_samples.py | 79 ++++--------------------------------- 1 file changed, 7 insertions(+), 72 deletions(-) diff --git a/scripts/generate_samples.py b/scripts/generate_samples.py index 80de44c9..5ee479fc 100644 --- a/scripts/generate_samples.py +++ b/scripts/generate_samples.py @@ -170,52 +170,20 @@ def generate_sample_single( with open(raw_response_path, "w") as f: f.write(custom_kernel_response) - # Try to extract both code blocks + # Extract both code blocks cuda_code, python_code = extract_cuda_and_python_code(custom_kernel_response) - # Fallback: if extraction failed, try to extract a single code block and split it - if cuda_code is None or python_code is None: - print(f"[WARNING] Failed to extract both code blocks for problem {work.problem_id} sample {work.sample_id}") - print(f" - CUDA code found: {cuda_code is not None}") - print(f" - Python code found: {python_code is not None}") - - # Try fallback: extract first code block and see if we can split it - single_code = extract_first_code(custom_kernel_response, ["python", "cpp", "cuda", "cu"]) - if single_code: - # Try to split by looking for PYBIND11_MODULE or other markers - if "PYBIND11_MODULE" in single_code: - # This looks like CUDA code, try to find Python code separately - if python_code is None: - # Try to extract Python code from remaining response - python_code = extract_first_code(custom_kernel_response.replace(single_code, ""), ["python"]) - if cuda_code is None: - cuda_code = single_code - elif "import torch" in single_code or "class ModelNew" in single_code: - # This looks like Python code - if python_code is None: - python_code = single_code - # Try to find CUDA code - if cuda_code is None: - # Look for other code blocks - remaining = custom_kernel_response.replace(f"```python\n{single_code}\n```", "") - cuda_code = extract_first_code(remaining, ["cpp", "cuda", "cu"]) - else: - # Unknown format, try to use as CUDA if we don't have it - if cuda_code is None: - cuda_code = single_code + # Check LLM is able to generate both CUDA and Python code + assert cuda_code is not None, "Custom CUDA code generation failed" + assert python_code is not None, "Custom Python code generation failed" - # Write out both files even if empty (for debugging) # Store CUDA file (.cu) cuda_path = os.path.join( run_dir, f"level_{config.level}_problem_{work.problem_id}_sample_{work.sample_id}_kernel.cu", ) with open(cuda_path, "w") as f: - if cuda_code: - f.write(cuda_code) - else: - f.write(f"# CUDA code extraction failed for problem {work.problem_id} sample {work.sample_id}\n") - f.write(f"# Raw response saved to: {os.path.basename(raw_response_path)}\n") + f.write(cuda_code) # Store Python file (.py) kernel_path = os.path.join( @@ -223,29 +191,12 @@ def generate_sample_single( f"level_{config.level}_problem_{work.problem_id}_sample_{work.sample_id}_kernel.py", ) with open(kernel_path, "w") as f: - if python_code: - f.write(python_code) - else: - f.write(f"# Python code 
extraction failed for problem {work.problem_id} sample {work.sample_id}\n") - f.write(f"# Raw response saved to: {os.path.basename(raw_response_path)}\n") + f.write(python_code) if config.verbose: print( f"Generated sample {work.sample_id} for problem {problem_number}: {problem_name}" ) - if cuda_code: - print(f" - CUDA code: {len(cuda_code)} characters") - else: - print(f" - CUDA code: NOT FOUND") - if python_code: - print(f" - Python code: {len(python_code)} characters") - else: - print(f" - Python code: NOT FOUND") - print(f" - Raw response saved to: {os.path.basename(raw_response_path)}") - - # Warn if extraction failed but don't fail - if cuda_code is None or python_code is None: - print(f"[WARNING] Partial extraction for problem {work.problem_id} sample {work.sample_id}. Check raw_response.txt file.") else: # For other backends, extract single code block (Python or inline CUDA) custom_kernel = extract_first_code(custom_kernel_response, ["python", "cpp"]) @@ -458,20 +409,4 @@ def main(config: GenerationConfig): if __name__ == "__main__": - main() - - - -# python scripts/generate_samples.py \ -# run_name=new_tk \ -# runs_dir=/Users/willychan/Desktop/projects/dsl-monkeys/runs \ -# dataset_src=local \ -# level=1 \ -# subset="(50,50)" \ -# num_samples=1 \ -# num_workers=50 \ -# server_type=google \ -# model_name=gemini/gemini-3-pro-preview \ -# temperature=1.0 \ -# max_tokens=60000 \ -# backend=thunderkittens \ No newline at end of file + main() \ No newline at end of file From d2d3f1d688a3b440aa584985e901156439dfdeba Mon Sep 17 00:00:00 2001 From: Willy-Chan Date: Fri, 12 Dec 2025 13:49:58 -0800 Subject: [PATCH 13/14] removed some enters --- scripts/run_and_check.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/run_and_check.py b/scripts/run_and_check.py index 3907c5de..014b3936 100644 --- a/scripts/run_and_check.py +++ b/scripts/run_and_check.py @@ -105,8 +105,6 @@ torch.set_printoptions(precision=4, threshold=10) - - class ScriptConfig(Config): def __init__(self): From 6bcba596878bcc4e5ab6d5b0ae596fdfc44b18fe Mon Sep 17 00:00:00 2001 From: Willy-Chan Date: Wed, 17 Dec 2025 14:31:58 -0500 Subject: [PATCH 14/14] added vec-add TK CUDA example and modified prompts accordingly --- scripts/tk_compile.py | 32 ++++++++- src/prompt_constructor_toml.py | 12 ++++ .../model_new_ex_add_thunderkittens.cu | 69 +++++++++++++++++++ .../model_new_ex_add_thunderkittens.py | 7 +- src/prompts/prompts.toml | 1 + 5 files changed, 118 insertions(+), 3 deletions(-) create mode 100644 src/prompts/model_new_ex_add_thunderkittens.cu diff --git a/scripts/tk_compile.py b/scripts/tk_compile.py index ddb57394..ffc0ffd2 100644 --- a/scripts/tk_compile.py +++ b/scripts/tk_compile.py @@ -298,6 +298,32 @@ def compile_cuda_on_modal( # Parse python ldflags (they come as "-L/path -lpython3.10 ...") python_ldflags_list = python_ldflags.split() if python_ldflags else [] + # Get torch include and library paths + torch_includes = [] + torch_lib_flags = [] + try: + import torch + torch_dir = os.path.dirname(torch.__file__) + torch_include_path = os.path.join(torch_dir, "include") + torch_lib_path = os.path.join(torch_dir, "lib") + + if os.path.exists(torch_include_path): + torch_includes.append(f"-I{torch_include_path}") + # Also add torch/csrc/api/include for ATen headers + torch_csrc_include = os.path.join(torch_include_path, "torch", "csrc", "api", "include") + if os.path.exists(torch_csrc_include): + torch_includes.append(f"-I{torch_csrc_include}") + + if os.path.exists(torch_lib_path): + 
torch_lib_flags.append(f"-L{torch_lib_path}")
+            torch_lib_flags.append("-ltorch")
+            torch_lib_flags.append("-lc10")
+            torch_lib_flags.append("-ltorch_cpu")
+            torch_lib_flags.append("-ltorch_python")
+            torch_lib_flags.append(f"-Wl,-rpath,{torch_lib_path}")
+    except ImportError:
+        print("[Modal WARNING] torch not found, compilation may fail")
+
     nvcc_flags = [
         "-DNDEBUG",
         "-Xcompiler", "-fPIE",
@@ -312,6 +338,8 @@ def compile_cuda_on_modal(
         "-Xptxas=--verbose",
         "-Xptxas=--warn-on-spills",
         "-std=c++20",
+        "-diag-suppress=3189",  # Suppress C++20 module keyword warning from torch headers (harmless)
+        "-diag-suppress=2361",  # Suppress narrowing conversion warnings (harmless)
         "-x", "cu",
         "-lrt", "-lpthread", "-ldl", "-lcuda", "-lcudadevrt", "-lcudart_static", "-lcublas",
         f"-I{tk_path}/include",
@@ -322,6 +350,8 @@ def compile_cuda_on_modal(
         nvcc_flags.append(f"-I{tk_path}/prototype")
 
     nvcc_flags.extend(pybind11_include_list)
+    nvcc_flags.extend(torch_includes)
+    nvcc_flags.extend(torch_lib_flags)
     nvcc_flags.extend(python_ldflags_list)
     nvcc_flags.extend([
         "-shared",
@@ -360,7 +390,7 @@ def compile_cuda_on_modal(
         print(f"[Modal ERROR] Compilation failed with return code {result.returncode}")
         print(f"[Modal ERROR] Full stdout:\n{result.stdout}")
         print(f"[Modal ERROR] Full stderr:\n{result.stderr}")
-        raise RuntimeError(f"Failed to compile CUDA module: {result.stderr[:500] if result.stderr else 'Unknown error'}")
+        raise RuntimeError(f"Failed to compile CUDA module: {result.stderr[:2000] if result.stderr else 'Unknown error'}")
 
     # Verify the .so file was created
     if not os.path.exists(output_so):
diff --git a/src/prompt_constructor_toml.py b/src/prompt_constructor_toml.py
index 5051e17c..1dec610b 100644
--- a/src/prompt_constructor_toml.py
+++ b/src/prompt_constructor_toml.py
@@ -274,6 +274,12 @@ def render_example_entry(input_code: str, output_code: str, example_label: str)
         ex_new_path = _abs_path(backend_data["one_shot_new_arch"])
         input_code = read_file(ex_arch_path)
         output_code = read_file(ex_new_path)
+
+        # For backends with a separate CUDA file (e.g., ThunderKittens), include both
+        if "one_shot_cuda_arch" in backend_data:
+            cuda_code = read_file(_abs_path(backend_data["one_shot_cuda_arch"]))
+            output_code = f"C++/CUDA file:\n\n```cpp\n{cuda_code}\n```\n\nPython file:\n\n```python\n{output_code}\n```"
+
         examples_entries.append(
             render_example_entry(input_code, output_code, "Example:")
         )
@@ -286,6 +292,12 @@ def render_example_entry(input_code: str, output_code: str, example_label: str)
         ex_new_path = _abs_path(backend_data["one_shot_new_arch"])
         input_code = read_file(ex_arch_path)
         output_code = read_file(ex_new_path)
+
+        # For backends with a separate CUDA file (e.g., ThunderKittens), include both
+        if "one_shot_cuda_arch" in backend_data:
+            cuda_code = read_file(_abs_path(backend_data["one_shot_cuda_arch"]))
+            output_code = f"C++/CUDA file:\n\n```cpp\n{cuda_code}\n```\n\nPython file:\n\n```python\n{output_code}\n```"
+
         examples_entries.append(
             render_example_entry(input_code, output_code, "Example:")
         )
diff --git a/src/prompts/model_new_ex_add_thunderkittens.cu b/src/prompts/model_new_ex_add_thunderkittens.cu
new file mode 100644
index 00000000..ef59d040
--- /dev/null
+++ b/src/prompts/model_new_ex_add_thunderkittens.cu
@@ -0,0 +1,69 @@
+#include "kittens.cuh"
+#include <torch/extension.h>
+#include <pybind11/pybind11.h>
+
+using namespace kittens;
+
+constexpr int BLOCK_SIZE = 16;
+#define NUM_WORKERS (1)
+#define NUM_THREADS (NUM_WORKERS * kittens::WARP_THREADS)
+
+struct add_globals {
+    using sub_tile = st_bf<BLOCK_SIZE, BLOCK_SIZE>;
+    using tile_gl = gl<bf16, 1, 1, -1, -1, sub_tile>;
+    tile_gl A;
+    tile_gl B;
+    tile_gl C;
+};
+
+__global__ void add_tk(const __grid_constant__ add_globals g) {
+    extern __shared__ alignment_dummy __shm[];
+    shared_allocator al((int*)&__shm[0]);
+
+    st_bf<BLOCK_SIZE, BLOCK_SIZE> &As = al.allocate<st_bf<BLOCK_SIZE, BLOCK_SIZE>>();
+    st_bf<BLOCK_SIZE, BLOCK_SIZE> &Bs = al.allocate<st_bf<BLOCK_SIZE, BLOCK_SIZE>>();
+    st_bf<BLOCK_SIZE, BLOCK_SIZE> &Cs = al.allocate<st_bf<BLOCK_SIZE, BLOCK_SIZE>>();
+
+    rt_bf<BLOCK_SIZE, BLOCK_SIZE> A_reg;
+    rt_bf<BLOCK_SIZE, BLOCK_SIZE> B_reg;
+    rt_bf<BLOCK_SIZE, BLOCK_SIZE> C_reg;
+
+    int col = blockIdx.x;
+    int row = blockIdx.y;
+
+    // Load A and B tiles from global to shared
+    kittens::warp::load(As, g.A, {0, 0, row, col});
+    kittens::warp::load(Bs, g.B, {0, 0, row, col});
+    __syncthreads();
+
+    // Load from shared to register
+    kittens::warp::load(A_reg, As);
+    kittens::warp::load(B_reg, Bs);
+    __syncthreads();
+
+    // Element-wise add: C = A + B
+    kittens::warp::add(C_reg, A_reg, B_reg);
+    __syncthreads();
+
+    // Store result back to global
+    kittens::warp::store(g.C, C_reg, {0, 0, row, col});
+}
+
+void dispatch_add(torch::Tensor A, torch::Tensor B, torch::Tensor C, int M, int N) {
+    using tile_gl = add_globals::tile_gl;
+    tile_gl a_arg{(bf16*)A.data_ptr(), nullptr, nullptr, (size_t)M, (size_t)N};
+    tile_gl b_arg{(bf16*)B.data_ptr(), nullptr, nullptr, (size_t)M, (size_t)N};
+    tile_gl c_arg{(bf16*)C.data_ptr(), nullptr, nullptr, (size_t)M, (size_t)N};
+    add_globals g{a_arg, b_arg, c_arg};
+
+    dim3 blocks((N + BLOCK_SIZE - 1) / BLOCK_SIZE, (M + BLOCK_SIZE - 1) / BLOCK_SIZE);
+    unsigned long mem_size = 50480;
+    cudaFuncSetAttribute(add_tk, cudaFuncAttributeMaxDynamicSharedMemorySize, mem_size);
+    add_tk<<<blocks, NUM_THREADS, mem_size>>>(g);
+    cudaDeviceSynchronize();
+}
+
+PYBIND11_MODULE(tk_kernels, m) {
+    m.doc() = "ThunderKittens element-wise add kernel";
+    m.def("dispatch_add", &dispatch_add, "Element-wise add using ThunderKittens");
+}
diff --git a/src/prompts/model_new_ex_add_thunderkittens.py b/src/prompts/model_new_ex_add_thunderkittens.py
index 00a41c1b..3996f4b8 100644
--- a/src/prompts/model_new_ex_add_thunderkittens.py
+++ b/src/prompts/model_new_ex_add_thunderkittens.py
@@ -14,9 +14,12 @@ def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
         M, N = A.shape
         assert A.shape == B.shape, "Input tensors must have the same shape"
 
-        C = torch.zeros((M, N), device=A.device, dtype=torch.float32).contiguous()
+        # ThunderKittens uses bfloat16 for tile operations
+        A_bf16 = A.to(torch.bfloat16).contiguous()
+        B_bf16 = B.to(torch.bfloat16).contiguous()
+        C = torch.zeros((M, N), device=A.device, dtype=torch.bfloat16).contiguous()
 
         # Call into TK pybind wrapper
-        tk_kernels.dispatch_add(A, B, C, int(M), int(N))
+        tk_kernels.dispatch_add(A_bf16, B_bf16, C, int(M), int(N))
 
         return C
diff --git a/src/prompts/prompts.toml b/src/prompts/prompts.toml
index 6bcfcd88..5023fff0 100644
--- a/src/prompts/prompts.toml
+++ b/src/prompts/prompts.toml
@@ -342,6 +342,7 @@ one_shot_new_arch = "src/prompts/model_new_ex_add_tilelang.py"
 [backends.thunderkittens]
 backend_display = "ThunderKittens kernels"
 one_shot_new_arch = "src/prompts/model_new_ex_add_thunderkittens.py"
+one_shot_cuda_arch = "src/prompts/model_new_ex_add_thunderkittens.cu"
 # No few_shot_examples - will use one-shot when few_shot option is selected
 
 # -------------------------------------------------------------------------
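For completeness, a minimal local smoke test for the new one-shot pair, using the helpers this series factors into scripts/tk_compile.py. This is a sketch, not part of the patches: the relative paths, the repo_top_path argument, the 64x64 problem size, and the loose bf16 tolerances are assumptions, and it presumes an sm_90a (H100) GPU with a ThunderKittens checkout reachable via THUNDERKITTENS_PATH.

```python
# Sketch only: compile the one-shot .cu example, prepend the import hook,
# and check the resulting ModelNew against a plain bf16 add.
import torch
from scripts.tk_compile import compile_thunderkittens_cuda, prepare_kernel_src_with_cuda

build_dir = compile_thunderkittens_cuda(
    cuda_src_path="src/prompts/model_new_ex_add_thunderkittens.cu",
    module_name="tk_kernels",
    repo_top_path=".",  # assumed: run from the repo root, as eval_from_generations.py does
)

# Prepend the sys.path hook so the example's `import tk_kernels` resolves.
with open("src/prompts/model_new_ex_add_thunderkittens.py") as f:
    kernel_src = prepare_kernel_src_with_cuda(f.read(), build_dir, "tk_kernels")

ns = {}
exec(kernel_src, ns)  # defines ModelNew
model = ns["ModelNew"]()

A = torch.randn(64, 64, device="cuda")
B = torch.randn(64, 64, device="cuda")
# The kernel computes in bf16, so compare against a bf16 reference with loose tolerances.
ref = (A.to(torch.bfloat16) + B.to(torch.bfloat16)).float()
torch.testing.assert_close(model(A, B).float(), ref, rtol=1e-2, atol=1e-2)
```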