From 509ed996193e3a1bf6b152622c39a3e8e4219ead Mon Sep 17 00:00:00 2001
From: pythonomar22 <omarabulhassan@gmail.com>
Date: Thu, 20 Nov 2025 18:20:42 -0800
Subject: [PATCH 1/5] before testing

---
 .../A10G_modal/baseline_time_torch.json       | 904 ++++++++++++++++++
 ...e_time_torch_compile_inductor_default.json | 904 ++++++++++++++++++
 .../H100_modal/baseline_time_torch.json       | 904 ++++++++++++++++++
 ...e_time_torch_compile_inductor_default.json | 904 ++++++++++++++++++
 scripts/benchmark_eval_analysis.py            |  45 +-
 scripts/eval_from_generations.py              |  25 +-
 scripts/generate_and_eval_single_sample.py    |   5 +-
 .../generate_and_eval_single_sample_modal.py  |   7 +-
 scripts/generate_baseline_time.py             |  21 +-
 scripts/generate_baseline_time_modal.py       |  18 +-
 scripts/generate_samples.py                   |  22 +-
 scripts/inspect_baseline.py                   |   5 +-
 scripts/inspect_triton.py                     |  20 +-
 scripts/run_and_check.py                      |  26 +-
 scripts/verify_bench.py                       |  58 +-
 src/dataset.py                                |  54 +-
 src/eval.py                                   |  19 +-
 17 files changed, 3807 insertions(+), 134 deletions(-)
 create mode 100644 results/timing/A10G_modal/baseline_time_torch.json
 create mode 100644 results/timing/A10G_modal/baseline_time_torch_compile_inductor_default.json
 create mode 100644 results/timing/H100_modal/baseline_time_torch.json
 create mode 100644 results/timing/H100_modal/baseline_time_torch_compile_inductor_default.json

diff --git a/results/timing/A10G_modal/baseline_time_torch.json b/results/timing/A10G_modal/baseline_time_torch.json
new file mode 100644
index 00000000..327a00c2
--- /dev/null
+++ b/results/timing/A10G_modal/baseline_time_torch.json
@@ -0,0 +1,904 @@
+{
+    "level1": {
+        "1_Square_matrix_multiplication_.py": {
+            "mean": 5.78,
+            "std": 0.0635,
+            "min": 5.55,
+            "max": 5.91,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "2_Standard_matrix_multiplication_.py": {
+            "mean": 8.47,
+            "std": 0.293,
+            "min": 6.99,
+            "max": 9.51,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "3_Batched_matrix_multiplication.py": {
+            "mean": 14.0,
+            "std": 0.169,
+            "min": 13.5,
+            "max": 14.5,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "4_Matrix_vector_multiplication_.py": {
+            "mean": 25.5,
+            "std": 0.157,
+            "min": 25.0,
+            "max": 25.9,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "5_Matrix_scalar_multiplication.py": {
+            "mean": 17.6,
+            "std": 0.0154,
+            "min": 17.6,
+            "max": 17.7,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "6_Matmul_with_large_K_dimension_.py": {
+            "mean": 3.3,
+            "std": 0.0495,
+            "min": 3.1,
+            "max": 3.34,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "7_Matmul_with_small_K_dimension_.py": {
+            "mean": 14.9,
+            "std": 1.19,
+            "min": 13.3,
+            "max": 23.3,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "8_Matmul_with_irregular_shapes_.py": {
+            "mean": 21.0,
+            "std": 0.68,
+            "min": 20.5,
+            "max": 25.7,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "9_Tall_skinny_matrix_multiplication_.py": {
+            "mean": 10.9,
+            "std": 0.0388,
+            "min": 10.9,
+            "max": 11.1,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "10_3D_tensor_matrix_multiplication.py": {
+            "mean": 2.4,
+            "std": 0.0551,
+            "min": 2.25,
+            "max": 2.45,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "11_4D_tensor_matrix_multiplication.py": {
+            "mean": 22.2,
+            "std": 0.224,
+            "min": 21.8,
+            "max": 22.8,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "12_Matmul_with_diagonal_matrices_.py": {
+            "mean": 9.62,
+            "std": 0.537,
+            "min": 7.46,
+            "max": 11.6,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "13_Matmul_for_symmetric_matrices.py": {
+            "mean": 10.4,
+            "std": 0.889,
+            "min": 8.12,
+            "max": 15.0,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "14_Matmul_for_upper_triangular_matrices.py": {
+            "mean": 5.75,
+            "std": 0.0515,
+            "min": 5.71,
+            "max": 6.06,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "15_Matmul_for_lower_triangular_matrices.py": {
+            "mean": 5.71,
+            "std": 0.0196,
+            "min": 5.7,
+            "max": 5.88,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "16_Matmul_with_transposed_A.py": {
+            "mean": 8.29,
+            "std": 0.228,
+            "min": 6.84,
+            "max": 8.54,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "17_Matmul_with_transposed_B.py": {
+            "mean": 11.9,
+            "std": 0.674,
+            "min": 10.1,
+            "max": 15.5,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "18_Matmul_with_transposed_both.py": {
+            "mean": 8.78,
+            "std": 0.415,
+            "min": 7.41,
+            "max": 10.7,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "19_ReLU.py": {
+            "mean": 26.6,
+            "std": 0.0288,
+            "min": 26.5,
+            "max": 26.7,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "20_LeakyReLU.py": {
+            "mean": 26.4,
+            "std": 0.0369,
+            "min": 26.4,
+            "max": 26.5,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "21_Sigmoid.py": {
+            "mean": 26.5,
+            "std": 0.0341,
+            "min": 26.4,
+            "max": 26.6,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "22_Tanh.py": {
+            "mean": 26.6,
+            "std": 0.0275,
+            "min": 26.5,
+            "max": 26.6,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "23_Softmax.py": {
+            "mean": 51.4,
+            "std": 0.0335,
+            "min": 51.3,
+            "max": 51.5,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "24_LogSoftmax.py": {
+            "mean": 51.4,
+            "std": 0.0255,
+            "min": 51.3,
+            "max": 51.5,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "25_Swish.py": {
+            "mean": 65.9,
+            "std": 0.033,
+            "min": 65.8,
+            "max": 66.0,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "26_GELU_.py": {
+            "mean": 26.5,
+            "std": 0.0244,
+            "min": 26.5,
+            "max": 26.6,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "27_SELU_.py": {
+            "mean": 26.4,
+            "std": 0.014,
+            "min": 26.3,
+            "max": 26.4,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "28_HardSigmoid.py": {
+            "mean": 26.6,
+            "std": 0.0332,
+            "min": 26.5,
+            "max": 26.6,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "29_Softplus.py": {
+            "mean": 26.5,
+            "std": 0.0349,
+            "min": 26.5,
+            "max": 26.6,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "30_Softsign.py": {
+            "mean": 92.3,
+            "std": 0.0474,
+            "min": 92.2,
+            "max": 92.4,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "31_ELU.py": {
+            "mean": 26.4,
+            "std": 0.0291,
+            "min": 26.4,
+            "max": 26.5,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "32_HardTanh.py": {
+            "mean": 26.4,
+            "std": 0.0333,
+            "min": 26.4,
+            "max": 26.5,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "33_BatchNorm.py": {
+            "mean": 28.3,
+            "std": 0.0373,
+            "min": 28.2,
+            "max": 28.5,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "34_InstanceNorm.py": {
+            "mean": 47.4,
+            "std": 0.0383,
+            "min": 47.3,
+            "max": 47.5,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "35_GroupNorm_.py": {
+            "mean": 46.6,
+            "std": 0.0375,
+            "min": 46.5,
+            "max": 46.7,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "36_RMSNorm_.py": {
+            "mean": 80.9,
+            "std": 0.0425,
+            "min": 80.8,
+            "max": 81.0,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "37_FrobeniusNorm_.py": {
+            "mean": 45.5,
+            "std": 0.0466,
+            "min": 45.4,
+            "max": 45.7,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "38_L1Norm_.py": {
+            "mean": 88.2,
+            "std": 0.0341,
+            "min": 88.1,
+            "max": 88.3,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "39_L2Norm_.py": {
+            "mean": 53.0,
+            "std": 0.105,
+            "min": 52.9,
+            "max": 53.9,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "40_LayerNorm.py": {
+            "mean": 8.53,
+            "std": 0.0196,
+            "min": 8.52,
+            "max": 8.65,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "41_Max_Pooling_1D.py": {
+            "mean": 27.0,
+            "std": 0.0402,
+            "min": 26.9,
+            "max": 27.2,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "42_Max_Pooling_2D.py": {
+            "mean": 30.9,
+            "std": 1.31,
+            "min": 30.1,
+            "max": 40.5,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "43_Max_Pooling_3D.py": {
+            "mean": 12.9,
+            "std": 0.16,
+            "min": 12.2,
+            "max": 13.1,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "44_Average_Pooling_1D.py": {
+            "mean": 18.9,
+            "std": 0.497,
+            "min": 18.5,
+            "max": 21.5,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "45_Average_Pooling_2D.py": {
+            "mean": 44.0,
+            "std": 0.0797,
+            "min": 43.8,
+            "max": 44.2,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "46_Average_Pooling_3D.py": {
+            "mean": 18.8,
+            "std": 0.0448,
+            "min": 18.7,
+            "max": 18.9,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "47_Sum_reduction_over_a_dimension.py": {
+            "mean": 21.1,
+            "std": 0.0911,
+            "min": 20.8,
+            "max": 21.4,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "48_Mean_reduction_over_a_dimension.py": {
+            "mean": 21.0,
+            "std": 0.0635,
+            "min": 20.9,
+            "max": 21.2,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "49_Max_reduction_over_a_dimension.py": {
+            "mean": 20.1,
+            "std": 0.0694,
+            "min": 19.9,
+            "max": 20.3,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "50_conv_standard_2D__square_input__square_kernel.py": {
+            "mean": 16.1,
+            "std": 0.0699,
+            "min": 16.0,
+            "max": 16.2,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "51_Argmax_over_a_dimension.py": {
+            "mean": 20.9,
+            "std": 0.0814,
+            "min": 20.7,
+            "max": 21.1,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "52_Argmin_over_a_dimension.py": {
+            "mean": 20.9,
+            "std": 0.0777,
+            "min": 20.7,
+            "max": 21.1,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "53_Min_reduction_over_a_dimension.py": {
+            "mean": 20.9,
+            "std": 0.0826,
+            "min": 20.8,
+            "max": 21.2,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "54_conv_standard_3D__square_input__square_kernel.py": {
+            "mean": 14.4,
+            "std": 0.0301,
+            "min": 14.3,
+            "max": 14.5,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "55_conv_standard_2D__asymmetric_input__square_kernel.py": {
+            "mean": 83.4,
+            "std": 2.16,
+            "min": 81.5,
+            "max": 101.0,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "56_conv_standard_2D__asymmetric_input__asymmetric_kernel.py": {
+            "mean": 26.2,
+            "std": 1.92,
+            "min": 25.5,
+            "max": 43.0,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "57_conv_transposed_2D__square_input__square_kernel.py": {
+            "mean": 39.5,
+            "std": 0.0511,
+            "min": 39.3,
+            "max": 39.6,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "58_conv_transposed_3D__asymmetric_input__asymmetric_kernel.py": {
+            "mean": 17.4,
+            "std": 0.0441,
+            "min": 17.3,
+            "max": 17.5,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "59_conv_standard_3D__asymmetric_input__square_kernel.py": {
+            "mean": 13.3,
+            "std": 0.0313,
+            "min": 13.2,
+            "max": 13.4,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "60_conv_standard_3D__square_input__asymmetric_kernel.py": {
+            "mean": 31.5,
+            "std": 0.15,
+            "min": 31.3,
+            "max": 31.6,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "61_conv_transposed_3D__square_input__square_kernel.py": {
+            "mean": 27.5,
+            "std": 0.0604,
+            "min": 27.4,
+            "max": 27.7,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "62_conv_standard_2D__square_input__asymmetric_kernel.py": {
+            "mean": 15.1,
+            "std": 0.0803,
+            "min": 14.9,
+            "max": 15.4,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "63_conv_standard_2D__square_input__square_kernel.py": {
+            "mean": 43.2,
+            "std": 0.142,
+            "min": 43.1,
+            "max": 44.5,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "64_conv_transposed_1D.py": {
+            "mean": 32.8,
+            "std": 0.033,
+            "min": 32.7,
+            "max": 32.9,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "65_conv_transposed_2D__square_input__asymmetric_kernel.py": {
+            "mean": 16.0,
+            "std": 0.049,
+            "min": 15.9,
+            "max": 16.3,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "66_conv_standard_3D__asymmetric_input__asymmetric_kernel.py": {
+            "mean": 22.5,
+            "std": 0.0621,
+            "min": 22.4,
+            "max": 22.6,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "67_conv_standard_1D.py": {
+            "mean": 12.5,
+            "std": 0.0419,
+            "min": 12.5,
+            "max": 12.7,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "68_conv_transposed_3D__square_input__asymmetric_kernel.py": {
+            "mean": 393.0,
+            "std": 0.128,
+            "min": 393.0,
+            "max": 394.0,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "69_conv_transposed_2D__asymmetric_input__asymmetric_kernel.py": {
+            "mean": 23.3,
+            "std": 0.0538,
+            "min": 23.2,
+            "max": 23.5,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "70_conv_transposed_3D__asymmetric_input__square_kernel.py": {
+            "mean": 87.8,
+            "std": 0.0543,
+            "min": 87.6,
+            "max": 87.9,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "71_conv_transposed_2D__asymmetric_input__square_kernel.py": {
+            "mean": 9.27,
+            "std": 0.697,
+            "min": 8.88,
+            "max": 13.9,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "72_conv_transposed_3D_asymmetric_input_asymmetric_kernel___strided_padded_grouped_.py": {
+            "mean": 5.16,
+            "std": 0.0349,
+            "min": 5.14,
+            "max": 5.44,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "73_conv_transposed_3D_asymmetric_input_square_kernel__strided_padded__grouped.py": {
+            "mean": 17.8,
+            "std": 0.0655,
+            "min": 17.7,
+            "max": 18.0,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "74_conv_transposed_1D_dilated.py": {
+            "mean": 8.32,
+            "std": 0.285,
+            "min": 6.68,
+            "max": 8.93,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.py": {
+            "mean": 16.9,
+            "std": 0.306,
+            "min": 16.5,
+            "max": 18.5,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "76_conv_standard_1D_dilated_strided__.py": {
+            "mean": 57.7,
+            "std": 2.91,
+            "min": 55.0,
+            "max": 76.9,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__.py": {
+            "mean": 6.14,
+            "std": 0.0596,
+            "min": 6.05,
+            "max": 6.34,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__.py": {
+            "mean": 16.2,
+            "std": 0.143,
+            "min": 16.1,
+            "max": 17.3,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__.py": {
+            "mean": 10.9,
+            "std": 0.0716,
+            "min": 10.8,
+            "max": 11.1,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__.py": {
+            "mean": 14.7,
+            "std": 0.0183,
+            "min": 14.7,
+            "max": 14.8,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__.py": {
+            "mean": 6.15,
+            "std": 0.0244,
+            "min": 6.11,
+            "max": 6.27,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "82_conv_depthwise_2D_square_input_square_kernel.py": {
+            "mean": 8.39,
+            "std": 0.455,
+            "min": 6.91,
+            "max": 11.7,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "83_conv_depthwise_2D_square_input_asymmetric_kernel.py": {
+            "mean": 3.72,
+            "std": 0.0183,
+            "min": 3.71,
+            "max": 3.84,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "84_conv_depthwise_2D_asymmetric_input_square_kernel.py": {
+            "mean": 24.2,
+            "std": 0.0117,
+            "min": 24.2,
+            "max": 24.2,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "85_conv_depthwise_2D_asymmetric_input_asymmetric_kernel.py": {
+            "mean": 5.31,
+            "std": 0.0288,
+            "min": 5.3,
+            "max": 5.51,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "86_conv_depthwise_separable_2D.py": {
+            "mean": 13.0,
+            "std": 0.0413,
+            "min": 12.9,
+            "max": 13.1,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "87_conv_pointwise_2D.py": {
+            "mean": 27.0,
+            "std": 0.0308,
+            "min": 27.0,
+            "max": 27.2,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "88_MinGPTNewGelu.py": {
+            "mean": 9.99,
+            "std": 0.0291,
+            "min": 9.95,
+            "max": 10.1,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "89_cumsum.py": {
+            "mean": 20.3,
+            "std": 0.0552,
+            "min": 20.3,
+            "max": 20.9,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "90_cumprod.py": {
+            "mean": 19.9,
+            "std": 0.0804,
+            "min": 19.9,
+            "max": 20.3,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "91_cumsum_reverse.py": {
+            "mean": 56.1,
+            "std": 0.0691,
+            "min": 56.0,
+            "max": 56.5,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "92_cumsum_exclusive.py": {
+            "mean": 45.0,
+            "std": 0.0489,
+            "min": 44.9,
+            "max": 45.2,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "93_masked_cumsum.py": {
+            "mean": 40.5,
+            "std": 0.173,
+            "min": 40.5,
+            "max": 42.2,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "94_MSELoss.py": {
+            "mean": 52.7,
+            "std": 0.0446,
+            "min": 52.6,
+            "max": 52.8,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "95_CrossEntropyLoss.py": {
+            "mean": 3.09,
+            "std": 0.0109,
+            "min": 3.08,
+            "max": 3.19,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "96_HuberLoss.py": {
+            "mean": 34.7,
+            "std": 0.0217,
+            "min": 34.7,
+            "max": 34.8,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "97_ScaledDotProductAttention.py": {
+            "mean": 44.1,
+            "std": 0.0406,
+            "min": 44.0,
+            "max": 44.3,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "98_KLDivLoss.py": {
+            "mean": 24.3,
+            "std": 0.0332,
+            "min": 24.2,
+            "max": 24.4,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "99_TripletMarginLoss.py": {
+            "mean": 26.4,
+            "std": 0.0708,
+            "min": 26.3,
+            "max": 27.0,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "100_HingeLoss.py": {
+            "mean": 61.9,
+            "std": 0.0296,
+            "min": 61.8,
+            "max": 62.0,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        }
+    }
+}
\ No newline at end of file
diff --git a/results/timing/A10G_modal/baseline_time_torch_compile_inductor_default.json b/results/timing/A10G_modal/baseline_time_torch_compile_inductor_default.json
new file mode 100644
index 00000000..596eb088
--- /dev/null
+++ b/results/timing/A10G_modal/baseline_time_torch_compile_inductor_default.json
@@ -0,0 +1,904 @@
+{
+    "level1": {
+        "1_Square_matrix_multiplication_.py": {
+            "mean": 5.85,
+            "std": 0.0617,
+            "min": 5.55,
+            "max": 6.08,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "2_Standard_matrix_multiplication_.py": {
+            "mean": 5.93,
+            "std": 0.0839,
+            "min": 5.62,
+            "max": 6.17,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "3_Batched_matrix_multiplication.py": {
+            "mean": 22.0,
+            "std": 0.964,
+            "min": 21.2,
+            "max": 28.1,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "4_Matrix_vector_multiplication_.py": {
+            "mean": 25.5,
+            "std": 0.156,
+            "min": 25.1,
+            "max": 25.9,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "5_Matrix_scalar_multiplication.py": {
+            "mean": 17.6,
+            "std": 0.0187,
+            "min": 17.5,
+            "max": 17.7,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "6_Matmul_with_large_K_dimension_.py": {
+            "mean": 3.32,
+            "std": 0.0394,
+            "min": 3.15,
+            "max": 3.37,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "7_Matmul_with_small_K_dimension_.py": {
+            "mean": 15.1,
+            "std": 1.06,
+            "min": 13.1,
+            "max": 22.0,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "8_Matmul_with_irregular_shapes_.py": {
+            "mean": 20.6,
+            "std": 0.559,
+            "min": 20.0,
+            "max": 23.4,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "9_Tall_skinny_matrix_multiplication_.py": {
+            "mean": 11.1,
+            "std": 0.12,
+            "min": 10.9,
+            "max": 11.4,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "10_3D_tensor_matrix_multiplication.py": {
+            "mean": 2.44,
+            "std": 0.0532,
+            "min": 2.27,
+            "max": 2.58,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "11_4D_tensor_matrix_multiplication.py": {
+            "mean": 22.2,
+            "std": 0.279,
+            "min": 21.6,
+            "max": 23.4,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "12_Matmul_with_diagonal_matrices_.py": {
+            "mean": 9.59,
+            "std": 0.655,
+            "min": 7.75,
+            "max": 13.1,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "13_Matmul_for_symmetric_matrices.py": {
+            "mean": 10.4,
+            "std": 0.737,
+            "min": 8.74,
+            "max": 14.0,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "14_Matmul_for_upper_triangular_matrices.py": {
+            "mean": 5.93,
+            "std": 0.0786,
+            "min": 5.82,
+            "max": 6.16,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "15_Matmul_for_lower_triangular_matrices.py": {
+            "mean": 5.82,
+            "std": 0.0385,
+            "min": 5.81,
+            "max": 6.19,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "16_Matmul_with_transposed_A.py": {
+            "mean": 8.51,
+            "std": 0.245,
+            "min": 6.84,
+            "max": 8.95,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "17_Matmul_with_transposed_B.py": {
+            "mean": 7.43,
+            "std": 0.226,
+            "min": 6.94,
+            "max": 8.94,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "18_Matmul_with_transposed_both.py": {
+            "mean": 6.05,
+            "std": 0.167,
+            "min": 5.8,
+            "max": 7.13,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "19_ReLU.py": {
+            "mean": 26.5,
+            "std": 0.0308,
+            "min": 26.4,
+            "max": 26.6,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "20_LeakyReLU.py": {
+            "mean": 26.5,
+            "std": 0.0313,
+            "min": 26.4,
+            "max": 26.7,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "21_Sigmoid.py": {
+            "mean": 26.5,
+            "std": 0.0288,
+            "min": 26.4,
+            "max": 26.6,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "22_Tanh.py": {
+            "mean": 26.6,
+            "std": 0.516,
+            "min": 26.4,
+            "max": 30.0,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "23_Softmax.py": {
+            "mean": 52.8,
+            "std": 0.104,
+            "min": 52.6,
+            "max": 53.1,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "24_LogSoftmax.py": {
+            "mean": 52.9,
+            "std": 0.0791,
+            "min": 52.8,
+            "max": 53.2,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "25_Swish.py": {
+            "mean": 26.6,
+            "std": 0.0973,
+            "min": 26.5,
+            "max": 27.0,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "26_GELU_.py": {
+            "mean": 26.4,
+            "std": 0.104,
+            "min": 26.2,
+            "max": 26.8,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "27_SELU_.py": {
+            "mean": 26.6,
+            "std": 0.077,
+            "min": 26.4,
+            "max": 27.0,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "28_HardSigmoid.py": {
+            "mean": 26.5,
+            "std": 0.0566,
+            "min": 26.4,
+            "max": 26.7,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "29_Softplus.py": {
+            "mean": 26.2,
+            "std": 0.0483,
+            "min": 26.2,
+            "max": 26.5,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "30_Softsign.py": {
+            "mean": 26.5,
+            "std": 0.0414,
+            "min": 26.4,
+            "max": 26.8,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "31_ELU.py": {
+            "mean": 26.4,
+            "std": 0.0325,
+            "min": 26.3,
+            "max": 26.6,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "32_HardTanh.py": {
+            "mean": 26.5,
+            "std": 0.0774,
+            "min": 26.4,
+            "max": 26.8,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "33_BatchNorm.py": {
+            "mean": 26.0,
+            "std": 0.0405,
+            "min": 25.9,
+            "max": 26.2,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "34_InstanceNorm.py": {
+            "mean": 47.1,
+            "std": 0.068,
+            "min": 46.9,
+            "max": 47.2,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "35_GroupNorm_.py": {
+            "mean": 45.7,
+            "std": 0.0425,
+            "min": 45.6,
+            "max": 45.8,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "36_RMSNorm_.py": {
+            "mean": 48.9,
+            "std": 0.0773,
+            "min": 48.8,
+            "max": 49.2,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "37_FrobeniusNorm_.py": {
+            "mean": 45.8,
+            "std": 0.0738,
+            "min": 45.7,
+            "max": 46.0,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "38_L1Norm_.py": {
+            "mean": 72.5,
+            "std": 0.238,
+            "min": 72.2,
+            "max": 74.6,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "39_L2Norm_.py": {
+            "mean": 74.7,
+            "std": 0.106,
+            "min": 74.4,
+            "max": 74.9,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "40_LayerNorm.py": {
+            "mean": 2.79,
+            "std": 0.0655,
+            "min": 2.75,
+            "max": 3.1,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "41_Max_Pooling_1D.py": {
+            "mean": 31.0,
+            "std": 1.19,
+            "min": 30.4,
+            "max": 39.9,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "42_Max_Pooling_2D.py": {
+            "mean": 9.84,
+            "std": 0.21,
+            "min": 9.71,
+            "max": 10.4,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "43_Max_Pooling_3D.py": {
+            "mean": 12.9,
+            "std": 0.168,
+            "min": 12.2,
+            "max": 14.0,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "44_Average_Pooling_1D.py": {
+            "mean": 8.94,
+            "std": 0.0751,
+            "min": 8.84,
+            "max": 9.16,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "45_Average_Pooling_2D.py": {
+            "mean": 42.0,
+            "std": 0.909,
+            "min": 39.7,
+            "max": 43.1,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "46_Average_Pooling_3D.py": {
+            "mean": 19.0,
+            "std": 0.75,
+            "min": 18.8,
+            "max": 26.4,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "47_Sum_reduction_over_a_dimension.py": {
+            "mean": 22.0,
+            "std": 0.0462,
+            "min": 21.9,
+            "max": 22.1,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "48_Mean_reduction_over_a_dimension.py": {
+            "mean": 21.7,
+            "std": 0.0532,
+            "min": 21.6,
+            "max": 21.9,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "49_Max_reduction_over_a_dimension.py": {
+            "mean": 22.0,
+            "std": 0.0813,
+            "min": 21.8,
+            "max": 22.2,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "50_conv_standard_2D__square_input__square_kernel.py": {
+            "mean": 5.42,
+            "std": 0.127,
+            "min": 5.27,
+            "max": 5.76,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "51_Argmax_over_a_dimension.py": {
+            "mean": 21.5,
+            "std": 0.0593,
+            "min": 21.3,
+            "max": 21.6,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "52_Argmin_over_a_dimension.py": {
+            "mean": 21.6,
+            "std": 0.0785,
+            "min": 21.4,
+            "max": 21.8,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "53_Min_reduction_over_a_dimension.py": {
+            "mean": 21.9,
+            "std": 0.0312,
+            "min": 21.8,
+            "max": 22.0,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "54_conv_standard_3D__square_input__square_kernel.py": {
+            "mean": 11.5,
+            "std": 0.382,
+            "min": 10.0,
+            "max": 13.5,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "55_conv_standard_2D__asymmetric_input__square_kernel.py": {
+            "mean": 37.5,
+            "std": 0.0376,
+            "min": 37.4,
+            "max": 37.6,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "56_conv_standard_2D__asymmetric_input__asymmetric_kernel.py": {
+            "mean": 22.2,
+            "std": 0.195,
+            "min": 22.1,
+            "max": 23.4,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "57_conv_transposed_2D__square_input__square_kernel.py": {
+            "mean": 49.1,
+            "std": 0.0756,
+            "min": 49.0,
+            "max": 49.3,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "58_conv_transposed_3D__asymmetric_input__asymmetric_kernel.py": {
+            "mean": 17.4,
+            "std": 0.0674,
+            "min": 17.4,
+            "max": 17.7,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "59_conv_standard_3D__asymmetric_input__square_kernel.py": {
+            "mean": 13.3,
+            "std": 0.05,
+            "min": 13.3,
+            "max": 13.6,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "60_conv_standard_3D__square_input__asymmetric_kernel.py": {
+            "mean": 45.5,
+            "std": 0.107,
+            "min": 45.3,
+            "max": 45.9,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "61_conv_transposed_3D__square_input__square_kernel.py": {
+            "mean": 27.5,
+            "std": 0.093,
+            "min": 27.4,
+            "max": 27.9,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "62_conv_standard_2D__square_input__asymmetric_kernel.py": {
+            "mean": 14.5,
+            "std": 0.406,
+            "min": 13.0,
+            "max": 16.4,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "63_conv_standard_2D__square_input__square_kernel.py": {
+            "mean": 80.1,
+            "std": 0.312,
+            "min": 79.7,
+            "max": 82.2,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "64_conv_transposed_1D.py": {
+            "mean": 33.1,
+            "std": 0.0561,
+            "min": 33.0,
+            "max": 33.3,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "65_conv_transposed_2D__square_input__asymmetric_kernel.py": {
+            "mean": 18.5,
+            "std": 0.062,
+            "min": 18.4,
+            "max": 18.6,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "66_conv_standard_3D__asymmetric_input__asymmetric_kernel.py": {
+            "mean": 22.6,
+            "std": 0.124,
+            "min": 22.4,
+            "max": 22.9,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "67_conv_standard_1D.py": {
+            "mean": 12.5,
+            "std": 0.0701,
+            "min": 12.5,
+            "max": 12.8,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "68_conv_transposed_3D__square_input__asymmetric_kernel.py": {
+            "mean": 393.0,
+            "std": 0.127,
+            "min": 393.0,
+            "max": 393.0,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "69_conv_transposed_2D__asymmetric_input__asymmetric_kernel.py": {
+            "mean": 22.2,
+            "std": 0.479,
+            "min": 21.2,
+            "max": 23.8,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "70_conv_transposed_3D__asymmetric_input__square_kernel.py": {
+            "mean": 87.8,
+            "std": 0.0361,
+            "min": 87.7,
+            "max": 88.0,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "71_conv_transposed_2D__asymmetric_input__square_kernel.py": {
+            "mean": 12.8,
+            "std": 0.0219,
+            "min": 12.7,
+            "max": 12.9,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "72_conv_transposed_3D_asymmetric_input_asymmetric_kernel___strided_padded_grouped_.py": {
+            "mean": 6.62,
+            "std": 0.157,
+            "min": 5.73,
+            "max": 6.87,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "73_conv_transposed_3D_asymmetric_input_square_kernel__strided_padded__grouped.py": {
+            "mean": 17.3,
+            "std": 0.322,
+            "min": 16.5,
+            "max": 18.9,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "74_conv_transposed_1D_dilated.py": {
+            "mean": 5.79,
+            "std": 0.0773,
+            "min": 5.46,
+            "max": 5.88,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.py": {
+            "mean": 14.3,
+            "std": 0.048,
+            "min": 14.3,
+            "max": 14.6,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "76_conv_standard_1D_dilated_strided__.py": {
+            "mean": 35.6,
+            "std": 0.569,
+            "min": 34.6,
+            "max": 36.8,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__.py": {
+            "mean": 6.12,
+            "std": 0.0542,
+            "min": 6.08,
+            "max": 6.51,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__.py": {
+            "mean": 18.5,
+            "std": 0.0573,
+            "min": 18.4,
+            "max": 18.7,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__.py": {
+            "mean": 11.0,
+            "std": 0.13,
+            "min": 10.8,
+            "max": 11.7,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__.py": {
+            "mean": 15.9,
+            "std": 0.0591,
+            "min": 15.9,
+            "max": 16.3,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__.py": {
+            "mean": 6.4,
+            "std": 0.138,
+            "min": 6.28,
+            "max": 6.82,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "82_conv_depthwise_2D_square_input_square_kernel.py": {
+            "mean": 14.0,
+            "std": 0.113,
+            "min": 14.0,
+            "max": 15.1,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "83_conv_depthwise_2D_square_input_asymmetric_kernel.py": {
+            "mean": 32.8,
+            "std": 0.106,
+            "min": 32.6,
+            "max": 33.2,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "84_conv_depthwise_2D_asymmetric_input_square_kernel.py": {
+            "mean": 55.6,
+            "std": 0.0411,
+            "min": 55.6,
+            "max": 55.8,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "85_conv_depthwise_2D_asymmetric_input_asymmetric_kernel.py": {
+            "mean": 47.9,
+            "std": 0.31,
+            "min": 47.2,
+            "max": 48.8,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "86_conv_depthwise_separable_2D.py": {
+            "mean": 25.3,
+            "std": 0.0247,
+            "min": 25.3,
+            "max": 25.4,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "87_conv_pointwise_2D.py": {
+            "mean": 83.6,
+            "std": 0.0593,
+            "min": 83.5,
+            "max": 83.9,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "88_MinGPTNewGelu.py": {
+            "mean": 1.16,
+            "std": 0.0281,
+            "min": 1.14,
+            "max": 1.29,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "89_cumsum.py": {
+            "mean": 17.7,
+            "std": 0.0877,
+            "min": 17.6,
+            "max": 18.0,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "90_cumprod.py": {
+            "mean": 17.8,
+            "std": 0.0928,
+            "min": 17.7,
+            "max": 18.1,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "91_cumsum_reverse.py": {
+            "mean": 35.2,
+            "std": 0.0892,
+            "min": 35.1,
+            "max": 35.4,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "92_cumsum_exclusive.py": {
+            "mean": 17.7,
+            "std": 0.0365,
+            "min": 17.7,
+            "max": 18.1,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "93_masked_cumsum.py": {
+            "mean": 19.9,
+            "std": 0.0254,
+            "min": 19.9,
+            "max": 20.1,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "94_MSELoss.py": {
+            "mean": 17.2,
+            "std": 0.207,
+            "min": 17.0,
+            "max": 18.6,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "95_CrossEntropyLoss.py": {
+            "mean": 1.13,
+            "std": 0.00513,
+            "min": 1.12,
+            "max": 1.15,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "96_HuberLoss.py": {
+            "mean": 17.1,
+            "std": 0.0508,
+            "min": 17.1,
+            "max": 17.3,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        },
+        "97_ScaledDotProductAttention.py": {
+            "mean": 44.6,
+            "std": 1.99,
+            "min": 44.2,
+            "max": 64.3,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "98_KLDivLoss.py": {
+            "mean": 4.21,
+            "std": 0.0217,
+            "min": 4.2,
+            "max": 4.32,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "99_TripletMarginLoss.py": {
+            "mean": 6.35,
+            "std": 0.00473,
+            "min": 6.34,
+            "max": 6.36,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10G",
+            "device": "cuda:0"
+        },
+        "100_HingeLoss.py": {
+            "mean": 8.55,
+            "std": 0.505,
+            "min": 8.4,
+            "max": 13.5,
+            "num_trials": 100,
+            "hardware": "NVIDIA A10",
+            "device": "cuda:0"
+        }
+    }
+}
\ No newline at end of file
diff --git a/results/timing/H100_modal/baseline_time_torch.json b/results/timing/H100_modal/baseline_time_torch.json
new file mode 100644
index 00000000..5bdcd393
--- /dev/null
+++ b/results/timing/H100_modal/baseline_time_torch.json
@@ -0,0 +1,904 @@
+{
+    "level1": {
+        "1_Square_matrix_multiplication_.py": {
+            "mean": 2.66,
+            "std": 0.00178,
+            "min": 2.66,
+            "max": 2.67,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "2_Standard_matrix_multiplication_.py": {
+            "mean": 2.64,
+            "std": 0.0039,
+            "min": 2.64,
+            "max": 2.67,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "3_Batched_matrix_multiplication.py": {
+            "mean": 5.34,
+            "std": 0.00552,
+            "min": 5.33,
+            "max": 5.38,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "4_Matrix_vector_multiplication_.py": {
+            "mean": 2.78,
+            "std": 0.00237,
+            "min": 2.78,
+            "max": 2.79,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "5_Matrix_scalar_multiplication.py": {
+            "mean": 2.84,
+            "std": 0.00653,
+            "min": 2.83,
+            "max": 2.87,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "6_Matmul_with_large_K_dimension_.py": {
+            "mean": 1.32,
+            "std": 0.00464,
+            "min": 1.31,
+            "max": 1.34,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "7_Matmul_with_small_K_dimension_.py": {
+            "mean": 4.12,
+            "std": 0.00954,
+            "min": 4.11,
+            "max": 4.21,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "8_Matmul_with_irregular_shapes_.py": {
+            "mean": 6.42,
+            "std": 0.00544,
+            "min": 6.41,
+            "max": 6.45,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "9_Tall_skinny_matrix_multiplication_.py": {
+            "mean": 2.61,
+            "std": 0.00374,
+            "min": 2.61,
+            "max": 2.63,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "10_3D_tensor_matrix_multiplication.py": {
+            "mean": 1.05,
+            "std": 0.00159,
+            "min": 1.04,
+            "max": 1.06,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "11_4D_tensor_matrix_multiplication.py": {
+            "mean": 11.1,
+            "std": 0.957,
+            "min": 10.1,
+            "max": 13.6,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "12_Matmul_with_diagonal_matrices_.py": {
+            "mean": 2.69,
+            "std": 0.00425,
+            "min": 2.68,
+            "max": 2.7,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "13_Matmul_for_symmetric_matrices.py": {
+            "mean": 2.65,
+            "std": 0.0045,
+            "min": 2.65,
+            "max": 2.67,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "14_Matmul_for_upper_triangular_matrices.py": {
+            "mean": 2.71,
+            "std": 0.00376,
+            "min": 2.7,
+            "max": 2.73,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "15_Matmul_for_lower_triangular_matrices.py": {
+            "mean": 2.71,
+            "std": 0.00182,
+            "min": 2.71,
+            "max": 2.72,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "16_Matmul_with_transposed_A.py": {
+            "mean": 2.62,
+            "std": 0.00207,
+            "min": 2.61,
+            "max": 2.62,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "17_Matmul_with_transposed_B.py": {
+            "mean": 2.74,
+            "std": 0.00801,
+            "min": 2.71,
+            "max": 2.76,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "18_Matmul_with_transposed_both.py": {
+            "mean": 2.78,
+            "std": 0.00828,
+            "min": 2.76,
+            "max": 2.81,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "19_ReLU.py": {
+            "mean": 4.27,
+            "std": 0.00845,
+            "min": 4.26,
+            "max": 4.35,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "20_LeakyReLU.py": {
+            "mean": 4.27,
+            "std": 0.00191,
+            "min": 4.26,
+            "max": 4.27,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "21_Sigmoid.py": {
+            "mean": 4.26,
+            "std": 0.00198,
+            "min": 4.26,
+            "max": 4.27,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "22_Tanh.py": {
+            "mean": 3.05,
+            "std": 0.00172,
+            "min": 3.04,
+            "max": 3.05,
+            "num_trials": 100,
+            "hardware": "NVIDIA H200",
+            "device": "cuda:0"
+        },
+        "23_Softmax.py": {
+            "mean": 7.12,
+            "std": 0.0142,
+            "min": 7.11,
+            "max": 7.18,
+            "num_trials": 100,
+            "hardware": "NVIDIA H200",
+            "device": "cuda:0"
+        },
+        "24_LogSoftmax.py": {
+            "mean": 6.18,
+            "std": 0.0645,
+            "min": 6.06,
+            "max": 6.33,
+            "num_trials": 100,
+            "hardware": "NVIDIA H200",
+            "device": "cuda:0"
+        },
+        "25_Swish.py": {
+            "mean": 10.6,
+            "std": 0.00389,
+            "min": 10.6,
+            "max": 10.6,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "26_GELU_.py": {
+            "mean": 4.24,
+            "std": 0.00177,
+            "min": 4.23,
+            "max": 4.24,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "27_SELU_.py": {
+            "mean": 3.02,
+            "std": 0.00174,
+            "min": 3.02,
+            "max": 3.03,
+            "num_trials": 100,
+            "hardware": "NVIDIA H200",
+            "device": "cuda:0"
+        },
+        "28_HardSigmoid.py": {
+            "mean": 4.26,
+            "std": 0.00202,
+            "min": 4.26,
+            "max": 4.27,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "29_Softplus.py": {
+            "mean": 4.23,
+            "std": 0.00216,
+            "min": 4.23,
+            "max": 4.24,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "30_Softsign.py": {
+            "mean": 10.4,
+            "std": 0.00375,
+            "min": 10.4,
+            "max": 10.4,
+            "num_trials": 100,
+            "hardware": "NVIDIA H200",
+            "device": "cuda:0"
+        },
+        "31_ELU.py": {
+            "mean": 4.24,
+            "std": 0.00229,
+            "min": 4.24,
+            "max": 4.25,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "32_HardTanh.py": {
+            "mean": 4.24,
+            "std": 0.00168,
+            "min": 4.23,
+            "max": 4.24,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "33_BatchNorm.py": {
+            "mean": 8.8,
+            "std": 0.016,
+            "min": 8.77,
+            "max": 8.85,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "34_InstanceNorm.py": {
+            "mean": 9.57,
+            "std": 0.0119,
+            "min": 9.55,
+            "max": 9.6,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "35_GroupNorm_.py": {
+            "mean": 9.94,
+            "std": 0.0103,
+            "min": 9.93,
+            "max": 10.0,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "36_RMSNorm_.py": {
+            "mean": 14.2,
+            "std": 0.00266,
+            "min": 14.2,
+            "max": 14.2,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "37_FrobeniusNorm_.py": {
+            "mean": 8.42,
+            "std": 0.00264,
+            "min": 8.41,
+            "max": 8.42,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "38_L1Norm_.py": {
+            "mean": 15.5,
+            "std": 0.00742,
+            "min": 15.5,
+            "max": 15.5,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "39_L2Norm_.py": {
+            "mean": 10.0,
+            "std": 0.0024,
+            "min": 10.0,
+            "max": 10.0,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "40_LayerNorm.py": {
+            "mean": 8.12,
+            "std": 0.00649,
+            "min": 8.11,
+            "max": 8.16,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "41_Max_Pooling_1D.py": {
+            "mean": 10.7,
+            "std": 0.00997,
+            "min": 10.7,
+            "max": 10.7,
+            "num_trials": 100,
+            "hardware": "NVIDIA H200",
+            "device": "cuda:0"
+        },
+        "42_Max_Pooling_2D.py": {
+            "mean": 10.7,
+            "std": 0.00823,
+            "min": 10.7,
+            "max": 10.8,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "43_Max_Pooling_3D.py": {
+            "mean": 3.93,
+            "std": 0.00239,
+            "min": 3.93,
+            "max": 3.94,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "44_Average_Pooling_1D.py": {
+            "mean": 8.02,
+            "std": 0.0057,
+            "min": 8.01,
+            "max": 8.06,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "45_Average_Pooling_2D.py": {
+            "mean": 6.6,
+            "std": 0.0259,
+            "min": 6.55,
+            "max": 6.7,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "46_Average_Pooling_3D.py": {
+            "mean": 8.66,
+            "std": 0.00616,
+            "min": 8.65,
+            "max": 8.69,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "47_Sum_reduction_over_a_dimension.py": {
+            "mean": 2.11,
+            "std": 0.0154,
+            "min": 2.08,
+            "max": 2.16,
+            "num_trials": 100,
+            "hardware": "NVIDIA H200",
+            "device": "cuda:0"
+        },
+        "48_Mean_reduction_over_a_dimension.py": {
+            "mean": 2.89,
+            "std": 0.0163,
+            "min": 2.86,
+            "max": 2.94,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "49_Max_reduction_over_a_dimension.py": {
+            "mean": 3.17,
+            "std": 0.00295,
+            "min": 3.16,
+            "max": 3.17,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "50_conv_standard_2D__square_input__square_kernel.py": {
+            "mean": 2.09,
+            "std": 0.0172,
+            "min": 2.08,
+            "max": 2.15,
+            "num_trials": 100,
+            "hardware": "NVIDIA H200",
+            "device": "cuda:0"
+        },
+        "51_Argmax_over_a_dimension.py": {
+            "mean": 3.25,
+            "std": 0.00328,
+            "min": 3.25,
+            "max": 3.27,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "52_Argmin_over_a_dimension.py": {
+            "mean": 3.24,
+            "std": 0.00269,
+            "min": 3.24,
+            "max": 3.25,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "53_Min_reduction_over_a_dimension.py": {
+            "mean": 3.18,
+            "std": 0.00328,
+            "min": 3.17,
+            "max": 3.19,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "54_conv_standard_3D__square_input__square_kernel.py": {
+            "mean": 1.36,
+            "std": 0.00258,
+            "min": 1.36,
+            "max": 1.38,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "55_conv_standard_2D__asymmetric_input__square_kernel.py": {
+            "mean": 4.18,
+            "std": 0.0626,
+            "min": 4.0,
+            "max": 4.27,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "56_conv_standard_2D__asymmetric_input__asymmetric_kernel.py": {
+            "mean": 3.44,
+            "std": 0.046,
+            "min": 3.37,
+            "max": 3.54,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "57_conv_transposed_2D__square_input__square_kernel.py": {
+            "mean": 6.56,
+            "std": 0.0393,
+            "min": 6.53,
+            "max": 6.86,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "58_conv_transposed_3D__asymmetric_input__asymmetric_kernel.py": {
+            "mean": 2.32,
+            "std": 0.0145,
+            "min": 2.29,
+            "max": 2.37,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "59_conv_standard_3D__asymmetric_input__square_kernel.py": {
+            "mean": 2.09,
+            "std": 0.0049,
+            "min": 2.08,
+            "max": 2.11,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "60_conv_standard_3D__square_input__asymmetric_kernel.py": {
+            "mean": 5.29,
+            "std": 0.0129,
+            "min": 5.26,
+            "max": 5.31,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "61_conv_transposed_3D__square_input__square_kernel.py": {
+            "mean": 5.5,
+            "std": 0.011,
+            "min": 5.48,
+            "max": 5.58,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "62_conv_standard_2D__square_input__asymmetric_kernel.py": {
+            "mean": 3.64,
+            "std": 0.0747,
+            "min": 3.56,
+            "max": 3.85,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "63_conv_standard_2D__square_input__square_kernel.py": {
+            "mean": 7.05,
+            "std": 0.0139,
+            "min": 7.03,
+            "max": 7.11,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "64_conv_transposed_1D.py": {
+            "mean": 5.28,
+            "std": 0.0103,
+            "min": 5.25,
+            "max": 5.31,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "65_conv_transposed_2D__square_input__asymmetric_kernel.py": {
+            "mean": 2.71,
+            "std": 0.0129,
+            "min": 2.68,
+            "max": 2.75,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "66_conv_standard_3D__asymmetric_input__asymmetric_kernel.py": {
+            "mean": 2.6,
+            "std": 0.00198,
+            "min": 2.6,
+            "max": 2.61,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "67_conv_standard_1D.py": {
+            "mean": 2.66,
+            "std": 0.0156,
+            "min": 2.63,
+            "max": 2.68,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "68_conv_transposed_3D__square_input__asymmetric_kernel.py": {
+            "mean": 9.54,
+            "std": 0.0113,
+            "min": 9.52,
+            "max": 9.61,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "69_conv_transposed_2D__asymmetric_input__asymmetric_kernel.py": {
+            "mean": 2.75,
+            "std": 0.0209,
+            "min": 2.72,
+            "max": 2.81,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "70_conv_transposed_3D__asymmetric_input__square_kernel.py": {
+            "mean": 9.85,
+            "std": 0.0364,
+            "min": 9.83,
+            "max": 10.1,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "71_conv_transposed_2D__asymmetric_input__square_kernel.py": {
+            "mean": 1.59,
+            "std": 0.00375,
+            "min": 1.58,
+            "max": 1.6,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "72_conv_transposed_3D_asymmetric_input_asymmetric_kernel___strided_padded_grouped_.py": {
+            "mean": 2.89,
+            "std": 0.00619,
+            "min": 2.88,
+            "max": 2.91,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "73_conv_transposed_3D_asymmetric_input_square_kernel__strided_padded__grouped.py": {
+            "mean": 2.08,
+            "std": 0.00526,
+            "min": 2.07,
+            "max": 2.09,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "74_conv_transposed_1D_dilated.py": {
+            "mean": 1.89,
+            "std": 0.017,
+            "min": 1.87,
+            "max": 2.03,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.py": {
+            "mean": 6.68,
+            "std": 0.00872,
+            "min": 6.67,
+            "max": 6.75,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "76_conv_standard_1D_dilated_strided__.py": {
+            "mean": 12.2,
+            "std": 0.0506,
+            "min": 12.2,
+            "max": 12.4,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__.py": {
+            "mean": 1.95,
+            "std": 0.0152,
+            "min": 1.91,
+            "max": 1.99,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__.py": {
+            "mean": 2.42,
+            "std": 0.00491,
+            "min": 2.41,
+            "max": 2.43,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__.py": {
+            "mean": 1.93,
+            "std": 0.00786,
+            "min": 1.92,
+            "max": 1.95,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__.py": {
+            "mean": 3.53,
+            "std": 0.00861,
+            "min": 3.52,
+            "max": 3.56,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__.py": {
+            "mean": 1.81,
+            "std": 0.0117,
+            "min": 1.78,
+            "max": 1.83,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "82_conv_depthwise_2D_square_input_square_kernel.py": {
+            "mean": 2.54,
+            "std": 0.00128,
+            "min": 2.54,
+            "max": 2.54,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "83_conv_depthwise_2D_square_input_asymmetric_kernel.py": {
+            "mean": 1.47,
+            "std": 0.00158,
+            "min": 1.47,
+            "max": 1.48,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "84_conv_depthwise_2D_asymmetric_input_square_kernel.py": {
+            "mean": 10.1,
+            "std": 0.00491,
+            "min": 10.1,
+            "max": 10.2,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "85_conv_depthwise_2D_asymmetric_input_asymmetric_kernel.py": {
+            "mean": 2.31,
+            "std": 0.00502,
+            "min": 2.31,
+            "max": 2.34,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "86_conv_depthwise_separable_2D.py": {
+            "mean": 3.7,
+            "std": 0.013,
+            "min": 3.67,
+            "max": 3.72,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "87_conv_pointwise_2D.py": {
+            "mean": 4.67,
+            "std": 0.00559,
+            "min": 4.66,
+            "max": 4.69,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "88_MinGPTNewGelu.py": {
+            "mean": 1.61,
+            "std": 0.00118,
+            "min": 1.6,
+            "max": 1.61,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "89_cumsum.py": {
+            "mean": 4.65,
+            "std": 0.00818,
+            "min": 4.64,
+            "max": 4.67,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "90_cumprod.py": {
+            "mean": 4.64,
+            "std": 0.00386,
+            "min": 4.63,
+            "max": 4.65,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "91_cumsum_reverse.py": {
+            "mean": 11.4,
+            "std": 0.0112,
+            "min": 11.4,
+            "max": 11.4,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "92_cumsum_exclusive.py": {
+            "mean": 8.86,
+            "std": 0.0191,
+            "min": 8.83,
+            "max": 8.93,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "93_masked_cumsum.py": {
+            "mean": 8.67,
+            "std": 0.00487,
+            "min": 8.66,
+            "max": 8.7,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "94_MSELoss.py": {
+            "mean": 8.43,
+            "std": 0.00298,
+            "min": 8.42,
+            "max": 8.44,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "95_CrossEntropyLoss.py": {
+            "mean": 1.45,
+            "std": 0.0022,
+            "min": 1.44,
+            "max": 1.45,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "96_HuberLoss.py": {
+            "mean": 5.52,
+            "std": 0.00201,
+            "min": 5.51,
+            "max": 5.53,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "97_ScaledDotProductAttention.py": {
+            "mean": 8.23,
+            "std": 0.193,
+            "min": 8.01,
+            "max": 9.38,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "98_KLDivLoss.py": {
+            "mean": 3.89,
+            "std": 0.00194,
+            "min": 3.89,
+            "max": 3.9,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "99_TripletMarginLoss.py": {
+            "mean": 4.25,
+            "std": 0.0129,
+            "min": 4.24,
+            "max": 4.37,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "100_HingeLoss.py": {
+            "mean": 10.4,
+            "std": 0.00488,
+            "min": 10.4,
+            "max": 10.5,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        }
+    }
+}
\ No newline at end of file
diff --git a/results/timing/H100_modal/baseline_time_torch_compile_inductor_default.json b/results/timing/H100_modal/baseline_time_torch_compile_inductor_default.json
new file mode 100644
index 00000000..ee1fb338
--- /dev/null
+++ b/results/timing/H100_modal/baseline_time_torch_compile_inductor_default.json
@@ -0,0 +1,904 @@
+{
+    "level1": {
+        "1_Square_matrix_multiplication_.py": {
+            "mean": 2.66,
+            "std": 0.00503,
+            "min": 2.66,
+            "max": 2.68,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "2_Standard_matrix_multiplication_.py": {
+            "mean": 2.67,
+            "std": 0.00828,
+            "min": 2.65,
+            "max": 2.68,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "3_Batched_matrix_multiplication.py": {
+            "mean": 5.32,
+            "std": 0.0181,
+            "min": 5.3,
+            "max": 5.37,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "4_Matrix_vector_multiplication_.py": {
+            "mean": 2.9,
+            "std": 0.00233,
+            "min": 2.9,
+            "max": 2.91,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "5_Matrix_scalar_multiplication.py": {
+            "mean": 2.88,
+            "std": 0.0132,
+            "min": 2.86,
+            "max": 2.98,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "6_Matmul_with_large_K_dimension_.py": {
+            "mean": 1.33,
+            "std": 0.00488,
+            "min": 1.33,
+            "max": 1.35,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "7_Matmul_with_small_K_dimension_.py": {
+            "mean": 4.14,
+            "std": 0.0273,
+            "min": 4.12,
+            "max": 4.35,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "8_Matmul_with_irregular_shapes_.py": {
+            "mean": 6.44,
+            "std": 0.00478,
+            "min": 6.43,
+            "max": 6.46,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "9_Tall_skinny_matrix_multiplication_.py": {
+            "mean": 2.63,
+            "std": 0.0036,
+            "min": 2.63,
+            "max": 2.65,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "10_3D_tensor_matrix_multiplication.py": {
+            "mean": 1.07,
+            "std": 0.00272,
+            "min": 1.06,
+            "max": 1.08,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "11_4D_tensor_matrix_multiplication.py": {
+            "mean": 10.8,
+            "std": 1.01,
+            "min": 10.0,
+            "max": 13.2,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "12_Matmul_with_diagonal_matrices_.py": {
+            "mean": 2.69,
+            "std": 0.0112,
+            "min": 2.68,
+            "max": 2.72,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "13_Matmul_for_symmetric_matrices.py": {
+            "mean": 2.66,
+            "std": 0.00314,
+            "min": 2.66,
+            "max": 2.68,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "14_Matmul_for_upper_triangular_matrices.py": {
+            "mean": 2.72,
+            "std": 0.0045,
+            "min": 2.72,
+            "max": 2.75,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "15_Matmul_for_lower_triangular_matrices.py": {
+            "mean": 2.73,
+            "std": 0.005,
+            "min": 2.72,
+            "max": 2.75,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "16_Matmul_with_transposed_A.py": {
+            "mean": 2.65,
+            "std": 0.00467,
+            "min": 2.64,
+            "max": 2.67,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "17_Matmul_with_transposed_B.py": {
+            "mean": 2.77,
+            "std": 0.0083,
+            "min": 2.76,
+            "max": 2.8,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "18_Matmul_with_transposed_both.py": {
+            "mean": 2.79,
+            "std": 0.0101,
+            "min": 2.77,
+            "max": 2.82,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "19_ReLU.py": {
+            "mean": 4.31,
+            "std": 0.00483,
+            "min": 4.29,
+            "max": 4.33,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "20_LeakyReLU.py": {
+            "mean": 4.29,
+            "std": 0.00853,
+            "min": 4.28,
+            "max": 4.33,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "21_Sigmoid.py": {
+            "mean": 4.29,
+            "std": 0.0126,
+            "min": 4.27,
+            "max": 4.39,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "22_Tanh.py": {
+            "mean": 3.07,
+            "std": 0.0213,
+            "min": 3.06,
+            "max": 3.27,
+            "num_trials": 100,
+            "hardware": "NVIDIA H200",
+            "device": "cuda:0"
+        },
+        "23_Softmax.py": {
+            "mean": 8.68,
+            "std": 0.0881,
+            "min": 8.65,
+            "max": 9.55,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "24_LogSoftmax.py": {
+            "mean": 8.65,
+            "std": 0.0181,
+            "min": 8.63,
+            "max": 8.74,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "25_Swish.py": {
+            "mean": 4.28,
+            "std": 0.00737,
+            "min": 4.27,
+            "max": 4.32,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "26_GELU_.py": {
+            "mean": 4.27,
+            "std": 0.00737,
+            "min": 4.26,
+            "max": 4.29,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "27_SELU_.py": {
+            "mean": 4.27,
+            "std": 0.0127,
+            "min": 4.26,
+            "max": 4.37,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "28_HardSigmoid.py": {
+            "mean": 4.29,
+            "std": 0.03,
+            "min": 4.27,
+            "max": 4.58,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "29_Softplus.py": {
+            "mean": 4.29,
+            "std": 0.0411,
+            "min": 4.27,
+            "max": 4.58,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "30_Softsign.py": {
+            "mean": 3.06,
+            "std": 0.00498,
+            "min": 3.05,
+            "max": 3.08,
+            "num_trials": 100,
+            "hardware": "NVIDIA H200",
+            "device": "cuda:0"
+        },
+        "31_ELU.py": {
+            "mean": 4.28,
+            "std": 0.0248,
+            "min": 4.27,
+            "max": 4.52,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "32_HardTanh.py": {
+            "mean": 4.29,
+            "std": 0.0059,
+            "min": 4.28,
+            "max": 4.31,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "33_BatchNorm.py": {
+            "mean": 4.24,
+            "std": 0.00629,
+            "min": 4.22,
+            "max": 4.26,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "34_InstanceNorm.py": {
+            "mean": 7.66,
+            "std": 0.027,
+            "min": 7.64,
+            "max": 7.9,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "35_GroupNorm_.py": {
+            "mean": 7.49,
+            "std": 0.00591,
+            "min": 7.48,
+            "max": 7.51,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "36_RMSNorm_.py": {
+            "mean": 7.8,
+            "std": 0.00398,
+            "min": 7.8,
+            "max": 7.82,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "37_FrobeniusNorm_.py": {
+            "mean": 7.32,
+            "std": 0.0091,
+            "min": 7.31,
+            "max": 7.36,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "38_L1Norm_.py": {
+            "mean": 13.0,
+            "std": 0.0182,
+            "min": 13.0,
+            "max": 13.1,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "39_L2Norm_.py": {
+            "mean": 13.4,
+            "std": 0.029,
+            "min": 13.3,
+            "max": 13.6,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "40_LayerNorm.py": {
+            "mean": 0.476,
+            "std": 0.0031,
+            "min": 0.472,
+            "max": 0.491,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "41_Max_Pooling_1D.py": {
+            "mean": 10.7,
+            "std": 0.0108,
+            "min": 10.7,
+            "max": 10.8,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "42_Max_Pooling_2D.py": {
+            "mean": 4.45,
+            "std": 0.00646,
+            "min": 4.44,
+            "max": 4.49,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "43_Max_Pooling_3D.py": {
+            "mean": 3.95,
+            "std": 0.00396,
+            "min": 3.94,
+            "max": 3.97,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "44_Average_Pooling_1D.py": {
+            "mean": 1.89,
+            "std": 0.00262,
+            "min": 1.88,
+            "max": 1.9,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "45_Average_Pooling_2D.py": {
+            "mean": 6.43,
+            "std": 0.0493,
+            "min": 6.36,
+            "max": 6.79,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "46_Average_Pooling_3D.py": {
+            "mean": 8.71,
+            "std": 0.0143,
+            "min": 8.69,
+            "max": 8.81,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "47_Sum_reduction_over_a_dimension.py": {
+            "mean": 3.5,
+            "std": 0.0161,
+            "min": 3.45,
+            "max": 3.54,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "48_Mean_reduction_over_a_dimension.py": {
+            "mean": 3.39,
+            "std": 0.0227,
+            "min": 3.34,
+            "max": 3.55,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "49_Max_reduction_over_a_dimension.py": {
+            "mean": 3.49,
+            "std": 0.0149,
+            "min": 3.45,
+            "max": 3.56,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "50_conv_standard_2D__square_input__square_kernel.py": {
+            "mean": 1.66,
+            "std": 0.0152,
+            "min": 1.64,
+            "max": 1.72,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "51_Argmax_over_a_dimension.py": {
+            "mean": 2.94,
+            "std": 0.00652,
+            "min": 2.93,
+            "max": 2.96,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "52_Argmin_over_a_dimension.py": {
+            "mean": 3.07,
+            "std": 0.00532,
+            "min": 3.06,
+            "max": 3.09,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "53_Min_reduction_over_a_dimension.py": {
+            "mean": 3.48,
+            "std": 0.0141,
+            "min": 3.44,
+            "max": 3.52,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "54_conv_standard_3D__square_input__square_kernel.py": {
+            "mean": 1.4,
+            "std": 0.00443,
+            "min": 1.39,
+            "max": 1.42,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "55_conv_standard_2D__asymmetric_input__square_kernel.py": {
+            "mean": 4.63,
+            "std": 0.006,
+            "min": 4.61,
+            "max": 4.64,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "56_conv_standard_2D__asymmetric_input__asymmetric_kernel.py": {
+            "mean": 3.68,
+            "std": 0.037,
+            "min": 3.62,
+            "max": 3.76,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "57_conv_transposed_2D__square_input__square_kernel.py": {
+            "mean": 6.56,
+            "std": 0.00741,
+            "min": 6.54,
+            "max": 6.58,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "58_conv_transposed_3D__asymmetric_input__asymmetric_kernel.py": {
+            "mean": 2.32,
+            "std": 0.0159,
+            "min": 2.29,
+            "max": 2.37,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "59_conv_standard_3D__asymmetric_input__square_kernel.py": {
+            "mean": 2.11,
+            "std": 0.00394,
+            "min": 2.1,
+            "max": 2.13,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "60_conv_standard_3D__square_input__asymmetric_kernel.py": {
+            "mean": 5.33,
+            "std": 0.0602,
+            "min": 5.32,
+            "max": 5.79,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "61_conv_transposed_3D__square_input__square_kernel.py": {
+            "mean": 5.51,
+            "std": 0.0104,
+            "min": 5.48,
+            "max": 5.54,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "62_conv_standard_2D__square_input__asymmetric_kernel.py": {
+            "mean": 2.68,
+            "std": 0.0374,
+            "min": 2.66,
+            "max": 3.0,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "63_conv_standard_2D__square_input__square_kernel.py": {
+            "mean": 13.9,
+            "std": 0.0149,
+            "min": 13.9,
+            "max": 14.0,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "64_conv_transposed_1D.py": {
+            "mean": 5.32,
+            "std": 0.0562,
+            "min": 5.29,
+            "max": 5.72,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "65_conv_transposed_2D__square_input__asymmetric_kernel.py": {
+            "mean": 2.72,
+            "std": 0.0112,
+            "min": 2.68,
+            "max": 2.74,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "66_conv_standard_3D__asymmetric_input__asymmetric_kernel.py": {
+            "mean": 2.65,
+            "std": 0.0043,
+            "min": 2.65,
+            "max": 2.68,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "67_conv_standard_1D.py": {
+            "mean": 2.69,
+            "std": 0.0361,
+            "min": 2.65,
+            "max": 3.01,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "68_conv_transposed_3D__square_input__asymmetric_kernel.py": {
+            "mean": 9.55,
+            "std": 0.00915,
+            "min": 9.53,
+            "max": 9.58,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "69_conv_transposed_2D__asymmetric_input__asymmetric_kernel.py": {
+            "mean": 2.74,
+            "std": 0.0113,
+            "min": 2.72,
+            "max": 2.78,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "70_conv_transposed_3D__asymmetric_input__square_kernel.py": {
+            "mean": 10.0,
+            "std": 0.0098,
+            "min": 9.98,
+            "max": 10.0,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "71_conv_transposed_2D__asymmetric_input__square_kernel.py": {
+            "mean": 1.59,
+            "std": 0.00411,
+            "min": 1.58,
+            "max": 1.6,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "72_conv_transposed_3D_asymmetric_input_asymmetric_kernel___strided_padded_grouped_.py": {
+            "mean": 2.93,
+            "std": 0.00823,
+            "min": 2.92,
+            "max": 2.96,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "73_conv_transposed_3D_asymmetric_input_square_kernel__strided_padded__grouped.py": {
+            "mean": 2.07,
+            "std": 0.00509,
+            "min": 2.06,
+            "max": 2.09,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "74_conv_transposed_1D_dilated.py": {
+            "mean": 1.93,
+            "std": 0.00876,
+            "min": 1.91,
+            "max": 1.95,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.py": {
+            "mean": 6.6,
+            "std": 0.0337,
+            "min": 6.58,
+            "max": 6.91,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "76_conv_standard_1D_dilated_strided__.py": {
+            "mean": 12.4,
+            "std": 0.0826,
+            "min": 12.4,
+            "max": 12.7,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__.py": {
+            "mean": 1.98,
+            "std": 0.0146,
+            "min": 1.95,
+            "max": 2.03,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__.py": {
+            "mean": 2.37,
+            "std": 0.0217,
+            "min": 2.35,
+            "max": 2.55,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__.py": {
+            "mean": 1.93,
+            "std": 0.00781,
+            "min": 1.92,
+            "max": 1.95,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__.py": {
+            "mean": 2.65,
+            "std": 0.0159,
+            "min": 2.63,
+            "max": 2.68,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__.py": {
+            "mean": 1.71,
+            "std": 0.00818,
+            "min": 1.7,
+            "max": 1.76,
+            "num_trials": 100,
+            "hardware": "NVIDIA H200",
+            "device": "cuda:0"
+        },
+        "82_conv_depthwise_2D_square_input_square_kernel.py": {
+            "mean": 2.57,
+            "std": 0.0708,
+            "min": 2.53,
+            "max": 3.09,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "83_conv_depthwise_2D_square_input_asymmetric_kernel.py": {
+            "mean": 19.0,
+            "std": 0.273,
+            "min": 18.8,
+            "max": 20.3,
+            "num_trials": 100,
+            "hardware": "NVIDIA H200",
+            "device": "cuda:0"
+        },
+        "84_conv_depthwise_2D_asymmetric_input_square_kernel.py": {
+            "mean": 9.79,
+            "std": 0.00959,
+            "min": 9.77,
+            "max": 9.81,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "85_conv_depthwise_2D_asymmetric_input_asymmetric_kernel.py": {
+            "mean": 13.9,
+            "std": 0.00974,
+            "min": 13.9,
+            "max": 14.0,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "86_conv_depthwise_separable_2D.py": {
+            "mean": 3.41,
+            "std": 0.28,
+            "min": 3.31,
+            "max": 5.34,
+            "num_trials": 100,
+            "hardware": "NVIDIA H200",
+            "device": "cuda:0"
+        },
+        "87_conv_pointwise_2D.py": {
+            "mean": 10.6,
+            "std": 0.098,
+            "min": 10.5,
+            "max": 10.9,
+            "num_trials": 100,
+            "hardware": "NVIDIA H200",
+            "device": "cuda:0"
+        },
+        "88_MinGPTNewGelu.py": {
+            "mean": 0.161,
+            "std": 0.00496,
+            "min": 0.154,
+            "max": 0.178,
+            "num_trials": 100,
+            "hardware": "NVIDIA H200",
+            "device": "cuda:0"
+        },
+        "89_cumsum.py": {
+            "mean": 2.69,
+            "std": 0.0867,
+            "min": 2.66,
+            "max": 3.45,
+            "num_trials": 100,
+            "hardware": "NVIDIA H200",
+            "device": "cuda:0"
+        },
+        "90_cumprod.py": {
+            "mean": 2.64,
+            "std": 0.0149,
+            "min": 2.63,
+            "max": 2.73,
+            "num_trials": 100,
+            "hardware": "NVIDIA H200",
+            "device": "cuda:0"
+        },
+        "91_cumsum_reverse.py": {
+            "mean": 5.87,
+            "std": 0.00331,
+            "min": 5.86,
+            "max": 5.88,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "92_cumsum_exclusive.py": {
+            "mean": 5.67,
+            "std": 0.01,
+            "min": 5.66,
+            "max": 5.75,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "93_masked_cumsum.py": {
+            "mean": 3.31,
+            "std": 0.0057,
+            "min": 3.3,
+            "max": 3.33,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "94_MSELoss.py": {
+            "mean": 2.78,
+            "std": 0.00263,
+            "min": 2.78,
+            "max": 2.79,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "95_CrossEntropyLoss.py": {
+            "mean": 0.234,
+            "std": 0.0313,
+            "min": 0.223,
+            "max": 0.541,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "96_HuberLoss.py": {
+            "mean": 2.79,
+            "std": 0.00316,
+            "min": 2.78,
+            "max": 2.8,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "97_ScaledDotProductAttention.py": {
+            "mean": 8.25,
+            "std": 0.228,
+            "min": 8.1,
+            "max": 9.25,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "98_KLDivLoss.py": {
+            "mean": 0.745,
+            "std": 0.0237,
+            "min": 0.735,
+            "max": 0.965,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "99_TripletMarginLoss.py": {
+            "mean": 1.04,
+            "std": 0.00274,
+            "min": 1.04,
+            "max": 1.05,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        },
+        "100_HingeLoss.py": {
+            "mean": 1.41,
+            "std": 0.00369,
+            "min": 1.4,
+            "max": 1.42,
+            "num_trials": 100,
+            "hardware": "NVIDIA H100 80GB HBM3",
+            "device": "cuda:0"
+        }
+    }
+}
\ No newline at end of file
diff --git a/scripts/benchmark_eval_analysis.py b/scripts/benchmark_eval_analysis.py
index e2bea005..428e240f 100644
--- a/scripts/benchmark_eval_analysis.py
+++ b/scripts/benchmark_eval_analysis.py
@@ -40,7 +40,7 @@ def patch(eval_results, dataset):
     """
     Patch the eval results with the dataset
     """
-    for pid in range(1, len(dataset) + 1):
+    for pid in dataset.get_problem_ids():
         if str(pid) not in eval_results:
             eval_results[str(pid)] = {
                 "sample_id": 0,
@@ -136,19 +136,40 @@ def analyze_greedy_eval(run_name, hardware, baseline, level):
     )
 
     # Extract the speedup values
-    is_correct = np.array([entry["correctness"] for entry in eval_results.values()])
-    baseline_speed = np.array(
-        [entry["mean"] for entry in baseline_results[f"level{level}"].values()]
-    )
-    actual_speed = np.array([entry["runtime"] for entry in eval_results.values()])
+    is_correct_list = []
+    baseline_speed_list = []
+    actual_speed_list = []
+
+    # Sort problem IDs to ensure consistent order
+    sorted_pids = sorted(dataset.get_problem_ids())
+
+    for pid in sorted_pids:
+        # Get eval result
+        if str(pid) not in eval_results:
+            print(f"Warning: Problem {pid} not found in eval results")
+            continue
+        eval_entry = eval_results[str(pid)]
+        
+        # Get baseline result
+        problem_path = dataset.get_problem_by_id(pid)
+        problem_name = os.path.basename(problem_path)
+        
+        if problem_name not in baseline_results[f"level{level}"]:
+            print(f"Warning: Problem {problem_name} not found in baseline results")
+            continue
+            
+        baseline_entry = baseline_results[f"level{level}"][problem_name]
+        
+        is_correct_list.append(eval_entry["correctness"])
+        actual_speed_list.append(eval_entry["runtime"])
+        baseline_speed_list.append(baseline_entry["mean"])
+
+    is_correct = np.array(is_correct_list)
+    baseline_speed = np.array(baseline_speed_list)
+    actual_speed = np.array(actual_speed_list)
     n = len(is_correct)
 
-    assert (
-        len(baseline_speed) == n
-    ), "Baseline speedup values do not match the number of eval results"
-    assert (
-        len(actual_speed) == n
-    ), "Actual speedup values do not match the number of eval results"
+    print(f"Aligned {n} problems for analysis")
 
     # Calculate the metrics
     gmsr_correct = geometric_mean_speed_ratio_correct_only(
diff --git a/scripts/eval_from_generations.py b/scripts/eval_from_generations.py
index 2e39e3be..a973187f 100644
--- a/scripts/eval_from_generations.py
+++ b/scripts/eval_from_generations.py
@@ -257,10 +257,7 @@ def fetch_ref_arch_from_problem_id(
         problem_name = curr_problem_row["name"][0]
 
     elif dataset_src == "local":
-        problem_idx_in_dataset = (
-            problem_id - 1
-        )  # due to dataset list being 0-indexed locally
-        ref_arch_path = dataset[problem_idx_in_dataset]
+        ref_arch_path = dataset.get_problem_by_id(problem_id)
 
         problem_name = os.path.basename(ref_arch_path)
         ref_arch_src = read_file(ref_arch_path)
@@ -764,17 +761,18 @@ def main(config: EvalConfig):
         curr_level_dataset = construct_kernelbench_dataset(config.level)
 
     num_problems_in_level = len(curr_level_dataset)
+    all_problem_ids = curr_level_dataset.get_problem_ids() if config.dataset_src == "local" else list(range(1, num_problems_in_level + 1))
 
     if config.subset == (None, None):
-        problem_id_range = range(1, num_problems_in_level)
+        problem_ids_to_run = all_problem_ids
     else:
-        assert (
-            config.subset[0] >= 1 and config.subset[1] <= num_problems_in_level
-        ), f"Subset range {config.subset} out of range for Level {config.level}"
-        problem_id_range = range(config.subset[0], config.subset[1])
+        start, end = config.subset
+        problem_ids_to_run = [pid for pid in all_problem_ids if start <= pid <= end]
+        if not problem_ids_to_run:
+             print(f"Warning: No problems found in subset range {config.subset}")
 
     print(
-        f"Evaluating {config.num_samples_per_problem} sample(s) each for level {config.level} problems: {problem_id_range}"
+        f"Evaluating {config.num_samples_per_problem} sample(s) each for level {config.level} problems: {problem_ids_to_run}"
     )
 
     run_dir = os.path.join(config.runs_dir, config.run_name)
@@ -784,22 +782,19 @@ def main(config: EvalConfig):
     # single_eval_example(config, curr_level_dataset, run_dir, eval_file_path)
 
     total_work = []
-    for problem_id in range(
-        problem_id_range.start, problem_id_range.stop + 1
-    ):  # end index is inclusive
+    for problem_id in problem_ids_to_run:
         for sample_id in range(config.num_samples_per_problem):
             if not check_if_eval_exists_local(problem_id, sample_id, eval_file_path):
                 total_work.append((problem_id, sample_id))
 
     print(
         f"Start evaluation on {len(total_work)} unevaluated samples"
-        f" in range: {problem_id_range}"
+        f" in range: {problem_ids_to_run}"
     )
     # Build Cache on CPU as that is faster (only for local mode)
     if config.build_cache and config.eval_mode == "local":
         compile.batch_compile(total_work, config.to_dict())
 
-    # Batch Eval on multiple GPUs in parallel
     batch_eval(total_work, config, curr_level_dataset, run_dir, eval_file_path)
 
     # Calculate pass@k metrics if multiple samples per problem were evaluated
diff --git a/scripts/generate_and_eval_single_sample.py b/scripts/generate_and_eval_single_sample.py
index 18fb3c55..0b86964a 100644
--- a/scripts/generate_and_eval_single_sample.py
+++ b/scripts/generate_and_eval_single_sample.py
@@ -139,10 +139,7 @@ def main(config: EvalConfig):
         problem_name = curr_problem_row["name"][0]
 
     elif config.dataset_src == "local":
-        problem_idx_in_dataset = (
-            config.problem_id - 1
-        )  # due to dataset list being 0-indexed locally
-        ref_arch_path = curr_level_dataset[problem_idx_in_dataset]
+        ref_arch_path = curr_level_dataset.get_problem_by_id(config.problem_id)
 
         problem_name = os.path.basename(ref_arch_path)
         ref_arch_src = read_file(ref_arch_path)
diff --git a/scripts/generate_and_eval_single_sample_modal.py b/scripts/generate_and_eval_single_sample_modal.py
index 6962f515..471cee9a 100644
--- a/scripts/generate_and_eval_single_sample_modal.py
+++ b/scripts/generate_and_eval_single_sample_modal.py
@@ -13,7 +13,7 @@
 
 from datasets import load_dataset
 
-#from src.dataset import construct_kernelbench_dataset
+from src.dataset import construct_kernelbench_dataset
 from src.eval import eval_kernel_against_ref
 from src.prompt_constructor import prompt_generate_custom_cuda_from_prompt_template
 from src.prompt_constructor_multilang import get_prompt_for_backend
@@ -148,6 +148,8 @@ def main(config: EvalConfig):
     if config.dataset_src == "huggingface":
         dataset = load_dataset(config.dataset_name)
         curr_level_dataset = dataset[f"level_{config.level}"]
+    elif config.dataset_src == "local":
+        curr_level_dataset = construct_kernelbench_dataset(config.level)
 
     if config.log:
         os.makedirs(config.logdir, exist_ok=True)
@@ -168,8 +170,7 @@ def main(config: EvalConfig):
         problem_name = curr_problem_row["name"][0]
 
     elif config.dataset_src == "local":
-        problem_idx_in_dataset = config.problem_id - 1 # due to dataset list being 0-indexed locally
-        ref_arch_path = curr_level_dataset[problem_idx_in_dataset]
+        ref_arch_path = curr_level_dataset.get_problem_by_id(config.problem_id)
 
         problem_name = os.path.basename(ref_arch_path)
         ref_arch_src = read_file(ref_arch_path)
diff --git a/scripts/generate_baseline_time.py b/scripts/generate_baseline_time.py
index 5a68ea08..739ffcc1 100644
--- a/scripts/generate_baseline_time.py
+++ b/scripts/generate_baseline_time.py
@@ -7,7 +7,7 @@
     set_seed,
     fetch_ref_arch_from_problem_id,
 )
-from src.dataset import construct_problem_dataset_from_problem_dir
+from src.dataset import construct_kernelbench_dataset
 from src.utils import read_file
 import os
 import json
@@ -46,7 +46,7 @@
 TIMING_DIR = os.path.join(REPO_TOP_PATH, "results", "timing")
 
 
-def fetch_ref_arch_from_dataset(dataset: list[str], 
+def fetch_ref_arch_from_dataset(dataset, 
                                 problem_id: int) -> tuple[str, str, str]:
     """
     Fetch the reference architecture from the problem directory
@@ -57,14 +57,7 @@ def fetch_ref_arch_from_dataset(dataset: list[str],
         ref_arch_name: str, the name of the reference architecture
         ref_arch_src: str, the source code of the reference architecture
     """
-    ref_arch_path = None
-    
-    for file in dataset:
-        if file.split("/")[-1].split("_")[0] == str(problem_id):
-            ref_arch_path = file
-            break
-    if ref_arch_path is None:
-        raise ValueError(f"No reference architecture found for problem_id {problem_id}")
+    ref_arch_path = dataset.get_problem_by_id(problem_id)
     
     ref_arch_src = read_file(ref_arch_path)
 
@@ -143,12 +136,11 @@ def record_baseline_times(use_torch_compile: bool = False,
     json_results = {}
     
     for level in [1, 2, 3]:
-        PROBLEM_DIR = os.path.join(KERNEL_BENCH_PATH, "level" + str(level))
-        dataset = construct_problem_dataset_from_problem_dir(PROBLEM_DIR)
+        dataset = construct_kernelbench_dataset(level)
         json_results[f"level{level}"] = {}
 
         num_problems = len(dataset)
-        for problem_id in tqdm(range(1, num_problems + 1)):
+        for problem_id in tqdm(dataset.get_problem_ids()):
             ref_arch_path, ref_arch_name, ref_arch_src = fetch_ref_arch_from_dataset(dataset, problem_id)
             runtime_stats = measure_program_time(
                 ref_arch_name=ref_arch_name,
@@ -174,8 +166,7 @@ def test_measure_particular_program(level_num: int, problem_id: int):
     """
     device = torch.device("cuda:0")
 
-    PROBLEM_DIR = os.path.join(KERNEL_BENCH_PATH, "level" + str(level_num))
-    dataset = construct_problem_dataset_from_problem_dir(PROBLEM_DIR)
+    dataset = construct_kernelbench_dataset(level_num)
 
     ref_arch_path, ref_arch_name, ref_arch_src = fetch_ref_arch_from_dataset(dataset, problem_id)
 
diff --git a/scripts/generate_baseline_time_modal.py b/scripts/generate_baseline_time_modal.py
index a0039193..f89a6a84 100644
--- a/scripts/generate_baseline_time_modal.py
+++ b/scripts/generate_baseline_time_modal.py
@@ -7,7 +7,7 @@
     set_seed,
     fetch_ref_arch_from_problem_id,
 )
-from src.dataset import construct_problem_dataset_from_problem_dir
+from src.dataset import construct_kernelbench_dataset
 from src.utils import read_file
 import os
 import json
@@ -126,7 +126,7 @@ def write_batch_to_json(entries_to_write: list, f_path: str):
     
     print(f"[INFO] Wrote {len(entries_to_write)} entries to {f_path}")
 
-def fetch_ref_arch_from_dataset(dataset: list[str], 
+def fetch_ref_arch_from_dataset(dataset, 
                                 problem_id: int) -> tuple[str, str, str]:
     """
     Fetch the reference architecture from the problem directory
@@ -137,14 +137,7 @@ def fetch_ref_arch_from_dataset(dataset: list[str],
         ref_arch_name: str, the name of the reference architecture
         ref_arch_src: str, the source code of the reference architecture
     """
-    ref_arch_path = None
-    
-    for file in dataset:
-        if file.split("/")[-1].split("_")[0] == str(problem_id):
-            ref_arch_path = file
-            break
-    if ref_arch_path is None:
-        raise ValueError(f"No reference architecture found for problem_id {problem_id}")
+    ref_arch_path = dataset.get_problem_by_id(problem_id)
     
     ref_arch_src = read_file(ref_arch_path)
 
@@ -229,10 +222,9 @@ def record_baseline_times(config: BaselineConfig,
     json_results = []
 
     level = config.level
-    PROBLEM_DIR = os.path.join(KERNEL_BENCH_PATH, "level" + str(level))
-    dataset = construct_problem_dataset_from_problem_dir(PROBLEM_DIR)
+    dataset = construct_kernelbench_dataset(level)
     num_problems = len(dataset)
-    total_work = [(i, *fetch_ref_arch_from_dataset(dataset, i)) for i in list(range(1, num_problems + 1))]
+    total_work = [(i, *fetch_ref_arch_from_dataset(dataset, i)) for i in dataset.get_problem_ids()]
 
     with tqdm(total=len(total_work), desc="Processing batches") as pbar:
         while len(total_work) > 0:
diff --git a/scripts/generate_samples.py b/scripts/generate_samples.py
index 5b476445..c6869c15 100644
--- a/scripts/generate_samples.py
+++ b/scripts/generate_samples.py
@@ -112,10 +112,7 @@ def generate_sample_single(
         problem_name = curr_problem_row["name"][0]
 
     elif config.dataset_src == "local":
-        problem_idx_in_dataset = (
-            work.problem_id - 1
-        )  # due to dataset list being 0-indexed locally
-        ref_arch_path = dataset[problem_idx_in_dataset]
+        ref_arch_path = dataset.get_problem_by_id(work.problem_id)
 
         problem_name = os.path.basename(ref_arch_path)
         ref_arch_src = read_file(ref_arch_path)
@@ -224,17 +221,18 @@ def main(config: GenerationConfig):
         curr_level_dataset = construct_kernelbench_dataset(config.level)
 
     num_problems_in_level = len(curr_level_dataset)
+    all_problem_ids = curr_level_dataset.get_problem_ids() if config.dataset_src == "local" else list(range(1, num_problems_in_level + 1))
 
     if config.subset == (None, None):
-        problem_id_range = range(1, num_problems_in_level)
+        problem_ids_to_run = all_problem_ids
     else:
-        assert (
-            config.subset[0] >= 1 and config.subset[1] <= num_problems_in_level
-        ), f"Subset range {config.subset} out of range for Level {config.level}"
-        problem_id_range = range(config.subset[0], config.subset[1])
+        start, end = config.subset
+        problem_ids_to_run = [pid for pid in all_problem_ids if start <= pid <= end]
+        if not problem_ids_to_run:
+             print(f"Warning: No problems found in subset range {config.subset}")
 
     print(
-        f"Generating {config.num_samples} sample(s) each for level {config.level} problems: {problem_id_range}"
+        f"Generating {config.num_samples} sample(s) each for level {config.level} problems: {problem_ids_to_run}"
     )
 
     # set up run directory
@@ -253,9 +251,7 @@ def main(config: GenerationConfig):
     problems_to_run = []
     total_problems = 0
     already_completed = 0
-    for problem_id in range(
-        problem_id_range.start, problem_id_range.stop + 1
-    ):  # end index is inclusive
+    for problem_id in problem_ids_to_run:
         for sample_id in range(config.num_samples):
             total_problems += 1
             if not check_kernel_exists(run_dir, config.level, problem_id, sample_id):
diff --git a/scripts/inspect_baseline.py b/scripts/inspect_baseline.py
index e7811f64..29afe0c9 100644
--- a/scripts/inspect_baseline.py
+++ b/scripts/inspect_baseline.py
@@ -10,7 +10,7 @@
     set_seed,
     fetch_ref_arch_from_problem_id,
 )
-from src.dataset import construct_problem_dataset_from_problem_dir
+from src.dataset import construct_kernelbench_dataset
 import os, sys
 import logging
 import json
@@ -93,8 +93,7 @@ def emit(self, record):
     separator("")
     
 def fetch_ref_arch_from_level_problem_id(level_num, problem_id, with_name=False):
-    PROBLEM_DIR = os.path.join(KERNEL_BENCH_PATH, "level" + str(level_num))
-    dataset = construct_problem_dataset_from_problem_dir(PROBLEM_DIR)
+    dataset = construct_kernelbench_dataset(level_num)
     return fetch_ref_arch_from_problem_id(problem_id, dataset, with_name)
 
 def inspect_torch_compile_triton(level_num, problem_id):
diff --git a/scripts/inspect_triton.py b/scripts/inspect_triton.py
index 4f13c8af..1e1c5a10 100644
--- a/scripts/inspect_triton.py
+++ b/scripts/inspect_triton.py
@@ -26,8 +26,9 @@
     set_seed,
 )
 
-def fetch_ref_arch_from_dataset(dataset: list[str], 
-                                problem_id: int) -> tuple[str, str, str]:
+from src.dataset import construct_kernelbench_dataset
+
+def fetch_ref_arch_from_dataset(dataset, problem_id: int) -> tuple[str, str, str]:
     """
     Fetch the reference architecture from the problem directory
     problem_id should be logical index (1-indexed), matching the problem_id in the problem_name
@@ -37,18 +38,9 @@ def fetch_ref_arch_from_dataset(dataset: list[str],
         ref_arch_name: str, the name of the reference architecture
         ref_arch_src: str, the source code of the reference architecture
     """
-    ref_arch_path = None
-    
-    for file in dataset:
-        if file.split("/")[-1].split("_")[0] == str(problem_id):
-            ref_arch_path = file
-            break
-    if ref_arch_path is None:
-        raise ValueError(f"No reference architecture found for problem_id {problem_id}")
-    
+    ref_arch_path = dataset.get_problem_by_id(problem_id)
     ref_arch_src = read_file(ref_arch_path)
-
-    ref_arch_name = ref_arch_path.split("/")[-1]
+    ref_arch_name = os.path.basename(ref_arch_path)
     return (ref_arch_path, ref_arch_name, ref_arch_src)
 
 
@@ -125,7 +117,7 @@ def get_torch_compile_triton(level_num, problem_id):
     Get the triton code generated by torch compile for a particular problem
     """
     ref_arch_path, ref_arch_name, ref_arch_src = fetch_ref_arch_from_dataset(
-        dataset, problem_id, with_name=True
+        dataset, problem_id
     )
     context = {}
     # import pdb; pdb.set_trace()
diff --git a/scripts/run_and_check.py b/scripts/run_and_check.py
index 316b96ee..43fb4e81 100644
--- a/scripts/run_and_check.py
+++ b/scripts/run_and_check.py
@@ -81,6 +81,7 @@ def __init__(self):
         # ref_origin is local, specify local file path
         self.ref_arch_src_path = ""
         # ref_origin is kernelbench, specify level and problem id
+        self.dataset_src = "huggingface" # either huggingface or local
         self.dataset_name = "ScalingIntelligence/KernelBench"
         self.level = ""
         self.problem_id = ""
@@ -240,16 +241,25 @@ def main(config: ScriptConfig):
         assert config.level != "", "level is required"
         assert config.problem_id != "", "problem_id is required"
 
-        # for now use the HuggingFace dataset
-        dataset = load_dataset(config.dataset_name)
-        curr_level_dataset = dataset[f"level_{config.level}"]
-
-        curr_problem_row = curr_level_dataset.filter(lambda x: x["problem_id"] == config.problem_id)
-        ref_arch_src = curr_problem_row["code"][0]
-        problem_name = curr_problem_row["name"][0]
+        if config.dataset_src == "huggingface":
+            # for now use the HuggingFace dataset
+            dataset = load_dataset(config.dataset_name)
+            curr_level_dataset = dataset[f"level_{config.level}"]
+
+            curr_problem_row = curr_level_dataset.filter(lambda x: x["problem_id"] == config.problem_id)
+            ref_arch_src = curr_problem_row["code"][0]
+            problem_name = curr_problem_row["name"][0]
+        elif config.dataset_src == "local":
+            from src.dataset import construct_kernelbench_dataset
+            dataset = construct_kernelbench_dataset(config.level)
+            ref_arch_path = dataset.get_problem_by_id(int(config.problem_id))
+            ref_arch_src = read_file(ref_arch_path)
+            problem_name = os.path.basename(ref_arch_path)
+        else:
+            raise ValueError(f"Invalid dataset_src: {config.dataset_src}")
 
         problem_number = int(problem_name.split("_")[0])
-        assert problem_number == config.problem_id, f"Problem number in filename ({problem_number}) does not match config problem_id ({config.problem_id})"
+        assert problem_number == int(config.problem_id), f"Problem number in filename ({problem_number}) does not match config problem_id ({config.problem_id})"
 
         print(f"Fetched problem {config.problem_id} from KernelBench level {config.level}: {problem_name}")
 
diff --git a/scripts/verify_bench.py b/scripts/verify_bench.py
index 5fdc6862..2ad79395 100644
--- a/scripts/verify_bench.py
+++ b/scripts/verify_bench.py
@@ -71,37 +71,43 @@ def run(Model, NewModel, get_inputs, get_init_inputs, seed=1012):
     return check_correctness(Model, NewModel, get_inputs, get_init_inputs, seed)
 
 
-def run_all(directory):
-    print(f"Running {directory}")
+from src.dataset import construct_kernelbench_dataset
+
+def run_all(level):
+    print(f"Running Level {level}")
+    dataset = construct_kernelbench_dataset(level)
     total = 0
     passed = 0
     fail_tests = []
-    abs_path = os.path.abspath(directory)
-    for filename in os.listdir(abs_path):
-        if filename.endswith(".py"):
-            total += 1
-            module_name = filename[:-3]  # Remove .py extension
-            try:
-                # Dynamically import the module
-                spec = importlib.util.spec_from_file_location(
-                    module_name, os.path.join(abs_path, filename)
-                )
-                module = importlib.util.module_from_spec(spec)
-                spec.loader.exec_module(module)
-                # Get the required attributes from the module
-                Model = getattr(module, "Model")
-                get_inputs = getattr(module, "get_inputs")
-                get_init_inputs = getattr(module, "get_init_inputs")
-                assert run(Model, Model, get_inputs, get_init_inputs)
-                passed += 1
-            except Exception as e:
-                fail_tests.append(module_name)
-    print(f"{directory}: {passed}/{total} passed")
+    
+    for problem_id in dataset.get_problem_ids():
+        problem_path = dataset.get_problem_by_id(problem_id)
+        filename = os.path.basename(problem_path)
+        
+        total += 1
+        module_name = filename[:-3]  # Remove .py extension
+        try:
+            # Dynamically import the module
+            spec = importlib.util.spec_from_file_location(
+                module_name, problem_path
+            )
+            module = importlib.util.module_from_spec(spec)
+            spec.loader.exec_module(module)
+            # Get the required attributes from the module
+            Model = getattr(module, "Model")
+            get_inputs = getattr(module, "get_inputs")
+            get_init_inputs = getattr(module, "get_init_inputs")
+            assert run(Model, Model, get_inputs, get_init_inputs)
+            passed += 1
+        except Exception as e:
+            print(f"Failed {module_name}: {e}")
+            fail_tests.append(module_name)
+    print(f"Level {level}: {passed}/{total} passed")
     if len(fail_tests) > 0:
         print(f"Failed tests: {fail_tests}")
 
 
 if __name__ == "__main__":
-    run_all(KERNEL_BENCH_PATH + "/level1")
-    run_all(KERNEL_BENCH_PATH + "/level2")
-    run_all(KERNEL_BENCH_PATH + "/level3")
+    run_all(1)
+    run_all(2)
+    run_all(3)
diff --git a/src/dataset.py b/src/dataset.py
index cb429dc1..81d393c2 100644
--- a/src/dataset.py
+++ b/src/dataset.py
@@ -38,6 +38,51 @@ def get_code_hash(problem_src: str) -> str:
     return hashlib.md5(cleaned_problem_src.encode()).hexdigest()
 
 
+
+def check_id_matches_name(problem_id: int, problem_name: str):
+    """Check if the problem_id matches the ID in the problem_name"""
+    return problem_id == int(os.path.basename(problem_name).split('_')[0])
+
+
+class KernelBenchDataset():
+    def __init__(self, dataset_name: str, level: int, use_subset=False, dataset=[], subset_dataset=[]):
+        
+        self.dataset_name = dataset_name
+        
+        if use_subset:
+            self.problems = subset_dataset
+        else:
+            self.problems = dataset
+
+        self.level = level
+        self.use_subset = use_subset
+
+        # print(f"[Initilaize Dataset Object] {self.dataset_name} with level {self.level} and use_subset {self.use_subset}")
+
+    def get_problem_by_id(self, problem_id=int):
+        "Logical index of problem_id (logical is 1-indexed)"
+        # Find problem with matching ID in basename
+
+        for problem in self.problems:
+            if check_id_matches_name(problem_id, problem):
+                return problem
+        raise ValueError(f"Problem ID {problem_id} not found in dataset")
+    
+    # get the problem_ids 
+    def get_problem_ids(self):
+        # return self.whol
+        return [int(os.path.basename(problem).split('_')[0]) for problem in self.problems]
+
+    def __len__(self):
+        return len(self.problems)
+
+    def __getitem__(self, index):
+        return self.problems[index]
+    
+    def __iter__(self):
+        return iter(self.problems)
+
+
 def construct_problem_dataset_from_problem_dir(problem_dir: str) -> list[str]:
     """
     Construct a list of relative paths to all the python files in the problem directory
@@ -57,10 +102,15 @@ def construct_problem_dataset_from_problem_dir(problem_dir: str) -> list[str]:
     return DATASET
 
 
-def construct_kernelbench_dataset(level: int) -> list[str]:
-    return construct_problem_dataset_from_problem_dir(
+def construct_kernelbench_dataset(level: int) -> KernelBenchDataset:
+    dataset_list = construct_problem_dataset_from_problem_dir(
         os.path.join(KERNEL_BENCH_PATH, f"level{level}")
     )
+    return KernelBenchDataset(
+        dataset_name=f"KernelBench_Level_{level}",
+        level=level,
+        dataset=dataset_list
+    )
 
 
 KERNELBENCH_LEVEL_1_DATASET = construct_kernelbench_dataset(level=1)
diff --git a/src/eval.py b/src/eval.py
index 4a072c89..9f2862a9 100644
--- a/src/eval.py
+++ b/src/eval.py
@@ -21,7 +21,7 @@
 import torch.nn as nn
 from pydantic import BaseModel
 
-from . import utils
+from . import utils, dataset
 
 REPO_TOP_PATH = os.path.abspath(
     os.path.join(
@@ -46,7 +46,10 @@ def fetch_ref_arch_from_problem_id(problem_id, problems, with_name=False) -> str
     if isinstance(problem_id, str):
         problem_id = int(problem_id)
 
-    problem_path = problems[problem_id]
+    if hasattr(problems, "get_problem_by_id"):
+        problem_path = problems.get_problem_by_id(problem_id)
+    else:
+        problem_path = problems[problem_id]
 
     # problem_path = os.path.join(REPO_ROOT_PATH, problem)
     if not os.path.exists(problem_path):
@@ -60,9 +63,8 @@ def fetch_ref_arch_from_problem_id(problem_id, problems, with_name=False) -> str
 
 
 def fetch_ref_arch_from_level_problem_id(level, problem_id, with_name=False):
-    PROBLEM_DIR = os.path.join(KERNEL_BENCH_PATH, "level" + str(level))
-    dataset = utils.construct_problem_dataset_from_problem_dir(PROBLEM_DIR)
-    return fetch_ref_arch_from_problem_id(problem_id, dataset, with_name)
+    kb_dataset = dataset.construct_kernelbench_dataset(level)
+    return fetch_ref_arch_from_problem_id(problem_id, kb_dataset, with_name)
 
 
 def set_seed(seed: int):
@@ -884,7 +886,12 @@ def fetch_baseline_time(
     with open(baseline_time_filepath, "r") as f:
         baseline_json = json.load(f)
 
-    problem_name = dataset[problem_id].split("/")[-1]
+    if hasattr(dataset, "get_problem_by_id"):
+        problem_path = dataset.get_problem_by_id(problem_id)
+    else:
+        problem_path = dataset[problem_id]
+
+    problem_name = os.path.basename(problem_path)
     baseline_time = baseline_json[level_name].get(problem_name, None)
     return baseline_time
 

From c77955fd8a8daeb3e651cbd861fae75189b48aed Mon Sep 17 00:00:00 2001
From: pythonomar22 <omarabulhassan@gmail.com>
Date: Thu, 20 Nov 2025 20:24:40 -0800
Subject: [PATCH 2/5] fixing some syntax

---
 scripts/generate_baseline_time.py       |  27 ++++---
 scripts/generate_baseline_time_modal.py |  27 ++++---
 scripts/inspect_triton.py               |  53 ++++++++----
 src/dataset.py                          | 103 +++++++++++++++++++-----
 4 files changed, 153 insertions(+), 57 deletions(-)

diff --git a/scripts/generate_baseline_time.py b/scripts/generate_baseline_time.py
index 739ffcc1..a8abb911 100644
--- a/scripts/generate_baseline_time.py
+++ b/scripts/generate_baseline_time.py
@@ -7,7 +7,7 @@
     set_seed,
     fetch_ref_arch_from_problem_id,
 )
-from src.dataset import construct_kernelbench_dataset
+from src.dataset import construct_kernelbench_dataset, KernelBenchDataset
 from src.utils import read_file
 import os
 import json
@@ -46,22 +46,25 @@
 TIMING_DIR = os.path.join(REPO_TOP_PATH, "results", "timing")
 
 
-def fetch_ref_arch_from_dataset(dataset, 
-                                problem_id: int) -> tuple[str, str, str]:
-    """
-    Fetch the reference architecture from the problem directory
-    problem_id should be logical index (1-indexed), matching the problem_id in the problem_name
+def fetch_ref_arch_from_dataset(
+    dataset: KernelBenchDataset,
+    problem_id: int
+) -> tuple[str, str, str]:
+    """Fetch the reference architecture from the dataset.
+
+    Args:
+        dataset: KernelBenchDataset object
+        problem_id: Logical index (1-indexed), matching the problem_id in the problem_name
 
     Returns:
-        ref_arch_path: str, the path to the reference architecture
-        ref_arch_name: str, the name of the reference architecture
-        ref_arch_src: str, the source code of the reference architecture
+        tuple containing:
+            - ref_arch_path: Path to the reference architecture
+            - ref_arch_name: Name of the reference architecture file
+            - ref_arch_src: Source code of the reference architecture
     """
     ref_arch_path = dataset.get_problem_by_id(problem_id)
-    
     ref_arch_src = read_file(ref_arch_path)
-
-    ref_arch_name = ref_arch_path.split("/")[-1]
+    ref_arch_name = os.path.basename(ref_arch_path)
     return (ref_arch_path, ref_arch_name, ref_arch_src)
 
 
diff --git a/scripts/generate_baseline_time_modal.py b/scripts/generate_baseline_time_modal.py
index f89a6a84..85fc5e88 100644
--- a/scripts/generate_baseline_time_modal.py
+++ b/scripts/generate_baseline_time_modal.py
@@ -7,7 +7,7 @@
     set_seed,
     fetch_ref_arch_from_problem_id,
 )
-from src.dataset import construct_kernelbench_dataset
+from src.dataset import construct_kernelbench_dataset, KernelBenchDataset
 from src.utils import read_file
 import os
 import json
@@ -126,22 +126,25 @@ def write_batch_to_json(entries_to_write: list, f_path: str):
     
     print(f"[INFO] Wrote {len(entries_to_write)} entries to {f_path}")
 
-def fetch_ref_arch_from_dataset(dataset, 
-                                problem_id: int) -> tuple[str, str, str]:
-    """
-    Fetch the reference architecture from the problem directory
-    problem_id should be logical index (1-indexed), matching the problem_id in the problem_name
+def fetch_ref_arch_from_dataset(
+    dataset: KernelBenchDataset,
+    problem_id: int
+) -> tuple[str, str, str]:
+    """Fetch the reference architecture from the dataset.
+
+    Args:
+        dataset: KernelBenchDataset object
+        problem_id: Logical index (1-indexed), matching the problem_id in the problem_name
 
     Returns:
-        ref_arch_path: str, the path to the reference architecture
-        ref_arch_name: str, the name of the reference architecture
-        ref_arch_src: str, the source code of the reference architecture
+        tuple containing:
+            - ref_arch_path: Path to the reference architecture
+            - ref_arch_name: Name of the reference architecture file
+            - ref_arch_src: Source code of the reference architecture
     """
     ref_arch_path = dataset.get_problem_by_id(problem_id)
-    
     ref_arch_src = read_file(ref_arch_path)
-
-    ref_arch_name = ref_arch_path.split("/")[-1]
+    ref_arch_name = os.path.basename(ref_arch_path)
     return (ref_arch_path, ref_arch_name, ref_arch_src)
 
 @app.cls(image=image, scaledown_window=5)
diff --git a/scripts/inspect_triton.py b/scripts/inspect_triton.py
index 1e1c5a10..6e887c9b 100644
--- a/scripts/inspect_triton.py
+++ b/scripts/inspect_triton.py
@@ -26,17 +26,23 @@
     set_seed,
 )
 
-from src.dataset import construct_kernelbench_dataset
+from src.dataset import construct_kernelbench_dataset, KernelBenchDataset
 
-def fetch_ref_arch_from_dataset(dataset, problem_id: int) -> tuple[str, str, str]:
-    """
-    Fetch the reference architecture from the problem directory
-    problem_id should be logical index (1-indexed), matching the problem_id in the problem_name
+def fetch_ref_arch_from_dataset(
+    dataset: KernelBenchDataset,
+    problem_id: int
+) -> tuple[str, str, str]:
+    """Fetch the reference architecture from the dataset.
+
+    Args:
+        dataset: KernelBenchDataset object
+        problem_id: Logical index (1-indexed), matching the problem_id in the problem_name
 
     Returns:
-        ref_arch_path: str, the path to the reference architecture
-        ref_arch_name: str, the name of the reference architecture
-        ref_arch_src: str, the source code of the reference architecture
+        tuple containing:
+            - ref_arch_path: Path to the reference architecture
+            - ref_arch_name: Name of the reference architecture file
+            - ref_arch_src: Source code of the reference architecture
     """
     ref_arch_path = dataset.get_problem_by_id(problem_id)
     ref_arch_src = read_file(ref_arch_path)
@@ -44,10 +50,20 @@ def fetch_ref_arch_from_dataset(dataset, problem_id: int) -> tuple[str, str, str
     return (ref_arch_path, ref_arch_name, ref_arch_src)
 
 
-def run_profile_and_save_trace(dataset: list[str], problem_id: int, num_trials=10):
-    """
-    Helper function to get Torch Profile of a problem
-    # TODO: Fix up this function
+def run_profile_and_save_trace(
+    dataset: KernelBenchDataset,
+    problem_id: int,
+    num_trials: int = 10
+) -> None:
+    """Helper function to get Torch Profile of a problem.
+
+    Args:
+        dataset: KernelBenchDataset object
+        problem_id: Problem ID to profile
+        num_trials: Number of profiling trials to run (default: 10)
+
+    Note:
+        Saves trace files to 'trace_non_compiled.json' and 'trace_compiled.json'
     """
     ref_arch_path, ref_arch_name, ref_arch_src = fetch_ref_arch_from_dataset(
         dataset, problem_id
@@ -112,10 +128,17 @@ def run_profile_and_save_trace(dataset: list[str], problem_id: int, num_trials=1
     # except Exception as e:
         # print(f"[Eval] Error in Measuring Performance: {e}")
 
-def get_torch_compile_triton(level_num, problem_id):
-    """
-    Get the triton code generated by torch compile for a particular problem
+def get_torch_compile_triton(level_num: int, problem_id: int) -> str:
+    """Get the triton code generated by torch compile for a particular problem.
+
+    Args:
+        level_num: KernelBench level (1, 2, or 3)
+        problem_id: Problem ID to inspect
+
+    Returns:
+        str: Name of the reference architecture
     """
+    dataset = construct_kernelbench_dataset(level_num)
     ref_arch_path, ref_arch_name, ref_arch_src = fetch_ref_arch_from_dataset(
         dataset, problem_id
     )
diff --git a/src/dataset.py b/src/dataset.py
index 81d393c2..0eb54815 100644
--- a/src/dataset.py
+++ b/src/dataset.py
@@ -39,49 +39,116 @@ def get_code_hash(problem_src: str) -> str:
 
 
 
-def check_id_matches_name(problem_id: int, problem_name: str):
-    """Check if the problem_id matches the ID in the problem_name"""
-    return problem_id == int(os.path.basename(problem_name).split('_')[0])
+def check_id_matches_name(problem_id: int, problem_name: str) -> bool:
+    """Check if the problem_id matches the ID in the problem_name.
+
+    Args:
+        problem_id: The problem ID to check
+        problem_name: Path to the problem file
+
+    Returns:
+        bool: True if the ID matches the filename prefix
+
+    Raises:
+        ValueError: If filename doesn't follow the expected format
+    """
+    basename = os.path.basename(problem_name)
+    parts = basename.split('_')
+
+    if not parts or not parts[0].isdigit():
+        raise ValueError(
+            f"Problem filename '{basename}' doesn't follow expected format '<id>_<name>.py'"
+        )
+
+    return problem_id == int(parts[0])
 
 
 class KernelBenchDataset():
-    def __init__(self, dataset_name: str, level: int, use_subset=False, dataset=[], subset_dataset=[]):
-        
+    """Dataset object for easy access to problems by IDs and iteration over problems.
+
+    Args:
+        dataset_name: Name of the dataset
+        level: KernelBench level (1, 2, or 3)
+        use_subset: Whether to use the subset_dataset instead of full dataset
+        dataset: List of problem file paths for the full dataset
+        subset_dataset: List of problem file paths for a subset
+    """
+
+    def __init__(
+        self,
+        dataset_name: str,
+        level: int,
+        use_subset: bool = False,
+        dataset: list[str] = None,
+        subset_dataset: list[str] = None
+    ):
         self.dataset_name = dataset_name
-        
+        self.level = level
+        self.use_subset = use_subset
+
+        # Avoid mutable default arguments
+        if dataset is None:
+            dataset = []
+        if subset_dataset is None:
+            subset_dataset = []
+
         if use_subset:
             self.problems = subset_dataset
         else:
             self.problems = dataset
 
-        self.level = level
-        self.use_subset = use_subset
+    def get_problem_by_id(self, problem_id: int) -> str:
+        """Get problem path by its ID (1-indexed logical index).
 
-        # print(f"[Initilaize Dataset Object] {self.dataset_name} with level {self.level} and use_subset {self.use_subset}")
+        Args:
+            problem_id: The problem ID to search for
 
-    def get_problem_by_id(self, problem_id=int):
-        "Logical index of problem_id (logical is 1-indexed)"
-        # Find problem with matching ID in basename
+        Returns:
+            str: Path to the problem file
 
+        Raises:
+            ValueError: If problem ID not found in dataset
+        """
         for problem in self.problems:
             if check_id_matches_name(problem_id, problem):
                 return problem
         raise ValueError(f"Problem ID {problem_id} not found in dataset")
     
-    # get the problem_ids 
-    def get_problem_ids(self):
-        # return self.whol
+    def get_problem_ids(self) -> list[int]:
+        """Get list of all problem IDs in the dataset.
+
+        Returns:
+            list[int]: Sorted list of problem IDs extracted from filenames
+        """
         return [int(os.path.basename(problem).split('_')[0]) for problem in self.problems]
 
-    def __len__(self):
+    def __len__(self) -> int:
+        """Return the number of problems in the dataset."""
         return len(self.problems)
 
-    def __getitem__(self, index):
+    def __getitem__(self, index: int) -> str:
+        """Get problem by index (0-indexed, for backward compatibility).
+
+        Args:
+            index: Zero-based index into the problems list
+
+        Returns:
+            str: Path to the problem file
+        """
         return self.problems[index]
-    
+
     def __iter__(self):
+        """Iterate over problem paths in the dataset."""
         return iter(self.problems)
 
+    def __repr__(self) -> str:
+        """Return string representation of the dataset."""
+        subset_str = " (subset)" if self.use_subset else ""
+        return (
+            f"KernelBenchDataset(name='{self.dataset_name}', "
+            f"level={self.level}, problems={len(self.problems)}{subset_str})"
+        )
+
 
 def construct_problem_dataset_from_problem_dir(problem_dir: str) -> list[str]:
     """

From 660a47f2cc5dd37ab0192732251ef653fea79dcf Mon Sep 17 00:00:00 2001
From: pythonomar22 <omarabulhassan@gmail.com>
Date: Thu, 20 Nov 2025 20:44:08 -0800
Subject: [PATCH 3/5] fixing off by one error after testing

---
 scripts/generate_baseline_time.py       | 26 ++-------------
 scripts/generate_baseline_time_modal.py | 25 ++-------------
 scripts/generate_samples.py             |  2 +-
 scripts/inspect_baseline.py             |  4 +--
 scripts/inspect_triton.py               | 23 +-------------
 src/dataset.py                          | 42 +++++++++++++++++++++++--
 src/eval.py                             |  3 +-
 7 files changed, 49 insertions(+), 76 deletions(-)

diff --git a/scripts/generate_baseline_time.py b/scripts/generate_baseline_time.py
index a8abb911..95fca7ad 100644
--- a/scripts/generate_baseline_time.py
+++ b/scripts/generate_baseline_time.py
@@ -7,7 +7,7 @@
     set_seed,
     fetch_ref_arch_from_problem_id,
 )
-from src.dataset import construct_kernelbench_dataset, KernelBenchDataset
+from src.dataset import construct_kernelbench_dataset, KernelBenchDataset, fetch_ref_arch_from_dataset
 from src.utils import read_file
 import os
 import json
@@ -46,28 +46,6 @@
 TIMING_DIR = os.path.join(REPO_TOP_PATH, "results", "timing")
 
 
-def fetch_ref_arch_from_dataset(
-    dataset: KernelBenchDataset,
-    problem_id: int
-) -> tuple[str, str, str]:
-    """Fetch the reference architecture from the dataset.
-
-    Args:
-        dataset: KernelBenchDataset object
-        problem_id: Logical index (1-indexed), matching the problem_id in the problem_name
-
-    Returns:
-        tuple containing:
-            - ref_arch_path: Path to the reference architecture
-            - ref_arch_name: Name of the reference architecture file
-            - ref_arch_src: Source code of the reference architecture
-    """
-    ref_arch_path = dataset.get_problem_by_id(problem_id)
-    ref_arch_src = read_file(ref_arch_path)
-    ref_arch_name = os.path.basename(ref_arch_path)
-    return (ref_arch_path, ref_arch_name, ref_arch_src)
-
-
 def measure_program_time(
         ref_arch_name: str,
         ref_arch_src: str, 
@@ -243,7 +221,7 @@ def get_time_old(level_num, problem_id, num_trials=100, torch_compile=False):
     ref_arch_name, ref_arch_src = fetch_ref_arch_from_level_problem_id(
         level_num, problem_id, with_name=True
     )
-    ref_arch_name = ref_arch_name.split("/")[-1]
+    ref_arch_name = os.path.basename(ref_arch_name)
     context = {}
     Model, get_init_inputs, get_inputs = load_original_model_and_inputs(
         ref_arch_src, context
diff --git a/scripts/generate_baseline_time_modal.py b/scripts/generate_baseline_time_modal.py
index 85fc5e88..f7a579fa 100644
--- a/scripts/generate_baseline_time_modal.py
+++ b/scripts/generate_baseline_time_modal.py
@@ -7,7 +7,7 @@
     set_seed,
     fetch_ref_arch_from_problem_id,
 )
-from src.dataset import construct_kernelbench_dataset, KernelBenchDataset
+from src.dataset import construct_kernelbench_dataset, KernelBenchDataset, fetch_ref_arch_from_dataset
 from src.utils import read_file
 import os
 import json
@@ -126,27 +126,6 @@ def write_batch_to_json(entries_to_write: list, f_path: str):
     
     print(f"[INFO] Wrote {len(entries_to_write)} entries to {f_path}")
 
-def fetch_ref_arch_from_dataset(
-    dataset: KernelBenchDataset,
-    problem_id: int
-) -> tuple[str, str, str]:
-    """Fetch the reference architecture from the dataset.
-
-    Args:
-        dataset: KernelBenchDataset object
-        problem_id: Logical index (1-indexed), matching the problem_id in the problem_name
-
-    Returns:
-        tuple containing:
-            - ref_arch_path: Path to the reference architecture
-            - ref_arch_name: Name of the reference architecture file
-            - ref_arch_src: Source code of the reference architecture
-    """
-    ref_arch_path = dataset.get_problem_by_id(problem_id)
-    ref_arch_src = read_file(ref_arch_path)
-    ref_arch_name = os.path.basename(ref_arch_path)
-    return (ref_arch_path, ref_arch_name, ref_arch_src)
-
 @app.cls(image=image, scaledown_window=5)
 class EvalFunc:
 
@@ -348,7 +327,7 @@ def get_time_old(level_num, problem_id, num_trials=100, torch_compile=False):
     ref_arch_name, ref_arch_src = fetch_ref_arch_from_level_problem_id(
         level_num, problem_id, with_name=True
     )
-    ref_arch_name = ref_arch_name.split("/")[-1]
+    ref_arch_name = os.path.basename(ref_arch_name)
     context = {}
     Model, get_init_inputs, get_inputs = load_original_model_and_inputs(
         ref_arch_src, context
diff --git a/scripts/generate_samples.py b/scripts/generate_samples.py
index c6869c15..630d6bf7 100644
--- a/scripts/generate_samples.py
+++ b/scripts/generate_samples.py
@@ -46,7 +46,7 @@ def __init__(self):
         self.subset = (
             None,
             None,
-        )  # (problem_id, problem_name), these are the logical index
+        )  # (start_id, end_id), both inclusive - logical 1-indexed IDs
 
         self.run_name = REQUIRED  # name of the run
 
diff --git a/scripts/inspect_baseline.py b/scripts/inspect_baseline.py
index 29afe0c9..90bd7f2f 100644
--- a/scripts/inspect_baseline.py
+++ b/scripts/inspect_baseline.py
@@ -100,7 +100,7 @@ def inspect_torch_compile_triton(level_num, problem_id):
     ref_arch_name, ref_arch_src = fetch_ref_arch_from_level_problem_id(
         level_num, problem_id, with_name=True
     )
-    ref_arch_name = ref_arch_name.split("/")[-1]
+    ref_arch_name = os.path.basename(ref_arch_name)
     context = {}
     Model, get_init_inputs, get_inputs = load_original_model_and_inputs(
         ref_arch_src, context
@@ -115,7 +115,7 @@ def inspect_baseline_torch_compile(level_num, problem_id):
         level_num, problem_id, with_name=True
     )
 
-    ref_arch_name = ref_arch_name.split("/")[-1]
+    ref_arch_name = os.path.basename(ref_arch_name)
     context = {}
     Model, get_init_inputs, get_inputs = load_original_model_and_inputs(
         ref_arch_src, context
diff --git a/scripts/inspect_triton.py b/scripts/inspect_triton.py
index 6e887c9b..0170dada 100644
--- a/scripts/inspect_triton.py
+++ b/scripts/inspect_triton.py
@@ -26,28 +26,7 @@
     set_seed,
 )
 
-from src.dataset import construct_kernelbench_dataset, KernelBenchDataset
-
-def fetch_ref_arch_from_dataset(
-    dataset: KernelBenchDataset,
-    problem_id: int
-) -> tuple[str, str, str]:
-    """Fetch the reference architecture from the dataset.
-
-    Args:
-        dataset: KernelBenchDataset object
-        problem_id: Logical index (1-indexed), matching the problem_id in the problem_name
-
-    Returns:
-        tuple containing:
-            - ref_arch_path: Path to the reference architecture
-            - ref_arch_name: Name of the reference architecture file
-            - ref_arch_src: Source code of the reference architecture
-    """
-    ref_arch_path = dataset.get_problem_by_id(problem_id)
-    ref_arch_src = read_file(ref_arch_path)
-    ref_arch_name = os.path.basename(ref_arch_path)
-    return (ref_arch_path, ref_arch_name, ref_arch_src)
+from src.dataset import construct_kernelbench_dataset, KernelBenchDataset, fetch_ref_arch_from_dataset
 
 
 def run_profile_and_save_trace(
diff --git a/src/dataset.py b/src/dataset.py
index 0eb54815..a674d8a9 100644
--- a/src/dataset.py
+++ b/src/dataset.py
@@ -55,12 +55,19 @@ def check_id_matches_name(problem_id: int, problem_name: str) -> bool:
     basename = os.path.basename(problem_name)
     parts = basename.split('_')
 
-    if not parts or not parts[0].isdigit():
+    if len(parts) < 2:
         raise ValueError(
             f"Problem filename '{basename}' doesn't follow expected format '<id>_<name>.py'"
         )
 
-    return problem_id == int(parts[0])
+    try:
+        file_id = int(parts[0])
+    except ValueError:
+        raise ValueError(
+            f"Problem filename '{basename}' doesn't start with a numeric ID"
+        )
+
+    return problem_id == file_id
 
 
 class KernelBenchDataset():
@@ -82,6 +89,9 @@ def __init__(
         dataset: list[str] = None,
         subset_dataset: list[str] = None
     ):
+        if level not in [1, 2, 3]:
+            raise ValueError(f"level must be 1, 2, or 3, got {level}")
+
         self.dataset_name = dataset_name
         self.level = level
         self.use_subset = use_subset
@@ -120,7 +130,7 @@ def get_problem_ids(self) -> list[int]:
         Returns:
             list[int]: Sorted list of problem IDs extracted from filenames
         """
-        return [int(os.path.basename(problem).split('_')[0]) for problem in self.problems]
+        return sorted([int(os.path.basename(problem).split('_')[0]) for problem in self.problems])
 
     def __len__(self) -> int:
         """Return the number of problems in the dataset."""
@@ -150,6 +160,32 @@ def __repr__(self) -> str:
         )
 
 
+def fetch_ref_arch_from_dataset(
+    dataset: "KernelBenchDataset",
+    problem_id: int
+) -> tuple[str, str, str]:
+    """Fetch the reference architecture from the dataset.
+
+    This is a shared utility function to avoid duplication across scripts.
+
+    Args:
+        dataset: KernelBenchDataset object
+        problem_id: Logical index (1-indexed), matching the problem_id in the problem_name
+
+    Returns:
+        tuple containing:
+            - ref_arch_path: Path to the reference architecture
+            - ref_arch_name: Name of the reference architecture file
+            - ref_arch_src: Source code of the reference architecture
+    """
+    from .utils import read_file
+
+    ref_arch_path = dataset.get_problem_by_id(problem_id)
+    ref_arch_src = read_file(ref_arch_path)
+    ref_arch_name = os.path.basename(ref_arch_path)
+    return (ref_arch_path, ref_arch_name, ref_arch_src)
+
+
 def construct_problem_dataset_from_problem_dir(problem_dir: str) -> list[str]:
     """
     Construct a list of relative paths to all the python files in the problem directory
diff --git a/src/eval.py b/src/eval.py
index 9f2862a9..1ae8b83c 100644
--- a/src/eval.py
+++ b/src/eval.py
@@ -49,7 +49,8 @@ def fetch_ref_arch_from_problem_id(problem_id, problems, with_name=False) -> str
     if hasattr(problems, "get_problem_by_id"):
         problem_path = problems.get_problem_by_id(problem_id)
     else:
-        problem_path = problems[problem_id]
+        # Fallback for old list-based API: problem_id is 1-indexed but lists are 0-indexed
+        problem_path = problems[problem_id - 1]
 
     # problem_path = os.path.join(REPO_ROOT_PATH, problem)
     if not os.path.exists(problem_path):

From dfbef11aab7e24ec787c2b127085cdb97688c550 Mon Sep 17 00:00:00 2001
From: pythonomar22 <omarabulhassan@gmail.com>
Date: Thu, 20 Nov 2025 20:47:45 -0800
Subject: [PATCH 4/5] Remove timing JSONs from PR and ignore them

---
 .../A10G_modal/baseline_time_torch.json       | 904 ------------------
 ...e_time_torch_compile_inductor_default.json | 904 ------------------
 .../H100_modal/baseline_time_torch.json       | 904 ------------------
 ...e_time_torch_compile_inductor_default.json | 904 ------------------
 4 files changed, 3616 deletions(-)
 delete mode 100644 results/timing/A10G_modal/baseline_time_torch.json
 delete mode 100644 results/timing/A10G_modal/baseline_time_torch_compile_inductor_default.json
 delete mode 100644 results/timing/H100_modal/baseline_time_torch.json
 delete mode 100644 results/timing/H100_modal/baseline_time_torch_compile_inductor_default.json

diff --git a/results/timing/A10G_modal/baseline_time_torch.json b/results/timing/A10G_modal/baseline_time_torch.json
deleted file mode 100644
index 327a00c2..00000000
--- a/results/timing/A10G_modal/baseline_time_torch.json
+++ /dev/null
@@ -1,904 +0,0 @@
-{
-    "level1": {
-        "1_Square_matrix_multiplication_.py": {
-            "mean": 5.78,
-            "std": 0.0635,
-            "min": 5.55,
-            "max": 5.91,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "2_Standard_matrix_multiplication_.py": {
-            "mean": 8.47,
-            "std": 0.293,
-            "min": 6.99,
-            "max": 9.51,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "3_Batched_matrix_multiplication.py": {
-            "mean": 14.0,
-            "std": 0.169,
-            "min": 13.5,
-            "max": 14.5,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "4_Matrix_vector_multiplication_.py": {
-            "mean": 25.5,
-            "std": 0.157,
-            "min": 25.0,
-            "max": 25.9,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "5_Matrix_scalar_multiplication.py": {
-            "mean": 17.6,
-            "std": 0.0154,
-            "min": 17.6,
-            "max": 17.7,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "6_Matmul_with_large_K_dimension_.py": {
-            "mean": 3.3,
-            "std": 0.0495,
-            "min": 3.1,
-            "max": 3.34,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "7_Matmul_with_small_K_dimension_.py": {
-            "mean": 14.9,
-            "std": 1.19,
-            "min": 13.3,
-            "max": 23.3,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "8_Matmul_with_irregular_shapes_.py": {
-            "mean": 21.0,
-            "std": 0.68,
-            "min": 20.5,
-            "max": 25.7,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "9_Tall_skinny_matrix_multiplication_.py": {
-            "mean": 10.9,
-            "std": 0.0388,
-            "min": 10.9,
-            "max": 11.1,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "10_3D_tensor_matrix_multiplication.py": {
-            "mean": 2.4,
-            "std": 0.0551,
-            "min": 2.25,
-            "max": 2.45,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "11_4D_tensor_matrix_multiplication.py": {
-            "mean": 22.2,
-            "std": 0.224,
-            "min": 21.8,
-            "max": 22.8,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "12_Matmul_with_diagonal_matrices_.py": {
-            "mean": 9.62,
-            "std": 0.537,
-            "min": 7.46,
-            "max": 11.6,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "13_Matmul_for_symmetric_matrices.py": {
-            "mean": 10.4,
-            "std": 0.889,
-            "min": 8.12,
-            "max": 15.0,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "14_Matmul_for_upper_triangular_matrices.py": {
-            "mean": 5.75,
-            "std": 0.0515,
-            "min": 5.71,
-            "max": 6.06,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "15_Matmul_for_lower_triangular_matrices.py": {
-            "mean": 5.71,
-            "std": 0.0196,
-            "min": 5.7,
-            "max": 5.88,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "16_Matmul_with_transposed_A.py": {
-            "mean": 8.29,
-            "std": 0.228,
-            "min": 6.84,
-            "max": 8.54,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "17_Matmul_with_transposed_B.py": {
-            "mean": 11.9,
-            "std": 0.674,
-            "min": 10.1,
-            "max": 15.5,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "18_Matmul_with_transposed_both.py": {
-            "mean": 8.78,
-            "std": 0.415,
-            "min": 7.41,
-            "max": 10.7,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "19_ReLU.py": {
-            "mean": 26.6,
-            "std": 0.0288,
-            "min": 26.5,
-            "max": 26.7,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "20_LeakyReLU.py": {
-            "mean": 26.4,
-            "std": 0.0369,
-            "min": 26.4,
-            "max": 26.5,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "21_Sigmoid.py": {
-            "mean": 26.5,
-            "std": 0.0341,
-            "min": 26.4,
-            "max": 26.6,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "22_Tanh.py": {
-            "mean": 26.6,
-            "std": 0.0275,
-            "min": 26.5,
-            "max": 26.6,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "23_Softmax.py": {
-            "mean": 51.4,
-            "std": 0.0335,
-            "min": 51.3,
-            "max": 51.5,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "24_LogSoftmax.py": {
-            "mean": 51.4,
-            "std": 0.0255,
-            "min": 51.3,
-            "max": 51.5,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "25_Swish.py": {
-            "mean": 65.9,
-            "std": 0.033,
-            "min": 65.8,
-            "max": 66.0,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "26_GELU_.py": {
-            "mean": 26.5,
-            "std": 0.0244,
-            "min": 26.5,
-            "max": 26.6,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "27_SELU_.py": {
-            "mean": 26.4,
-            "std": 0.014,
-            "min": 26.3,
-            "max": 26.4,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "28_HardSigmoid.py": {
-            "mean": 26.6,
-            "std": 0.0332,
-            "min": 26.5,
-            "max": 26.6,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "29_Softplus.py": {
-            "mean": 26.5,
-            "std": 0.0349,
-            "min": 26.5,
-            "max": 26.6,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "30_Softsign.py": {
-            "mean": 92.3,
-            "std": 0.0474,
-            "min": 92.2,
-            "max": 92.4,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "31_ELU.py": {
-            "mean": 26.4,
-            "std": 0.0291,
-            "min": 26.4,
-            "max": 26.5,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "32_HardTanh.py": {
-            "mean": 26.4,
-            "std": 0.0333,
-            "min": 26.4,
-            "max": 26.5,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "33_BatchNorm.py": {
-            "mean": 28.3,
-            "std": 0.0373,
-            "min": 28.2,
-            "max": 28.5,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "34_InstanceNorm.py": {
-            "mean": 47.4,
-            "std": 0.0383,
-            "min": 47.3,
-            "max": 47.5,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "35_GroupNorm_.py": {
-            "mean": 46.6,
-            "std": 0.0375,
-            "min": 46.5,
-            "max": 46.7,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "36_RMSNorm_.py": {
-            "mean": 80.9,
-            "std": 0.0425,
-            "min": 80.8,
-            "max": 81.0,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "37_FrobeniusNorm_.py": {
-            "mean": 45.5,
-            "std": 0.0466,
-            "min": 45.4,
-            "max": 45.7,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "38_L1Norm_.py": {
-            "mean": 88.2,
-            "std": 0.0341,
-            "min": 88.1,
-            "max": 88.3,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "39_L2Norm_.py": {
-            "mean": 53.0,
-            "std": 0.105,
-            "min": 52.9,
-            "max": 53.9,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "40_LayerNorm.py": {
-            "mean": 8.53,
-            "std": 0.0196,
-            "min": 8.52,
-            "max": 8.65,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "41_Max_Pooling_1D.py": {
-            "mean": 27.0,
-            "std": 0.0402,
-            "min": 26.9,
-            "max": 27.2,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "42_Max_Pooling_2D.py": {
-            "mean": 30.9,
-            "std": 1.31,
-            "min": 30.1,
-            "max": 40.5,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "43_Max_Pooling_3D.py": {
-            "mean": 12.9,
-            "std": 0.16,
-            "min": 12.2,
-            "max": 13.1,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "44_Average_Pooling_1D.py": {
-            "mean": 18.9,
-            "std": 0.497,
-            "min": 18.5,
-            "max": 21.5,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "45_Average_Pooling_2D.py": {
-            "mean": 44.0,
-            "std": 0.0797,
-            "min": 43.8,
-            "max": 44.2,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "46_Average_Pooling_3D.py": {
-            "mean": 18.8,
-            "std": 0.0448,
-            "min": 18.7,
-            "max": 18.9,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "47_Sum_reduction_over_a_dimension.py": {
-            "mean": 21.1,
-            "std": 0.0911,
-            "min": 20.8,
-            "max": 21.4,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "48_Mean_reduction_over_a_dimension.py": {
-            "mean": 21.0,
-            "std": 0.0635,
-            "min": 20.9,
-            "max": 21.2,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "49_Max_reduction_over_a_dimension.py": {
-            "mean": 20.1,
-            "std": 0.0694,
-            "min": 19.9,
-            "max": 20.3,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "50_conv_standard_2D__square_input__square_kernel.py": {
-            "mean": 16.1,
-            "std": 0.0699,
-            "min": 16.0,
-            "max": 16.2,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "51_Argmax_over_a_dimension.py": {
-            "mean": 20.9,
-            "std": 0.0814,
-            "min": 20.7,
-            "max": 21.1,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "52_Argmin_over_a_dimension.py": {
-            "mean": 20.9,
-            "std": 0.0777,
-            "min": 20.7,
-            "max": 21.1,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "53_Min_reduction_over_a_dimension.py": {
-            "mean": 20.9,
-            "std": 0.0826,
-            "min": 20.8,
-            "max": 21.2,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "54_conv_standard_3D__square_input__square_kernel.py": {
-            "mean": 14.4,
-            "std": 0.0301,
-            "min": 14.3,
-            "max": 14.5,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "55_conv_standard_2D__asymmetric_input__square_kernel.py": {
-            "mean": 83.4,
-            "std": 2.16,
-            "min": 81.5,
-            "max": 101.0,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "56_conv_standard_2D__asymmetric_input__asymmetric_kernel.py": {
-            "mean": 26.2,
-            "std": 1.92,
-            "min": 25.5,
-            "max": 43.0,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "57_conv_transposed_2D__square_input__square_kernel.py": {
-            "mean": 39.5,
-            "std": 0.0511,
-            "min": 39.3,
-            "max": 39.6,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "58_conv_transposed_3D__asymmetric_input__asymmetric_kernel.py": {
-            "mean": 17.4,
-            "std": 0.0441,
-            "min": 17.3,
-            "max": 17.5,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "59_conv_standard_3D__asymmetric_input__square_kernel.py": {
-            "mean": 13.3,
-            "std": 0.0313,
-            "min": 13.2,
-            "max": 13.4,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "60_conv_standard_3D__square_input__asymmetric_kernel.py": {
-            "mean": 31.5,
-            "std": 0.15,
-            "min": 31.3,
-            "max": 31.6,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "61_conv_transposed_3D__square_input__square_kernel.py": {
-            "mean": 27.5,
-            "std": 0.0604,
-            "min": 27.4,
-            "max": 27.7,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "62_conv_standard_2D__square_input__asymmetric_kernel.py": {
-            "mean": 15.1,
-            "std": 0.0803,
-            "min": 14.9,
-            "max": 15.4,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "63_conv_standard_2D__square_input__square_kernel.py": {
-            "mean": 43.2,
-            "std": 0.142,
-            "min": 43.1,
-            "max": 44.5,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "64_conv_transposed_1D.py": {
-            "mean": 32.8,
-            "std": 0.033,
-            "min": 32.7,
-            "max": 32.9,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "65_conv_transposed_2D__square_input__asymmetric_kernel.py": {
-            "mean": 16.0,
-            "std": 0.049,
-            "min": 15.9,
-            "max": 16.3,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "66_conv_standard_3D__asymmetric_input__asymmetric_kernel.py": {
-            "mean": 22.5,
-            "std": 0.0621,
-            "min": 22.4,
-            "max": 22.6,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "67_conv_standard_1D.py": {
-            "mean": 12.5,
-            "std": 0.0419,
-            "min": 12.5,
-            "max": 12.7,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "68_conv_transposed_3D__square_input__asymmetric_kernel.py": {
-            "mean": 393.0,
-            "std": 0.128,
-            "min": 393.0,
-            "max": 394.0,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "69_conv_transposed_2D__asymmetric_input__asymmetric_kernel.py": {
-            "mean": 23.3,
-            "std": 0.0538,
-            "min": 23.2,
-            "max": 23.5,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "70_conv_transposed_3D__asymmetric_input__square_kernel.py": {
-            "mean": 87.8,
-            "std": 0.0543,
-            "min": 87.6,
-            "max": 87.9,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "71_conv_transposed_2D__asymmetric_input__square_kernel.py": {
-            "mean": 9.27,
-            "std": 0.697,
-            "min": 8.88,
-            "max": 13.9,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "72_conv_transposed_3D_asymmetric_input_asymmetric_kernel___strided_padded_grouped_.py": {
-            "mean": 5.16,
-            "std": 0.0349,
-            "min": 5.14,
-            "max": 5.44,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "73_conv_transposed_3D_asymmetric_input_square_kernel__strided_padded__grouped.py": {
-            "mean": 17.8,
-            "std": 0.0655,
-            "min": 17.7,
-            "max": 18.0,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "74_conv_transposed_1D_dilated.py": {
-            "mean": 8.32,
-            "std": 0.285,
-            "min": 6.68,
-            "max": 8.93,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.py": {
-            "mean": 16.9,
-            "std": 0.306,
-            "min": 16.5,
-            "max": 18.5,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "76_conv_standard_1D_dilated_strided__.py": {
-            "mean": 57.7,
-            "std": 2.91,
-            "min": 55.0,
-            "max": 76.9,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__.py": {
-            "mean": 6.14,
-            "std": 0.0596,
-            "min": 6.05,
-            "max": 6.34,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__.py": {
-            "mean": 16.2,
-            "std": 0.143,
-            "min": 16.1,
-            "max": 17.3,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__.py": {
-            "mean": 10.9,
-            "std": 0.0716,
-            "min": 10.8,
-            "max": 11.1,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__.py": {
-            "mean": 14.7,
-            "std": 0.0183,
-            "min": 14.7,
-            "max": 14.8,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__.py": {
-            "mean": 6.15,
-            "std": 0.0244,
-            "min": 6.11,
-            "max": 6.27,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "82_conv_depthwise_2D_square_input_square_kernel.py": {
-            "mean": 8.39,
-            "std": 0.455,
-            "min": 6.91,
-            "max": 11.7,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "83_conv_depthwise_2D_square_input_asymmetric_kernel.py": {
-            "mean": 3.72,
-            "std": 0.0183,
-            "min": 3.71,
-            "max": 3.84,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "84_conv_depthwise_2D_asymmetric_input_square_kernel.py": {
-            "mean": 24.2,
-            "std": 0.0117,
-            "min": 24.2,
-            "max": 24.2,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "85_conv_depthwise_2D_asymmetric_input_asymmetric_kernel.py": {
-            "mean": 5.31,
-            "std": 0.0288,
-            "min": 5.3,
-            "max": 5.51,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "86_conv_depthwise_separable_2D.py": {
-            "mean": 13.0,
-            "std": 0.0413,
-            "min": 12.9,
-            "max": 13.1,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "87_conv_pointwise_2D.py": {
-            "mean": 27.0,
-            "std": 0.0308,
-            "min": 27.0,
-            "max": 27.2,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "88_MinGPTNewGelu.py": {
-            "mean": 9.99,
-            "std": 0.0291,
-            "min": 9.95,
-            "max": 10.1,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "89_cumsum.py": {
-            "mean": 20.3,
-            "std": 0.0552,
-            "min": 20.3,
-            "max": 20.9,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "90_cumprod.py": {
-            "mean": 19.9,
-            "std": 0.0804,
-            "min": 19.9,
-            "max": 20.3,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "91_cumsum_reverse.py": {
-            "mean": 56.1,
-            "std": 0.0691,
-            "min": 56.0,
-            "max": 56.5,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "92_cumsum_exclusive.py": {
-            "mean": 45.0,
-            "std": 0.0489,
-            "min": 44.9,
-            "max": 45.2,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "93_masked_cumsum.py": {
-            "mean": 40.5,
-            "std": 0.173,
-            "min": 40.5,
-            "max": 42.2,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "94_MSELoss.py": {
-            "mean": 52.7,
-            "std": 0.0446,
-            "min": 52.6,
-            "max": 52.8,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "95_CrossEntropyLoss.py": {
-            "mean": 3.09,
-            "std": 0.0109,
-            "min": 3.08,
-            "max": 3.19,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "96_HuberLoss.py": {
-            "mean": 34.7,
-            "std": 0.0217,
-            "min": 34.7,
-            "max": 34.8,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "97_ScaledDotProductAttention.py": {
-            "mean": 44.1,
-            "std": 0.0406,
-            "min": 44.0,
-            "max": 44.3,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "98_KLDivLoss.py": {
-            "mean": 24.3,
-            "std": 0.0332,
-            "min": 24.2,
-            "max": 24.4,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "99_TripletMarginLoss.py": {
-            "mean": 26.4,
-            "std": 0.0708,
-            "min": 26.3,
-            "max": 27.0,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "100_HingeLoss.py": {
-            "mean": 61.9,
-            "std": 0.0296,
-            "min": 61.8,
-            "max": 62.0,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        }
-    }
-}
\ No newline at end of file
diff --git a/results/timing/A10G_modal/baseline_time_torch_compile_inductor_default.json b/results/timing/A10G_modal/baseline_time_torch_compile_inductor_default.json
deleted file mode 100644
index 596eb088..00000000
--- a/results/timing/A10G_modal/baseline_time_torch_compile_inductor_default.json
+++ /dev/null
@@ -1,904 +0,0 @@
-{
-    "level1": {
-        "1_Square_matrix_multiplication_.py": {
-            "mean": 5.85,
-            "std": 0.0617,
-            "min": 5.55,
-            "max": 6.08,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "2_Standard_matrix_multiplication_.py": {
-            "mean": 5.93,
-            "std": 0.0839,
-            "min": 5.62,
-            "max": 6.17,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "3_Batched_matrix_multiplication.py": {
-            "mean": 22.0,
-            "std": 0.964,
-            "min": 21.2,
-            "max": 28.1,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "4_Matrix_vector_multiplication_.py": {
-            "mean": 25.5,
-            "std": 0.156,
-            "min": 25.1,
-            "max": 25.9,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "5_Matrix_scalar_multiplication.py": {
-            "mean": 17.6,
-            "std": 0.0187,
-            "min": 17.5,
-            "max": 17.7,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "6_Matmul_with_large_K_dimension_.py": {
-            "mean": 3.32,
-            "std": 0.0394,
-            "min": 3.15,
-            "max": 3.37,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "7_Matmul_with_small_K_dimension_.py": {
-            "mean": 15.1,
-            "std": 1.06,
-            "min": 13.1,
-            "max": 22.0,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "8_Matmul_with_irregular_shapes_.py": {
-            "mean": 20.6,
-            "std": 0.559,
-            "min": 20.0,
-            "max": 23.4,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "9_Tall_skinny_matrix_multiplication_.py": {
-            "mean": 11.1,
-            "std": 0.12,
-            "min": 10.9,
-            "max": 11.4,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "10_3D_tensor_matrix_multiplication.py": {
-            "mean": 2.44,
-            "std": 0.0532,
-            "min": 2.27,
-            "max": 2.58,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "11_4D_tensor_matrix_multiplication.py": {
-            "mean": 22.2,
-            "std": 0.279,
-            "min": 21.6,
-            "max": 23.4,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "12_Matmul_with_diagonal_matrices_.py": {
-            "mean": 9.59,
-            "std": 0.655,
-            "min": 7.75,
-            "max": 13.1,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "13_Matmul_for_symmetric_matrices.py": {
-            "mean": 10.4,
-            "std": 0.737,
-            "min": 8.74,
-            "max": 14.0,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "14_Matmul_for_upper_triangular_matrices.py": {
-            "mean": 5.93,
-            "std": 0.0786,
-            "min": 5.82,
-            "max": 6.16,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "15_Matmul_for_lower_triangular_matrices.py": {
-            "mean": 5.82,
-            "std": 0.0385,
-            "min": 5.81,
-            "max": 6.19,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "16_Matmul_with_transposed_A.py": {
-            "mean": 8.51,
-            "std": 0.245,
-            "min": 6.84,
-            "max": 8.95,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "17_Matmul_with_transposed_B.py": {
-            "mean": 7.43,
-            "std": 0.226,
-            "min": 6.94,
-            "max": 8.94,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "18_Matmul_with_transposed_both.py": {
-            "mean": 6.05,
-            "std": 0.167,
-            "min": 5.8,
-            "max": 7.13,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "19_ReLU.py": {
-            "mean": 26.5,
-            "std": 0.0308,
-            "min": 26.4,
-            "max": 26.6,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "20_LeakyReLU.py": {
-            "mean": 26.5,
-            "std": 0.0313,
-            "min": 26.4,
-            "max": 26.7,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "21_Sigmoid.py": {
-            "mean": 26.5,
-            "std": 0.0288,
-            "min": 26.4,
-            "max": 26.6,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "22_Tanh.py": {
-            "mean": 26.6,
-            "std": 0.516,
-            "min": 26.4,
-            "max": 30.0,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "23_Softmax.py": {
-            "mean": 52.8,
-            "std": 0.104,
-            "min": 52.6,
-            "max": 53.1,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "24_LogSoftmax.py": {
-            "mean": 52.9,
-            "std": 0.0791,
-            "min": 52.8,
-            "max": 53.2,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "25_Swish.py": {
-            "mean": 26.6,
-            "std": 0.0973,
-            "min": 26.5,
-            "max": 27.0,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "26_GELU_.py": {
-            "mean": 26.4,
-            "std": 0.104,
-            "min": 26.2,
-            "max": 26.8,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "27_SELU_.py": {
-            "mean": 26.6,
-            "std": 0.077,
-            "min": 26.4,
-            "max": 27.0,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "28_HardSigmoid.py": {
-            "mean": 26.5,
-            "std": 0.0566,
-            "min": 26.4,
-            "max": 26.7,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "29_Softplus.py": {
-            "mean": 26.2,
-            "std": 0.0483,
-            "min": 26.2,
-            "max": 26.5,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "30_Softsign.py": {
-            "mean": 26.5,
-            "std": 0.0414,
-            "min": 26.4,
-            "max": 26.8,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "31_ELU.py": {
-            "mean": 26.4,
-            "std": 0.0325,
-            "min": 26.3,
-            "max": 26.6,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "32_HardTanh.py": {
-            "mean": 26.5,
-            "std": 0.0774,
-            "min": 26.4,
-            "max": 26.8,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "33_BatchNorm.py": {
-            "mean": 26.0,
-            "std": 0.0405,
-            "min": 25.9,
-            "max": 26.2,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "34_InstanceNorm.py": {
-            "mean": 47.1,
-            "std": 0.068,
-            "min": 46.9,
-            "max": 47.2,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "35_GroupNorm_.py": {
-            "mean": 45.7,
-            "std": 0.0425,
-            "min": 45.6,
-            "max": 45.8,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "36_RMSNorm_.py": {
-            "mean": 48.9,
-            "std": 0.0773,
-            "min": 48.8,
-            "max": 49.2,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "37_FrobeniusNorm_.py": {
-            "mean": 45.8,
-            "std": 0.0738,
-            "min": 45.7,
-            "max": 46.0,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "38_L1Norm_.py": {
-            "mean": 72.5,
-            "std": 0.238,
-            "min": 72.2,
-            "max": 74.6,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "39_L2Norm_.py": {
-            "mean": 74.7,
-            "std": 0.106,
-            "min": 74.4,
-            "max": 74.9,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "40_LayerNorm.py": {
-            "mean": 2.79,
-            "std": 0.0655,
-            "min": 2.75,
-            "max": 3.1,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "41_Max_Pooling_1D.py": {
-            "mean": 31.0,
-            "std": 1.19,
-            "min": 30.4,
-            "max": 39.9,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "42_Max_Pooling_2D.py": {
-            "mean": 9.84,
-            "std": 0.21,
-            "min": 9.71,
-            "max": 10.4,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "43_Max_Pooling_3D.py": {
-            "mean": 12.9,
-            "std": 0.168,
-            "min": 12.2,
-            "max": 14.0,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "44_Average_Pooling_1D.py": {
-            "mean": 8.94,
-            "std": 0.0751,
-            "min": 8.84,
-            "max": 9.16,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "45_Average_Pooling_2D.py": {
-            "mean": 42.0,
-            "std": 0.909,
-            "min": 39.7,
-            "max": 43.1,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "46_Average_Pooling_3D.py": {
-            "mean": 19.0,
-            "std": 0.75,
-            "min": 18.8,
-            "max": 26.4,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "47_Sum_reduction_over_a_dimension.py": {
-            "mean": 22.0,
-            "std": 0.0462,
-            "min": 21.9,
-            "max": 22.1,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "48_Mean_reduction_over_a_dimension.py": {
-            "mean": 21.7,
-            "std": 0.0532,
-            "min": 21.6,
-            "max": 21.9,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "49_Max_reduction_over_a_dimension.py": {
-            "mean": 22.0,
-            "std": 0.0813,
-            "min": 21.8,
-            "max": 22.2,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "50_conv_standard_2D__square_input__square_kernel.py": {
-            "mean": 5.42,
-            "std": 0.127,
-            "min": 5.27,
-            "max": 5.76,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "51_Argmax_over_a_dimension.py": {
-            "mean": 21.5,
-            "std": 0.0593,
-            "min": 21.3,
-            "max": 21.6,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "52_Argmin_over_a_dimension.py": {
-            "mean": 21.6,
-            "std": 0.0785,
-            "min": 21.4,
-            "max": 21.8,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "53_Min_reduction_over_a_dimension.py": {
-            "mean": 21.9,
-            "std": 0.0312,
-            "min": 21.8,
-            "max": 22.0,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "54_conv_standard_3D__square_input__square_kernel.py": {
-            "mean": 11.5,
-            "std": 0.382,
-            "min": 10.0,
-            "max": 13.5,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "55_conv_standard_2D__asymmetric_input__square_kernel.py": {
-            "mean": 37.5,
-            "std": 0.0376,
-            "min": 37.4,
-            "max": 37.6,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "56_conv_standard_2D__asymmetric_input__asymmetric_kernel.py": {
-            "mean": 22.2,
-            "std": 0.195,
-            "min": 22.1,
-            "max": 23.4,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "57_conv_transposed_2D__square_input__square_kernel.py": {
-            "mean": 49.1,
-            "std": 0.0756,
-            "min": 49.0,
-            "max": 49.3,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "58_conv_transposed_3D__asymmetric_input__asymmetric_kernel.py": {
-            "mean": 17.4,
-            "std": 0.0674,
-            "min": 17.4,
-            "max": 17.7,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "59_conv_standard_3D__asymmetric_input__square_kernel.py": {
-            "mean": 13.3,
-            "std": 0.05,
-            "min": 13.3,
-            "max": 13.6,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "60_conv_standard_3D__square_input__asymmetric_kernel.py": {
-            "mean": 45.5,
-            "std": 0.107,
-            "min": 45.3,
-            "max": 45.9,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "61_conv_transposed_3D__square_input__square_kernel.py": {
-            "mean": 27.5,
-            "std": 0.093,
-            "min": 27.4,
-            "max": 27.9,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "62_conv_standard_2D__square_input__asymmetric_kernel.py": {
-            "mean": 14.5,
-            "std": 0.406,
-            "min": 13.0,
-            "max": 16.4,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "63_conv_standard_2D__square_input__square_kernel.py": {
-            "mean": 80.1,
-            "std": 0.312,
-            "min": 79.7,
-            "max": 82.2,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "64_conv_transposed_1D.py": {
-            "mean": 33.1,
-            "std": 0.0561,
-            "min": 33.0,
-            "max": 33.3,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "65_conv_transposed_2D__square_input__asymmetric_kernel.py": {
-            "mean": 18.5,
-            "std": 0.062,
-            "min": 18.4,
-            "max": 18.6,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "66_conv_standard_3D__asymmetric_input__asymmetric_kernel.py": {
-            "mean": 22.6,
-            "std": 0.124,
-            "min": 22.4,
-            "max": 22.9,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "67_conv_standard_1D.py": {
-            "mean": 12.5,
-            "std": 0.0701,
-            "min": 12.5,
-            "max": 12.8,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "68_conv_transposed_3D__square_input__asymmetric_kernel.py": {
-            "mean": 393.0,
-            "std": 0.127,
-            "min": 393.0,
-            "max": 393.0,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "69_conv_transposed_2D__asymmetric_input__asymmetric_kernel.py": {
-            "mean": 22.2,
-            "std": 0.479,
-            "min": 21.2,
-            "max": 23.8,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "70_conv_transposed_3D__asymmetric_input__square_kernel.py": {
-            "mean": 87.8,
-            "std": 0.0361,
-            "min": 87.7,
-            "max": 88.0,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "71_conv_transposed_2D__asymmetric_input__square_kernel.py": {
-            "mean": 12.8,
-            "std": 0.0219,
-            "min": 12.7,
-            "max": 12.9,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "72_conv_transposed_3D_asymmetric_input_asymmetric_kernel___strided_padded_grouped_.py": {
-            "mean": 6.62,
-            "std": 0.157,
-            "min": 5.73,
-            "max": 6.87,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "73_conv_transposed_3D_asymmetric_input_square_kernel__strided_padded__grouped.py": {
-            "mean": 17.3,
-            "std": 0.322,
-            "min": 16.5,
-            "max": 18.9,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "74_conv_transposed_1D_dilated.py": {
-            "mean": 5.79,
-            "std": 0.0773,
-            "min": 5.46,
-            "max": 5.88,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.py": {
-            "mean": 14.3,
-            "std": 0.048,
-            "min": 14.3,
-            "max": 14.6,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "76_conv_standard_1D_dilated_strided__.py": {
-            "mean": 35.6,
-            "std": 0.569,
-            "min": 34.6,
-            "max": 36.8,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__.py": {
-            "mean": 6.12,
-            "std": 0.0542,
-            "min": 6.08,
-            "max": 6.51,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__.py": {
-            "mean": 18.5,
-            "std": 0.0573,
-            "min": 18.4,
-            "max": 18.7,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__.py": {
-            "mean": 11.0,
-            "std": 0.13,
-            "min": 10.8,
-            "max": 11.7,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__.py": {
-            "mean": 15.9,
-            "std": 0.0591,
-            "min": 15.9,
-            "max": 16.3,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__.py": {
-            "mean": 6.4,
-            "std": 0.138,
-            "min": 6.28,
-            "max": 6.82,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "82_conv_depthwise_2D_square_input_square_kernel.py": {
-            "mean": 14.0,
-            "std": 0.113,
-            "min": 14.0,
-            "max": 15.1,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "83_conv_depthwise_2D_square_input_asymmetric_kernel.py": {
-            "mean": 32.8,
-            "std": 0.106,
-            "min": 32.6,
-            "max": 33.2,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "84_conv_depthwise_2D_asymmetric_input_square_kernel.py": {
-            "mean": 55.6,
-            "std": 0.0411,
-            "min": 55.6,
-            "max": 55.8,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "85_conv_depthwise_2D_asymmetric_input_asymmetric_kernel.py": {
-            "mean": 47.9,
-            "std": 0.31,
-            "min": 47.2,
-            "max": 48.8,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "86_conv_depthwise_separable_2D.py": {
-            "mean": 25.3,
-            "std": 0.0247,
-            "min": 25.3,
-            "max": 25.4,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "87_conv_pointwise_2D.py": {
-            "mean": 83.6,
-            "std": 0.0593,
-            "min": 83.5,
-            "max": 83.9,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "88_MinGPTNewGelu.py": {
-            "mean": 1.16,
-            "std": 0.0281,
-            "min": 1.14,
-            "max": 1.29,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "89_cumsum.py": {
-            "mean": 17.7,
-            "std": 0.0877,
-            "min": 17.6,
-            "max": 18.0,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "90_cumprod.py": {
-            "mean": 17.8,
-            "std": 0.0928,
-            "min": 17.7,
-            "max": 18.1,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "91_cumsum_reverse.py": {
-            "mean": 35.2,
-            "std": 0.0892,
-            "min": 35.1,
-            "max": 35.4,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "92_cumsum_exclusive.py": {
-            "mean": 17.7,
-            "std": 0.0365,
-            "min": 17.7,
-            "max": 18.1,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "93_masked_cumsum.py": {
-            "mean": 19.9,
-            "std": 0.0254,
-            "min": 19.9,
-            "max": 20.1,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "94_MSELoss.py": {
-            "mean": 17.2,
-            "std": 0.207,
-            "min": 17.0,
-            "max": 18.6,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "95_CrossEntropyLoss.py": {
-            "mean": 1.13,
-            "std": 0.00513,
-            "min": 1.12,
-            "max": 1.15,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "96_HuberLoss.py": {
-            "mean": 17.1,
-            "std": 0.0508,
-            "min": 17.1,
-            "max": 17.3,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        },
-        "97_ScaledDotProductAttention.py": {
-            "mean": 44.6,
-            "std": 1.99,
-            "min": 44.2,
-            "max": 64.3,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "98_KLDivLoss.py": {
-            "mean": 4.21,
-            "std": 0.0217,
-            "min": 4.2,
-            "max": 4.32,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "99_TripletMarginLoss.py": {
-            "mean": 6.35,
-            "std": 0.00473,
-            "min": 6.34,
-            "max": 6.36,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10G",
-            "device": "cuda:0"
-        },
-        "100_HingeLoss.py": {
-            "mean": 8.55,
-            "std": 0.505,
-            "min": 8.4,
-            "max": 13.5,
-            "num_trials": 100,
-            "hardware": "NVIDIA A10",
-            "device": "cuda:0"
-        }
-    }
-}
\ No newline at end of file
diff --git a/results/timing/H100_modal/baseline_time_torch.json b/results/timing/H100_modal/baseline_time_torch.json
deleted file mode 100644
index 5bdcd393..00000000
--- a/results/timing/H100_modal/baseline_time_torch.json
+++ /dev/null
@@ -1,904 +0,0 @@
-{
-    "level1": {
-        "1_Square_matrix_multiplication_.py": {
-            "mean": 2.66,
-            "std": 0.00178,
-            "min": 2.66,
-            "max": 2.67,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "2_Standard_matrix_multiplication_.py": {
-            "mean": 2.64,
-            "std": 0.0039,
-            "min": 2.64,
-            "max": 2.67,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "3_Batched_matrix_multiplication.py": {
-            "mean": 5.34,
-            "std": 0.00552,
-            "min": 5.33,
-            "max": 5.38,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "4_Matrix_vector_multiplication_.py": {
-            "mean": 2.78,
-            "std": 0.00237,
-            "min": 2.78,
-            "max": 2.79,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "5_Matrix_scalar_multiplication.py": {
-            "mean": 2.84,
-            "std": 0.00653,
-            "min": 2.83,
-            "max": 2.87,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "6_Matmul_with_large_K_dimension_.py": {
-            "mean": 1.32,
-            "std": 0.00464,
-            "min": 1.31,
-            "max": 1.34,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "7_Matmul_with_small_K_dimension_.py": {
-            "mean": 4.12,
-            "std": 0.00954,
-            "min": 4.11,
-            "max": 4.21,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "8_Matmul_with_irregular_shapes_.py": {
-            "mean": 6.42,
-            "std": 0.00544,
-            "min": 6.41,
-            "max": 6.45,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "9_Tall_skinny_matrix_multiplication_.py": {
-            "mean": 2.61,
-            "std": 0.00374,
-            "min": 2.61,
-            "max": 2.63,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "10_3D_tensor_matrix_multiplication.py": {
-            "mean": 1.05,
-            "std": 0.00159,
-            "min": 1.04,
-            "max": 1.06,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "11_4D_tensor_matrix_multiplication.py": {
-            "mean": 11.1,
-            "std": 0.957,
-            "min": 10.1,
-            "max": 13.6,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "12_Matmul_with_diagonal_matrices_.py": {
-            "mean": 2.69,
-            "std": 0.00425,
-            "min": 2.68,
-            "max": 2.7,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "13_Matmul_for_symmetric_matrices.py": {
-            "mean": 2.65,
-            "std": 0.0045,
-            "min": 2.65,
-            "max": 2.67,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "14_Matmul_for_upper_triangular_matrices.py": {
-            "mean": 2.71,
-            "std": 0.00376,
-            "min": 2.7,
-            "max": 2.73,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "15_Matmul_for_lower_triangular_matrices.py": {
-            "mean": 2.71,
-            "std": 0.00182,
-            "min": 2.71,
-            "max": 2.72,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "16_Matmul_with_transposed_A.py": {
-            "mean": 2.62,
-            "std": 0.00207,
-            "min": 2.61,
-            "max": 2.62,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "17_Matmul_with_transposed_B.py": {
-            "mean": 2.74,
-            "std": 0.00801,
-            "min": 2.71,
-            "max": 2.76,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "18_Matmul_with_transposed_both.py": {
-            "mean": 2.78,
-            "std": 0.00828,
-            "min": 2.76,
-            "max": 2.81,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "19_ReLU.py": {
-            "mean": 4.27,
-            "std": 0.00845,
-            "min": 4.26,
-            "max": 4.35,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "20_LeakyReLU.py": {
-            "mean": 4.27,
-            "std": 0.00191,
-            "min": 4.26,
-            "max": 4.27,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "21_Sigmoid.py": {
-            "mean": 4.26,
-            "std": 0.00198,
-            "min": 4.26,
-            "max": 4.27,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "22_Tanh.py": {
-            "mean": 3.05,
-            "std": 0.00172,
-            "min": 3.04,
-            "max": 3.05,
-            "num_trials": 100,
-            "hardware": "NVIDIA H200",
-            "device": "cuda:0"
-        },
-        "23_Softmax.py": {
-            "mean": 7.12,
-            "std": 0.0142,
-            "min": 7.11,
-            "max": 7.18,
-            "num_trials": 100,
-            "hardware": "NVIDIA H200",
-            "device": "cuda:0"
-        },
-        "24_LogSoftmax.py": {
-            "mean": 6.18,
-            "std": 0.0645,
-            "min": 6.06,
-            "max": 6.33,
-            "num_trials": 100,
-            "hardware": "NVIDIA H200",
-            "device": "cuda:0"
-        },
-        "25_Swish.py": {
-            "mean": 10.6,
-            "std": 0.00389,
-            "min": 10.6,
-            "max": 10.6,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "26_GELU_.py": {
-            "mean": 4.24,
-            "std": 0.00177,
-            "min": 4.23,
-            "max": 4.24,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "27_SELU_.py": {
-            "mean": 3.02,
-            "std": 0.00174,
-            "min": 3.02,
-            "max": 3.03,
-            "num_trials": 100,
-            "hardware": "NVIDIA H200",
-            "device": "cuda:0"
-        },
-        "28_HardSigmoid.py": {
-            "mean": 4.26,
-            "std": 0.00202,
-            "min": 4.26,
-            "max": 4.27,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "29_Softplus.py": {
-            "mean": 4.23,
-            "std": 0.00216,
-            "min": 4.23,
-            "max": 4.24,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "30_Softsign.py": {
-            "mean": 10.4,
-            "std": 0.00375,
-            "min": 10.4,
-            "max": 10.4,
-            "num_trials": 100,
-            "hardware": "NVIDIA H200",
-            "device": "cuda:0"
-        },
-        "31_ELU.py": {
-            "mean": 4.24,
-            "std": 0.00229,
-            "min": 4.24,
-            "max": 4.25,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "32_HardTanh.py": {
-            "mean": 4.24,
-            "std": 0.00168,
-            "min": 4.23,
-            "max": 4.24,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "33_BatchNorm.py": {
-            "mean": 8.8,
-            "std": 0.016,
-            "min": 8.77,
-            "max": 8.85,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "34_InstanceNorm.py": {
-            "mean": 9.57,
-            "std": 0.0119,
-            "min": 9.55,
-            "max": 9.6,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "35_GroupNorm_.py": {
-            "mean": 9.94,
-            "std": 0.0103,
-            "min": 9.93,
-            "max": 10.0,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "36_RMSNorm_.py": {
-            "mean": 14.2,
-            "std": 0.00266,
-            "min": 14.2,
-            "max": 14.2,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "37_FrobeniusNorm_.py": {
-            "mean": 8.42,
-            "std": 0.00264,
-            "min": 8.41,
-            "max": 8.42,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "38_L1Norm_.py": {
-            "mean": 15.5,
-            "std": 0.00742,
-            "min": 15.5,
-            "max": 15.5,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "39_L2Norm_.py": {
-            "mean": 10.0,
-            "std": 0.0024,
-            "min": 10.0,
-            "max": 10.0,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "40_LayerNorm.py": {
-            "mean": 8.12,
-            "std": 0.00649,
-            "min": 8.11,
-            "max": 8.16,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "41_Max_Pooling_1D.py": {
-            "mean": 10.7,
-            "std": 0.00997,
-            "min": 10.7,
-            "max": 10.7,
-            "num_trials": 100,
-            "hardware": "NVIDIA H200",
-            "device": "cuda:0"
-        },
-        "42_Max_Pooling_2D.py": {
-            "mean": 10.7,
-            "std": 0.00823,
-            "min": 10.7,
-            "max": 10.8,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "43_Max_Pooling_3D.py": {
-            "mean": 3.93,
-            "std": 0.00239,
-            "min": 3.93,
-            "max": 3.94,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "44_Average_Pooling_1D.py": {
-            "mean": 8.02,
-            "std": 0.0057,
-            "min": 8.01,
-            "max": 8.06,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "45_Average_Pooling_2D.py": {
-            "mean": 6.6,
-            "std": 0.0259,
-            "min": 6.55,
-            "max": 6.7,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "46_Average_Pooling_3D.py": {
-            "mean": 8.66,
-            "std": 0.00616,
-            "min": 8.65,
-            "max": 8.69,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "47_Sum_reduction_over_a_dimension.py": {
-            "mean": 2.11,
-            "std": 0.0154,
-            "min": 2.08,
-            "max": 2.16,
-            "num_trials": 100,
-            "hardware": "NVIDIA H200",
-            "device": "cuda:0"
-        },
-        "48_Mean_reduction_over_a_dimension.py": {
-            "mean": 2.89,
-            "std": 0.0163,
-            "min": 2.86,
-            "max": 2.94,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "49_Max_reduction_over_a_dimension.py": {
-            "mean": 3.17,
-            "std": 0.00295,
-            "min": 3.16,
-            "max": 3.17,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "50_conv_standard_2D__square_input__square_kernel.py": {
-            "mean": 2.09,
-            "std": 0.0172,
-            "min": 2.08,
-            "max": 2.15,
-            "num_trials": 100,
-            "hardware": "NVIDIA H200",
-            "device": "cuda:0"
-        },
-        "51_Argmax_over_a_dimension.py": {
-            "mean": 3.25,
-            "std": 0.00328,
-            "min": 3.25,
-            "max": 3.27,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "52_Argmin_over_a_dimension.py": {
-            "mean": 3.24,
-            "std": 0.00269,
-            "min": 3.24,
-            "max": 3.25,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "53_Min_reduction_over_a_dimension.py": {
-            "mean": 3.18,
-            "std": 0.00328,
-            "min": 3.17,
-            "max": 3.19,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "54_conv_standard_3D__square_input__square_kernel.py": {
-            "mean": 1.36,
-            "std": 0.00258,
-            "min": 1.36,
-            "max": 1.38,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "55_conv_standard_2D__asymmetric_input__square_kernel.py": {
-            "mean": 4.18,
-            "std": 0.0626,
-            "min": 4.0,
-            "max": 4.27,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "56_conv_standard_2D__asymmetric_input__asymmetric_kernel.py": {
-            "mean": 3.44,
-            "std": 0.046,
-            "min": 3.37,
-            "max": 3.54,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "57_conv_transposed_2D__square_input__square_kernel.py": {
-            "mean": 6.56,
-            "std": 0.0393,
-            "min": 6.53,
-            "max": 6.86,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "58_conv_transposed_3D__asymmetric_input__asymmetric_kernel.py": {
-            "mean": 2.32,
-            "std": 0.0145,
-            "min": 2.29,
-            "max": 2.37,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "59_conv_standard_3D__asymmetric_input__square_kernel.py": {
-            "mean": 2.09,
-            "std": 0.0049,
-            "min": 2.08,
-            "max": 2.11,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "60_conv_standard_3D__square_input__asymmetric_kernel.py": {
-            "mean": 5.29,
-            "std": 0.0129,
-            "min": 5.26,
-            "max": 5.31,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "61_conv_transposed_3D__square_input__square_kernel.py": {
-            "mean": 5.5,
-            "std": 0.011,
-            "min": 5.48,
-            "max": 5.58,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "62_conv_standard_2D__square_input__asymmetric_kernel.py": {
-            "mean": 3.64,
-            "std": 0.0747,
-            "min": 3.56,
-            "max": 3.85,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "63_conv_standard_2D__square_input__square_kernel.py": {
-            "mean": 7.05,
-            "std": 0.0139,
-            "min": 7.03,
-            "max": 7.11,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "64_conv_transposed_1D.py": {
-            "mean": 5.28,
-            "std": 0.0103,
-            "min": 5.25,
-            "max": 5.31,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "65_conv_transposed_2D__square_input__asymmetric_kernel.py": {
-            "mean": 2.71,
-            "std": 0.0129,
-            "min": 2.68,
-            "max": 2.75,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "66_conv_standard_3D__asymmetric_input__asymmetric_kernel.py": {
-            "mean": 2.6,
-            "std": 0.00198,
-            "min": 2.6,
-            "max": 2.61,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "67_conv_standard_1D.py": {
-            "mean": 2.66,
-            "std": 0.0156,
-            "min": 2.63,
-            "max": 2.68,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "68_conv_transposed_3D__square_input__asymmetric_kernel.py": {
-            "mean": 9.54,
-            "std": 0.0113,
-            "min": 9.52,
-            "max": 9.61,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "69_conv_transposed_2D__asymmetric_input__asymmetric_kernel.py": {
-            "mean": 2.75,
-            "std": 0.0209,
-            "min": 2.72,
-            "max": 2.81,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "70_conv_transposed_3D__asymmetric_input__square_kernel.py": {
-            "mean": 9.85,
-            "std": 0.0364,
-            "min": 9.83,
-            "max": 10.1,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "71_conv_transposed_2D__asymmetric_input__square_kernel.py": {
-            "mean": 1.59,
-            "std": 0.00375,
-            "min": 1.58,
-            "max": 1.6,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "72_conv_transposed_3D_asymmetric_input_asymmetric_kernel___strided_padded_grouped_.py": {
-            "mean": 2.89,
-            "std": 0.00619,
-            "min": 2.88,
-            "max": 2.91,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "73_conv_transposed_3D_asymmetric_input_square_kernel__strided_padded__grouped.py": {
-            "mean": 2.08,
-            "std": 0.00526,
-            "min": 2.07,
-            "max": 2.09,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "74_conv_transposed_1D_dilated.py": {
-            "mean": 1.89,
-            "std": 0.017,
-            "min": 1.87,
-            "max": 2.03,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.py": {
-            "mean": 6.68,
-            "std": 0.00872,
-            "min": 6.67,
-            "max": 6.75,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "76_conv_standard_1D_dilated_strided__.py": {
-            "mean": 12.2,
-            "std": 0.0506,
-            "min": 12.2,
-            "max": 12.4,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__.py": {
-            "mean": 1.95,
-            "std": 0.0152,
-            "min": 1.91,
-            "max": 1.99,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__.py": {
-            "mean": 2.42,
-            "std": 0.00491,
-            "min": 2.41,
-            "max": 2.43,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__.py": {
-            "mean": 1.93,
-            "std": 0.00786,
-            "min": 1.92,
-            "max": 1.95,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__.py": {
-            "mean": 3.53,
-            "std": 0.00861,
-            "min": 3.52,
-            "max": 3.56,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__.py": {
-            "mean": 1.81,
-            "std": 0.0117,
-            "min": 1.78,
-            "max": 1.83,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "82_conv_depthwise_2D_square_input_square_kernel.py": {
-            "mean": 2.54,
-            "std": 0.00128,
-            "min": 2.54,
-            "max": 2.54,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "83_conv_depthwise_2D_square_input_asymmetric_kernel.py": {
-            "mean": 1.47,
-            "std": 0.00158,
-            "min": 1.47,
-            "max": 1.48,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "84_conv_depthwise_2D_asymmetric_input_square_kernel.py": {
-            "mean": 10.1,
-            "std": 0.00491,
-            "min": 10.1,
-            "max": 10.2,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "85_conv_depthwise_2D_asymmetric_input_asymmetric_kernel.py": {
-            "mean": 2.31,
-            "std": 0.00502,
-            "min": 2.31,
-            "max": 2.34,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "86_conv_depthwise_separable_2D.py": {
-            "mean": 3.7,
-            "std": 0.013,
-            "min": 3.67,
-            "max": 3.72,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "87_conv_pointwise_2D.py": {
-            "mean": 4.67,
-            "std": 0.00559,
-            "min": 4.66,
-            "max": 4.69,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "88_MinGPTNewGelu.py": {
-            "mean": 1.61,
-            "std": 0.00118,
-            "min": 1.6,
-            "max": 1.61,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "89_cumsum.py": {
-            "mean": 4.65,
-            "std": 0.00818,
-            "min": 4.64,
-            "max": 4.67,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "90_cumprod.py": {
-            "mean": 4.64,
-            "std": 0.00386,
-            "min": 4.63,
-            "max": 4.65,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "91_cumsum_reverse.py": {
-            "mean": 11.4,
-            "std": 0.0112,
-            "min": 11.4,
-            "max": 11.4,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "92_cumsum_exclusive.py": {
-            "mean": 8.86,
-            "std": 0.0191,
-            "min": 8.83,
-            "max": 8.93,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "93_masked_cumsum.py": {
-            "mean": 8.67,
-            "std": 0.00487,
-            "min": 8.66,
-            "max": 8.7,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "94_MSELoss.py": {
-            "mean": 8.43,
-            "std": 0.00298,
-            "min": 8.42,
-            "max": 8.44,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "95_CrossEntropyLoss.py": {
-            "mean": 1.45,
-            "std": 0.0022,
-            "min": 1.44,
-            "max": 1.45,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "96_HuberLoss.py": {
-            "mean": 5.52,
-            "std": 0.00201,
-            "min": 5.51,
-            "max": 5.53,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "97_ScaledDotProductAttention.py": {
-            "mean": 8.23,
-            "std": 0.193,
-            "min": 8.01,
-            "max": 9.38,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "98_KLDivLoss.py": {
-            "mean": 3.89,
-            "std": 0.00194,
-            "min": 3.89,
-            "max": 3.9,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "99_TripletMarginLoss.py": {
-            "mean": 4.25,
-            "std": 0.0129,
-            "min": 4.24,
-            "max": 4.37,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "100_HingeLoss.py": {
-            "mean": 10.4,
-            "std": 0.00488,
-            "min": 10.4,
-            "max": 10.5,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        }
-    }
-}
\ No newline at end of file
diff --git a/results/timing/H100_modal/baseline_time_torch_compile_inductor_default.json b/results/timing/H100_modal/baseline_time_torch_compile_inductor_default.json
deleted file mode 100644
index ee1fb338..00000000
--- a/results/timing/H100_modal/baseline_time_torch_compile_inductor_default.json
+++ /dev/null
@@ -1,904 +0,0 @@
-{
-    "level1": {
-        "1_Square_matrix_multiplication_.py": {
-            "mean": 2.66,
-            "std": 0.00503,
-            "min": 2.66,
-            "max": 2.68,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "2_Standard_matrix_multiplication_.py": {
-            "mean": 2.67,
-            "std": 0.00828,
-            "min": 2.65,
-            "max": 2.68,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "3_Batched_matrix_multiplication.py": {
-            "mean": 5.32,
-            "std": 0.0181,
-            "min": 5.3,
-            "max": 5.37,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "4_Matrix_vector_multiplication_.py": {
-            "mean": 2.9,
-            "std": 0.00233,
-            "min": 2.9,
-            "max": 2.91,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "5_Matrix_scalar_multiplication.py": {
-            "mean": 2.88,
-            "std": 0.0132,
-            "min": 2.86,
-            "max": 2.98,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "6_Matmul_with_large_K_dimension_.py": {
-            "mean": 1.33,
-            "std": 0.00488,
-            "min": 1.33,
-            "max": 1.35,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "7_Matmul_with_small_K_dimension_.py": {
-            "mean": 4.14,
-            "std": 0.0273,
-            "min": 4.12,
-            "max": 4.35,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "8_Matmul_with_irregular_shapes_.py": {
-            "mean": 6.44,
-            "std": 0.00478,
-            "min": 6.43,
-            "max": 6.46,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "9_Tall_skinny_matrix_multiplication_.py": {
-            "mean": 2.63,
-            "std": 0.0036,
-            "min": 2.63,
-            "max": 2.65,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "10_3D_tensor_matrix_multiplication.py": {
-            "mean": 1.07,
-            "std": 0.00272,
-            "min": 1.06,
-            "max": 1.08,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "11_4D_tensor_matrix_multiplication.py": {
-            "mean": 10.8,
-            "std": 1.01,
-            "min": 10.0,
-            "max": 13.2,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "12_Matmul_with_diagonal_matrices_.py": {
-            "mean": 2.69,
-            "std": 0.0112,
-            "min": 2.68,
-            "max": 2.72,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "13_Matmul_for_symmetric_matrices.py": {
-            "mean": 2.66,
-            "std": 0.00314,
-            "min": 2.66,
-            "max": 2.68,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "14_Matmul_for_upper_triangular_matrices.py": {
-            "mean": 2.72,
-            "std": 0.0045,
-            "min": 2.72,
-            "max": 2.75,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "15_Matmul_for_lower_triangular_matrices.py": {
-            "mean": 2.73,
-            "std": 0.005,
-            "min": 2.72,
-            "max": 2.75,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "16_Matmul_with_transposed_A.py": {
-            "mean": 2.65,
-            "std": 0.00467,
-            "min": 2.64,
-            "max": 2.67,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "17_Matmul_with_transposed_B.py": {
-            "mean": 2.77,
-            "std": 0.0083,
-            "min": 2.76,
-            "max": 2.8,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "18_Matmul_with_transposed_both.py": {
-            "mean": 2.79,
-            "std": 0.0101,
-            "min": 2.77,
-            "max": 2.82,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "19_ReLU.py": {
-            "mean": 4.31,
-            "std": 0.00483,
-            "min": 4.29,
-            "max": 4.33,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "20_LeakyReLU.py": {
-            "mean": 4.29,
-            "std": 0.00853,
-            "min": 4.28,
-            "max": 4.33,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "21_Sigmoid.py": {
-            "mean": 4.29,
-            "std": 0.0126,
-            "min": 4.27,
-            "max": 4.39,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "22_Tanh.py": {
-            "mean": 3.07,
-            "std": 0.0213,
-            "min": 3.06,
-            "max": 3.27,
-            "num_trials": 100,
-            "hardware": "NVIDIA H200",
-            "device": "cuda:0"
-        },
-        "23_Softmax.py": {
-            "mean": 8.68,
-            "std": 0.0881,
-            "min": 8.65,
-            "max": 9.55,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "24_LogSoftmax.py": {
-            "mean": 8.65,
-            "std": 0.0181,
-            "min": 8.63,
-            "max": 8.74,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "25_Swish.py": {
-            "mean": 4.28,
-            "std": 0.00737,
-            "min": 4.27,
-            "max": 4.32,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "26_GELU_.py": {
-            "mean": 4.27,
-            "std": 0.00737,
-            "min": 4.26,
-            "max": 4.29,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "27_SELU_.py": {
-            "mean": 4.27,
-            "std": 0.0127,
-            "min": 4.26,
-            "max": 4.37,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "28_HardSigmoid.py": {
-            "mean": 4.29,
-            "std": 0.03,
-            "min": 4.27,
-            "max": 4.58,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "29_Softplus.py": {
-            "mean": 4.29,
-            "std": 0.0411,
-            "min": 4.27,
-            "max": 4.58,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "30_Softsign.py": {
-            "mean": 3.06,
-            "std": 0.00498,
-            "min": 3.05,
-            "max": 3.08,
-            "num_trials": 100,
-            "hardware": "NVIDIA H200",
-            "device": "cuda:0"
-        },
-        "31_ELU.py": {
-            "mean": 4.28,
-            "std": 0.0248,
-            "min": 4.27,
-            "max": 4.52,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "32_HardTanh.py": {
-            "mean": 4.29,
-            "std": 0.0059,
-            "min": 4.28,
-            "max": 4.31,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "33_BatchNorm.py": {
-            "mean": 4.24,
-            "std": 0.00629,
-            "min": 4.22,
-            "max": 4.26,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "34_InstanceNorm.py": {
-            "mean": 7.66,
-            "std": 0.027,
-            "min": 7.64,
-            "max": 7.9,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "35_GroupNorm_.py": {
-            "mean": 7.49,
-            "std": 0.00591,
-            "min": 7.48,
-            "max": 7.51,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "36_RMSNorm_.py": {
-            "mean": 7.8,
-            "std": 0.00398,
-            "min": 7.8,
-            "max": 7.82,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "37_FrobeniusNorm_.py": {
-            "mean": 7.32,
-            "std": 0.0091,
-            "min": 7.31,
-            "max": 7.36,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "38_L1Norm_.py": {
-            "mean": 13.0,
-            "std": 0.0182,
-            "min": 13.0,
-            "max": 13.1,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "39_L2Norm_.py": {
-            "mean": 13.4,
-            "std": 0.029,
-            "min": 13.3,
-            "max": 13.6,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "40_LayerNorm.py": {
-            "mean": 0.476,
-            "std": 0.0031,
-            "min": 0.472,
-            "max": 0.491,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "41_Max_Pooling_1D.py": {
-            "mean": 10.7,
-            "std": 0.0108,
-            "min": 10.7,
-            "max": 10.8,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "42_Max_Pooling_2D.py": {
-            "mean": 4.45,
-            "std": 0.00646,
-            "min": 4.44,
-            "max": 4.49,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "43_Max_Pooling_3D.py": {
-            "mean": 3.95,
-            "std": 0.00396,
-            "min": 3.94,
-            "max": 3.97,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "44_Average_Pooling_1D.py": {
-            "mean": 1.89,
-            "std": 0.00262,
-            "min": 1.88,
-            "max": 1.9,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "45_Average_Pooling_2D.py": {
-            "mean": 6.43,
-            "std": 0.0493,
-            "min": 6.36,
-            "max": 6.79,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "46_Average_Pooling_3D.py": {
-            "mean": 8.71,
-            "std": 0.0143,
-            "min": 8.69,
-            "max": 8.81,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "47_Sum_reduction_over_a_dimension.py": {
-            "mean": 3.5,
-            "std": 0.0161,
-            "min": 3.45,
-            "max": 3.54,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "48_Mean_reduction_over_a_dimension.py": {
-            "mean": 3.39,
-            "std": 0.0227,
-            "min": 3.34,
-            "max": 3.55,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "49_Max_reduction_over_a_dimension.py": {
-            "mean": 3.49,
-            "std": 0.0149,
-            "min": 3.45,
-            "max": 3.56,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "50_conv_standard_2D__square_input__square_kernel.py": {
-            "mean": 1.66,
-            "std": 0.0152,
-            "min": 1.64,
-            "max": 1.72,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "51_Argmax_over_a_dimension.py": {
-            "mean": 2.94,
-            "std": 0.00652,
-            "min": 2.93,
-            "max": 2.96,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "52_Argmin_over_a_dimension.py": {
-            "mean": 3.07,
-            "std": 0.00532,
-            "min": 3.06,
-            "max": 3.09,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "53_Min_reduction_over_a_dimension.py": {
-            "mean": 3.48,
-            "std": 0.0141,
-            "min": 3.44,
-            "max": 3.52,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "54_conv_standard_3D__square_input__square_kernel.py": {
-            "mean": 1.4,
-            "std": 0.00443,
-            "min": 1.39,
-            "max": 1.42,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "55_conv_standard_2D__asymmetric_input__square_kernel.py": {
-            "mean": 4.63,
-            "std": 0.006,
-            "min": 4.61,
-            "max": 4.64,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "56_conv_standard_2D__asymmetric_input__asymmetric_kernel.py": {
-            "mean": 3.68,
-            "std": 0.037,
-            "min": 3.62,
-            "max": 3.76,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "57_conv_transposed_2D__square_input__square_kernel.py": {
-            "mean": 6.56,
-            "std": 0.00741,
-            "min": 6.54,
-            "max": 6.58,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "58_conv_transposed_3D__asymmetric_input__asymmetric_kernel.py": {
-            "mean": 2.32,
-            "std": 0.0159,
-            "min": 2.29,
-            "max": 2.37,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "59_conv_standard_3D__asymmetric_input__square_kernel.py": {
-            "mean": 2.11,
-            "std": 0.00394,
-            "min": 2.1,
-            "max": 2.13,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "60_conv_standard_3D__square_input__asymmetric_kernel.py": {
-            "mean": 5.33,
-            "std": 0.0602,
-            "min": 5.32,
-            "max": 5.79,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "61_conv_transposed_3D__square_input__square_kernel.py": {
-            "mean": 5.51,
-            "std": 0.0104,
-            "min": 5.48,
-            "max": 5.54,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "62_conv_standard_2D__square_input__asymmetric_kernel.py": {
-            "mean": 2.68,
-            "std": 0.0374,
-            "min": 2.66,
-            "max": 3.0,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "63_conv_standard_2D__square_input__square_kernel.py": {
-            "mean": 13.9,
-            "std": 0.0149,
-            "min": 13.9,
-            "max": 14.0,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "64_conv_transposed_1D.py": {
-            "mean": 5.32,
-            "std": 0.0562,
-            "min": 5.29,
-            "max": 5.72,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "65_conv_transposed_2D__square_input__asymmetric_kernel.py": {
-            "mean": 2.72,
-            "std": 0.0112,
-            "min": 2.68,
-            "max": 2.74,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "66_conv_standard_3D__asymmetric_input__asymmetric_kernel.py": {
-            "mean": 2.65,
-            "std": 0.0043,
-            "min": 2.65,
-            "max": 2.68,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "67_conv_standard_1D.py": {
-            "mean": 2.69,
-            "std": 0.0361,
-            "min": 2.65,
-            "max": 3.01,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "68_conv_transposed_3D__square_input__asymmetric_kernel.py": {
-            "mean": 9.55,
-            "std": 0.00915,
-            "min": 9.53,
-            "max": 9.58,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "69_conv_transposed_2D__asymmetric_input__asymmetric_kernel.py": {
-            "mean": 2.74,
-            "std": 0.0113,
-            "min": 2.72,
-            "max": 2.78,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "70_conv_transposed_3D__asymmetric_input__square_kernel.py": {
-            "mean": 10.0,
-            "std": 0.0098,
-            "min": 9.98,
-            "max": 10.0,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "71_conv_transposed_2D__asymmetric_input__square_kernel.py": {
-            "mean": 1.59,
-            "std": 0.00411,
-            "min": 1.58,
-            "max": 1.6,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "72_conv_transposed_3D_asymmetric_input_asymmetric_kernel___strided_padded_grouped_.py": {
-            "mean": 2.93,
-            "std": 0.00823,
-            "min": 2.92,
-            "max": 2.96,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "73_conv_transposed_3D_asymmetric_input_square_kernel__strided_padded__grouped.py": {
-            "mean": 2.07,
-            "std": 0.00509,
-            "min": 2.06,
-            "max": 2.09,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "74_conv_transposed_1D_dilated.py": {
-            "mean": 1.93,
-            "std": 0.00876,
-            "min": 1.91,
-            "max": 1.95,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.py": {
-            "mean": 6.6,
-            "std": 0.0337,
-            "min": 6.58,
-            "max": 6.91,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "76_conv_standard_1D_dilated_strided__.py": {
-            "mean": 12.4,
-            "std": 0.0826,
-            "min": 12.4,
-            "max": 12.7,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__.py": {
-            "mean": 1.98,
-            "std": 0.0146,
-            "min": 1.95,
-            "max": 2.03,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__.py": {
-            "mean": 2.37,
-            "std": 0.0217,
-            "min": 2.35,
-            "max": 2.55,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__.py": {
-            "mean": 1.93,
-            "std": 0.00781,
-            "min": 1.92,
-            "max": 1.95,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__.py": {
-            "mean": 2.65,
-            "std": 0.0159,
-            "min": 2.63,
-            "max": 2.68,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__.py": {
-            "mean": 1.71,
-            "std": 0.00818,
-            "min": 1.7,
-            "max": 1.76,
-            "num_trials": 100,
-            "hardware": "NVIDIA H200",
-            "device": "cuda:0"
-        },
-        "82_conv_depthwise_2D_square_input_square_kernel.py": {
-            "mean": 2.57,
-            "std": 0.0708,
-            "min": 2.53,
-            "max": 3.09,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "83_conv_depthwise_2D_square_input_asymmetric_kernel.py": {
-            "mean": 19.0,
-            "std": 0.273,
-            "min": 18.8,
-            "max": 20.3,
-            "num_trials": 100,
-            "hardware": "NVIDIA H200",
-            "device": "cuda:0"
-        },
-        "84_conv_depthwise_2D_asymmetric_input_square_kernel.py": {
-            "mean": 9.79,
-            "std": 0.00959,
-            "min": 9.77,
-            "max": 9.81,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "85_conv_depthwise_2D_asymmetric_input_asymmetric_kernel.py": {
-            "mean": 13.9,
-            "std": 0.00974,
-            "min": 13.9,
-            "max": 14.0,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "86_conv_depthwise_separable_2D.py": {
-            "mean": 3.41,
-            "std": 0.28,
-            "min": 3.31,
-            "max": 5.34,
-            "num_trials": 100,
-            "hardware": "NVIDIA H200",
-            "device": "cuda:0"
-        },
-        "87_conv_pointwise_2D.py": {
-            "mean": 10.6,
-            "std": 0.098,
-            "min": 10.5,
-            "max": 10.9,
-            "num_trials": 100,
-            "hardware": "NVIDIA H200",
-            "device": "cuda:0"
-        },
-        "88_MinGPTNewGelu.py": {
-            "mean": 0.161,
-            "std": 0.00496,
-            "min": 0.154,
-            "max": 0.178,
-            "num_trials": 100,
-            "hardware": "NVIDIA H200",
-            "device": "cuda:0"
-        },
-        "89_cumsum.py": {
-            "mean": 2.69,
-            "std": 0.0867,
-            "min": 2.66,
-            "max": 3.45,
-            "num_trials": 100,
-            "hardware": "NVIDIA H200",
-            "device": "cuda:0"
-        },
-        "90_cumprod.py": {
-            "mean": 2.64,
-            "std": 0.0149,
-            "min": 2.63,
-            "max": 2.73,
-            "num_trials": 100,
-            "hardware": "NVIDIA H200",
-            "device": "cuda:0"
-        },
-        "91_cumsum_reverse.py": {
-            "mean": 5.87,
-            "std": 0.00331,
-            "min": 5.86,
-            "max": 5.88,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "92_cumsum_exclusive.py": {
-            "mean": 5.67,
-            "std": 0.01,
-            "min": 5.66,
-            "max": 5.75,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "93_masked_cumsum.py": {
-            "mean": 3.31,
-            "std": 0.0057,
-            "min": 3.3,
-            "max": 3.33,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "94_MSELoss.py": {
-            "mean": 2.78,
-            "std": 0.00263,
-            "min": 2.78,
-            "max": 2.79,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "95_CrossEntropyLoss.py": {
-            "mean": 0.234,
-            "std": 0.0313,
-            "min": 0.223,
-            "max": 0.541,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "96_HuberLoss.py": {
-            "mean": 2.79,
-            "std": 0.00316,
-            "min": 2.78,
-            "max": 2.8,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "97_ScaledDotProductAttention.py": {
-            "mean": 8.25,
-            "std": 0.228,
-            "min": 8.1,
-            "max": 9.25,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "98_KLDivLoss.py": {
-            "mean": 0.745,
-            "std": 0.0237,
-            "min": 0.735,
-            "max": 0.965,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "99_TripletMarginLoss.py": {
-            "mean": 1.04,
-            "std": 0.00274,
-            "min": 1.04,
-            "max": 1.05,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        },
-        "100_HingeLoss.py": {
-            "mean": 1.41,
-            "std": 0.00369,
-            "min": 1.4,
-            "max": 1.42,
-            "num_trials": 100,
-            "hardware": "NVIDIA H100 80GB HBM3",
-            "device": "cuda:0"
-        }
-    }
-}
\ No newline at end of file

From 3e31530662468e82dd3e30365777f766b41f0d9b Mon Sep 17 00:00:00 2001
From: pythonomar22 <omarabulhassan@gmail.com>
Date: Thu, 20 Nov 2025 21:00:22 -0800
Subject: [PATCH 5/5] fallback correcting

---
 src/eval.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/eval.py b/src/eval.py
index 1ae8b83c..7e73479e 100644
--- a/src/eval.py
+++ b/src/eval.py
@@ -890,7 +890,8 @@ def fetch_baseline_time(
     if hasattr(dataset, "get_problem_by_id"):
         problem_path = dataset.get_problem_by_id(problem_id)
     else:
-        problem_path = dataset[problem_id]
+        # Fallback for old list-based API: problem_id is 1-indexed but lists are 0-indexed
+        problem_path = dataset[problem_id - 1]
 
     problem_name = os.path.basename(problem_path)
     baseline_time = baseline_json[level_name].get(problem_name, None)