From 509ed996193e3a1bf6b152622c39a3e8e4219ead Mon Sep 17 00:00:00 2001 From: pythonomar22 Date: Thu, 20 Nov 2025 18:20:42 -0800 Subject: [PATCH 1/5] before testing --- .../A10G_modal/baseline_time_torch.json | 904 ++++++++++++++++++ ...e_time_torch_compile_inductor_default.json | 904 ++++++++++++++++++ .../H100_modal/baseline_time_torch.json | 904 ++++++++++++++++++ ...e_time_torch_compile_inductor_default.json | 904 ++++++++++++++++++ scripts/benchmark_eval_analysis.py | 45 +- scripts/eval_from_generations.py | 25 +- scripts/generate_and_eval_single_sample.py | 5 +- .../generate_and_eval_single_sample_modal.py | 7 +- scripts/generate_baseline_time.py | 21 +- scripts/generate_baseline_time_modal.py | 18 +- scripts/generate_samples.py | 22 +- scripts/inspect_baseline.py | 5 +- scripts/inspect_triton.py | 20 +- scripts/run_and_check.py | 26 +- scripts/verify_bench.py | 58 +- src/dataset.py | 54 +- src/eval.py | 19 +- 17 files changed, 3807 insertions(+), 134 deletions(-) create mode 100644 results/timing/A10G_modal/baseline_time_torch.json create mode 100644 results/timing/A10G_modal/baseline_time_torch_compile_inductor_default.json create mode 100644 results/timing/H100_modal/baseline_time_torch.json create mode 100644 results/timing/H100_modal/baseline_time_torch_compile_inductor_default.json diff --git a/results/timing/A10G_modal/baseline_time_torch.json b/results/timing/A10G_modal/baseline_time_torch.json new file mode 100644 index 00000000..327a00c2 --- /dev/null +++ b/results/timing/A10G_modal/baseline_time_torch.json @@ -0,0 +1,904 @@ +{ + "level1": { + "1_Square_matrix_multiplication_.py": { + "mean": 5.78, + "std": 0.0635, + "min": 5.55, + "max": 5.91, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "2_Standard_matrix_multiplication_.py": { + "mean": 8.47, + "std": 0.293, + "min": 6.99, + "max": 9.51, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "3_Batched_matrix_multiplication.py": { + "mean": 14.0, + "std": 0.169, + "min": 13.5, + "max": 14.5, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "4_Matrix_vector_multiplication_.py": { + "mean": 25.5, + "std": 0.157, + "min": 25.0, + "max": 25.9, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "5_Matrix_scalar_multiplication.py": { + "mean": 17.6, + "std": 0.0154, + "min": 17.6, + "max": 17.7, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "6_Matmul_with_large_K_dimension_.py": { + "mean": 3.3, + "std": 0.0495, + "min": 3.1, + "max": 3.34, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "7_Matmul_with_small_K_dimension_.py": { + "mean": 14.9, + "std": 1.19, + "min": 13.3, + "max": 23.3, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "8_Matmul_with_irregular_shapes_.py": { + "mean": 21.0, + "std": 0.68, + "min": 20.5, + "max": 25.7, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "9_Tall_skinny_matrix_multiplication_.py": { + "mean": 10.9, + "std": 0.0388, + "min": 10.9, + "max": 11.1, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "10_3D_tensor_matrix_multiplication.py": { + "mean": 2.4, + "std": 0.0551, + "min": 2.25, + "max": 2.45, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "11_4D_tensor_matrix_multiplication.py": { + "mean": 22.2, + "std": 0.224, + "min": 21.8, + "max": 22.8, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "12_Matmul_with_diagonal_matrices_.py": { + "mean": 9.62, + "std": 0.537, + "min": 7.46, + "max": 11.6, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "13_Matmul_for_symmetric_matrices.py": { + "mean": 10.4, + "std": 0.889, + "min": 8.12, + "max": 15.0, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "14_Matmul_for_upper_triangular_matrices.py": { + "mean": 5.75, + "std": 0.0515, + "min": 5.71, + "max": 6.06, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "15_Matmul_for_lower_triangular_matrices.py": { + "mean": 5.71, + "std": 0.0196, + "min": 5.7, + "max": 5.88, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "16_Matmul_with_transposed_A.py": { + "mean": 8.29, + "std": 0.228, + "min": 6.84, + "max": 8.54, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "17_Matmul_with_transposed_B.py": { + "mean": 11.9, + "std": 0.674, + "min": 10.1, + "max": 15.5, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "18_Matmul_with_transposed_both.py": { + "mean": 8.78, + "std": 0.415, + "min": 7.41, + "max": 10.7, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "19_ReLU.py": { + "mean": 26.6, + "std": 0.0288, + "min": 26.5, + "max": 26.7, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "20_LeakyReLU.py": { + "mean": 26.4, + "std": 0.0369, + "min": 26.4, + "max": 26.5, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "21_Sigmoid.py": { + "mean": 26.5, + "std": 0.0341, + "min": 26.4, + "max": 26.6, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "22_Tanh.py": { + "mean": 26.6, + "std": 0.0275, + "min": 26.5, + "max": 26.6, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "23_Softmax.py": { + "mean": 51.4, + "std": 0.0335, + "min": 51.3, + "max": 51.5, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "24_LogSoftmax.py": { + "mean": 51.4, + "std": 0.0255, + "min": 51.3, + "max": 51.5, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "25_Swish.py": { + "mean": 65.9, + "std": 0.033, + "min": 65.8, + "max": 66.0, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "26_GELU_.py": { + "mean": 26.5, + "std": 0.0244, + "min": 26.5, + "max": 26.6, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "27_SELU_.py": { + "mean": 26.4, + "std": 0.014, + "min": 26.3, + "max": 26.4, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "28_HardSigmoid.py": { + "mean": 26.6, + "std": 0.0332, + "min": 26.5, + "max": 26.6, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "29_Softplus.py": { + "mean": 26.5, + "std": 0.0349, + "min": 26.5, + "max": 26.6, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "30_Softsign.py": { + "mean": 92.3, + "std": 0.0474, + "min": 92.2, + "max": 92.4, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "31_ELU.py": { + "mean": 26.4, + "std": 0.0291, + "min": 26.4, + "max": 26.5, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "32_HardTanh.py": { + "mean": 26.4, + "std": 0.0333, + "min": 26.4, + "max": 26.5, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "33_BatchNorm.py": { + "mean": 28.3, + "std": 0.0373, + "min": 28.2, + "max": 28.5, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "34_InstanceNorm.py": { + "mean": 47.4, + "std": 0.0383, + "min": 47.3, + "max": 47.5, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "35_GroupNorm_.py": { + "mean": 46.6, + "std": 0.0375, + "min": 46.5, + "max": 46.7, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "36_RMSNorm_.py": { + "mean": 80.9, + "std": 0.0425, + "min": 80.8, + "max": 81.0, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "37_FrobeniusNorm_.py": { + "mean": 45.5, + "std": 0.0466, + "min": 45.4, + "max": 45.7, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "38_L1Norm_.py": { + "mean": 88.2, + "std": 0.0341, + "min": 88.1, + "max": 88.3, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "39_L2Norm_.py": { + "mean": 53.0, + "std": 0.105, + "min": 52.9, + "max": 53.9, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "40_LayerNorm.py": { + "mean": 8.53, + "std": 0.0196, + "min": 8.52, + "max": 8.65, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "41_Max_Pooling_1D.py": { + "mean": 27.0, + "std": 0.0402, + "min": 26.9, + "max": 27.2, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "42_Max_Pooling_2D.py": { + "mean": 30.9, + "std": 1.31, + "min": 30.1, + "max": 40.5, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "43_Max_Pooling_3D.py": { + "mean": 12.9, + "std": 0.16, + "min": 12.2, + "max": 13.1, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "44_Average_Pooling_1D.py": { + "mean": 18.9, + "std": 0.497, + "min": 18.5, + "max": 21.5, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "45_Average_Pooling_2D.py": { + "mean": 44.0, + "std": 0.0797, + "min": 43.8, + "max": 44.2, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "46_Average_Pooling_3D.py": { + "mean": 18.8, + "std": 0.0448, + "min": 18.7, + "max": 18.9, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "47_Sum_reduction_over_a_dimension.py": { + "mean": 21.1, + "std": 0.0911, + "min": 20.8, + "max": 21.4, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "48_Mean_reduction_over_a_dimension.py": { + "mean": 21.0, + "std": 0.0635, + "min": 20.9, + "max": 21.2, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "49_Max_reduction_over_a_dimension.py": { + "mean": 20.1, + "std": 0.0694, + "min": 19.9, + "max": 20.3, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "50_conv_standard_2D__square_input__square_kernel.py": { + "mean": 16.1, + "std": 0.0699, + "min": 16.0, + "max": 16.2, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "51_Argmax_over_a_dimension.py": { + "mean": 20.9, + "std": 0.0814, + "min": 20.7, + "max": 21.1, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "52_Argmin_over_a_dimension.py": { + "mean": 20.9, + "std": 0.0777, + "min": 20.7, + "max": 21.1, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "53_Min_reduction_over_a_dimension.py": { + "mean": 20.9, + "std": 0.0826, + "min": 20.8, + "max": 21.2, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "54_conv_standard_3D__square_input__square_kernel.py": { + "mean": 14.4, + "std": 0.0301, + "min": 14.3, + "max": 14.5, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "55_conv_standard_2D__asymmetric_input__square_kernel.py": { + "mean": 83.4, + "std": 2.16, + "min": 81.5, + "max": 101.0, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "56_conv_standard_2D__asymmetric_input__asymmetric_kernel.py": { + "mean": 26.2, + "std": 1.92, + "min": 25.5, + "max": 43.0, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "57_conv_transposed_2D__square_input__square_kernel.py": { + "mean": 39.5, + "std": 0.0511, + "min": 39.3, + "max": 39.6, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "58_conv_transposed_3D__asymmetric_input__asymmetric_kernel.py": { + "mean": 17.4, + "std": 0.0441, + "min": 17.3, + "max": 17.5, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "59_conv_standard_3D__asymmetric_input__square_kernel.py": { + "mean": 13.3, + "std": 0.0313, + "min": 13.2, + "max": 13.4, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "60_conv_standard_3D__square_input__asymmetric_kernel.py": { + "mean": 31.5, + "std": 0.15, + "min": 31.3, + "max": 31.6, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "61_conv_transposed_3D__square_input__square_kernel.py": { + "mean": 27.5, + "std": 0.0604, + "min": 27.4, + "max": 27.7, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "62_conv_standard_2D__square_input__asymmetric_kernel.py": { + "mean": 15.1, + "std": 0.0803, + "min": 14.9, + "max": 15.4, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "63_conv_standard_2D__square_input__square_kernel.py": { + "mean": 43.2, + "std": 0.142, + "min": 43.1, + "max": 44.5, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "64_conv_transposed_1D.py": { + "mean": 32.8, + "std": 0.033, + "min": 32.7, + "max": 32.9, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "65_conv_transposed_2D__square_input__asymmetric_kernel.py": { + "mean": 16.0, + "std": 0.049, + "min": 15.9, + "max": 16.3, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "66_conv_standard_3D__asymmetric_input__asymmetric_kernel.py": { + "mean": 22.5, + "std": 0.0621, + "min": 22.4, + "max": 22.6, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "67_conv_standard_1D.py": { + "mean": 12.5, + "std": 0.0419, + "min": 12.5, + "max": 12.7, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "68_conv_transposed_3D__square_input__asymmetric_kernel.py": { + "mean": 393.0, + "std": 0.128, + "min": 393.0, + "max": 394.0, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "69_conv_transposed_2D__asymmetric_input__asymmetric_kernel.py": { + "mean": 23.3, + "std": 0.0538, + "min": 23.2, + "max": 23.5, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "70_conv_transposed_3D__asymmetric_input__square_kernel.py": { + "mean": 87.8, + "std": 0.0543, + "min": 87.6, + "max": 87.9, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "71_conv_transposed_2D__asymmetric_input__square_kernel.py": { + "mean": 9.27, + "std": 0.697, + "min": 8.88, + "max": 13.9, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "72_conv_transposed_3D_asymmetric_input_asymmetric_kernel___strided_padded_grouped_.py": { + "mean": 5.16, + "std": 0.0349, + "min": 5.14, + "max": 5.44, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "73_conv_transposed_3D_asymmetric_input_square_kernel__strided_padded__grouped.py": { + "mean": 17.8, + "std": 0.0655, + "min": 17.7, + "max": 18.0, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "74_conv_transposed_1D_dilated.py": { + "mean": 8.32, + "std": 0.285, + "min": 6.68, + "max": 8.93, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.py": { + "mean": 16.9, + "std": 0.306, + "min": 16.5, + "max": 18.5, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "76_conv_standard_1D_dilated_strided__.py": { + "mean": 57.7, + "std": 2.91, + "min": 55.0, + "max": 76.9, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__.py": { + "mean": 6.14, + "std": 0.0596, + "min": 6.05, + "max": 6.34, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__.py": { + "mean": 16.2, + "std": 0.143, + "min": 16.1, + "max": 17.3, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__.py": { + "mean": 10.9, + "std": 0.0716, + "min": 10.8, + "max": 11.1, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__.py": { + "mean": 14.7, + "std": 0.0183, + "min": 14.7, + "max": 14.8, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__.py": { + "mean": 6.15, + "std": 0.0244, + "min": 6.11, + "max": 6.27, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "82_conv_depthwise_2D_square_input_square_kernel.py": { + "mean": 8.39, + "std": 0.455, + "min": 6.91, + "max": 11.7, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "83_conv_depthwise_2D_square_input_asymmetric_kernel.py": { + "mean": 3.72, + "std": 0.0183, + "min": 3.71, + "max": 3.84, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "84_conv_depthwise_2D_asymmetric_input_square_kernel.py": { + "mean": 24.2, + "std": 0.0117, + "min": 24.2, + "max": 24.2, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "85_conv_depthwise_2D_asymmetric_input_asymmetric_kernel.py": { + "mean": 5.31, + "std": 0.0288, + "min": 5.3, + "max": 5.51, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "86_conv_depthwise_separable_2D.py": { + "mean": 13.0, + "std": 0.0413, + "min": 12.9, + "max": 13.1, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "87_conv_pointwise_2D.py": { + "mean": 27.0, + "std": 0.0308, + "min": 27.0, + "max": 27.2, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "88_MinGPTNewGelu.py": { + "mean": 9.99, + "std": 0.0291, + "min": 9.95, + "max": 10.1, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "89_cumsum.py": { + "mean": 20.3, + "std": 0.0552, + "min": 20.3, + "max": 20.9, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "90_cumprod.py": { + "mean": 19.9, + "std": 0.0804, + "min": 19.9, + "max": 20.3, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "91_cumsum_reverse.py": { + "mean": 56.1, + "std": 0.0691, + "min": 56.0, + "max": 56.5, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "92_cumsum_exclusive.py": { + "mean": 45.0, + "std": 0.0489, + "min": 44.9, + "max": 45.2, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "93_masked_cumsum.py": { + "mean": 40.5, + "std": 0.173, + "min": 40.5, + "max": 42.2, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "94_MSELoss.py": { + "mean": 52.7, + "std": 0.0446, + "min": 52.6, + "max": 52.8, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "95_CrossEntropyLoss.py": { + "mean": 3.09, + "std": 0.0109, + "min": 3.08, + "max": 3.19, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "96_HuberLoss.py": { + "mean": 34.7, + "std": 0.0217, + "min": 34.7, + "max": 34.8, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "97_ScaledDotProductAttention.py": { + "mean": 44.1, + "std": 0.0406, + "min": 44.0, + "max": 44.3, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "98_KLDivLoss.py": { + "mean": 24.3, + "std": 0.0332, + "min": 24.2, + "max": 24.4, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "99_TripletMarginLoss.py": { + "mean": 26.4, + "std": 0.0708, + "min": 26.3, + "max": 27.0, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "100_HingeLoss.py": { + "mean": 61.9, + "std": 0.0296, + "min": 61.8, + "max": 62.0, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + } + } +} \ No newline at end of file diff --git a/results/timing/A10G_modal/baseline_time_torch_compile_inductor_default.json b/results/timing/A10G_modal/baseline_time_torch_compile_inductor_default.json new file mode 100644 index 00000000..596eb088 --- /dev/null +++ b/results/timing/A10G_modal/baseline_time_torch_compile_inductor_default.json @@ -0,0 +1,904 @@ +{ + "level1": { + "1_Square_matrix_multiplication_.py": { + "mean": 5.85, + "std": 0.0617, + "min": 5.55, + "max": 6.08, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "2_Standard_matrix_multiplication_.py": { + "mean": 5.93, + "std": 0.0839, + "min": 5.62, + "max": 6.17, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "3_Batched_matrix_multiplication.py": { + "mean": 22.0, + "std": 0.964, + "min": 21.2, + "max": 28.1, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "4_Matrix_vector_multiplication_.py": { + "mean": 25.5, + "std": 0.156, + "min": 25.1, + "max": 25.9, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "5_Matrix_scalar_multiplication.py": { + "mean": 17.6, + "std": 0.0187, + "min": 17.5, + "max": 17.7, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "6_Matmul_with_large_K_dimension_.py": { + "mean": 3.32, + "std": 0.0394, + "min": 3.15, + "max": 3.37, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "7_Matmul_with_small_K_dimension_.py": { + "mean": 15.1, + "std": 1.06, + "min": 13.1, + "max": 22.0, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "8_Matmul_with_irregular_shapes_.py": { + "mean": 20.6, + "std": 0.559, + "min": 20.0, + "max": 23.4, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "9_Tall_skinny_matrix_multiplication_.py": { + "mean": 11.1, + "std": 0.12, + "min": 10.9, + "max": 11.4, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "10_3D_tensor_matrix_multiplication.py": { + "mean": 2.44, + "std": 0.0532, + "min": 2.27, + "max": 2.58, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "11_4D_tensor_matrix_multiplication.py": { + "mean": 22.2, + "std": 0.279, + "min": 21.6, + "max": 23.4, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "12_Matmul_with_diagonal_matrices_.py": { + "mean": 9.59, + "std": 0.655, + "min": 7.75, + "max": 13.1, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "13_Matmul_for_symmetric_matrices.py": { + "mean": 10.4, + "std": 0.737, + "min": 8.74, + "max": 14.0, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "14_Matmul_for_upper_triangular_matrices.py": { + "mean": 5.93, + "std": 0.0786, + "min": 5.82, + "max": 6.16, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "15_Matmul_for_lower_triangular_matrices.py": { + "mean": 5.82, + "std": 0.0385, + "min": 5.81, + "max": 6.19, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "16_Matmul_with_transposed_A.py": { + "mean": 8.51, + "std": 0.245, + "min": 6.84, + "max": 8.95, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "17_Matmul_with_transposed_B.py": { + "mean": 7.43, + "std": 0.226, + "min": 6.94, + "max": 8.94, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "18_Matmul_with_transposed_both.py": { + "mean": 6.05, + "std": 0.167, + "min": 5.8, + "max": 7.13, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "19_ReLU.py": { + "mean": 26.5, + "std": 0.0308, + "min": 26.4, + "max": 26.6, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "20_LeakyReLU.py": { + "mean": 26.5, + "std": 0.0313, + "min": 26.4, + "max": 26.7, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "21_Sigmoid.py": { + "mean": 26.5, + "std": 0.0288, + "min": 26.4, + "max": 26.6, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "22_Tanh.py": { + "mean": 26.6, + "std": 0.516, + "min": 26.4, + "max": 30.0, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "23_Softmax.py": { + "mean": 52.8, + "std": 0.104, + "min": 52.6, + "max": 53.1, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "24_LogSoftmax.py": { + "mean": 52.9, + "std": 0.0791, + "min": 52.8, + "max": 53.2, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "25_Swish.py": { + "mean": 26.6, + "std": 0.0973, + "min": 26.5, + "max": 27.0, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "26_GELU_.py": { + "mean": 26.4, + "std": 0.104, + "min": 26.2, + "max": 26.8, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "27_SELU_.py": { + "mean": 26.6, + "std": 0.077, + "min": 26.4, + "max": 27.0, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "28_HardSigmoid.py": { + "mean": 26.5, + "std": 0.0566, + "min": 26.4, + "max": 26.7, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "29_Softplus.py": { + "mean": 26.2, + "std": 0.0483, + "min": 26.2, + "max": 26.5, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "30_Softsign.py": { + "mean": 26.5, + "std": 0.0414, + "min": 26.4, + "max": 26.8, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "31_ELU.py": { + "mean": 26.4, + "std": 0.0325, + "min": 26.3, + "max": 26.6, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "32_HardTanh.py": { + "mean": 26.5, + "std": 0.0774, + "min": 26.4, + "max": 26.8, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "33_BatchNorm.py": { + "mean": 26.0, + "std": 0.0405, + "min": 25.9, + "max": 26.2, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "34_InstanceNorm.py": { + "mean": 47.1, + "std": 0.068, + "min": 46.9, + "max": 47.2, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "35_GroupNorm_.py": { + "mean": 45.7, + "std": 0.0425, + "min": 45.6, + "max": 45.8, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "36_RMSNorm_.py": { + "mean": 48.9, + "std": 0.0773, + "min": 48.8, + "max": 49.2, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "37_FrobeniusNorm_.py": { + "mean": 45.8, + "std": 0.0738, + "min": 45.7, + "max": 46.0, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "38_L1Norm_.py": { + "mean": 72.5, + "std": 0.238, + "min": 72.2, + "max": 74.6, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "39_L2Norm_.py": { + "mean": 74.7, + "std": 0.106, + "min": 74.4, + "max": 74.9, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "40_LayerNorm.py": { + "mean": 2.79, + "std": 0.0655, + "min": 2.75, + "max": 3.1, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "41_Max_Pooling_1D.py": { + "mean": 31.0, + "std": 1.19, + "min": 30.4, + "max": 39.9, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "42_Max_Pooling_2D.py": { + "mean": 9.84, + "std": 0.21, + "min": 9.71, + "max": 10.4, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "43_Max_Pooling_3D.py": { + "mean": 12.9, + "std": 0.168, + "min": 12.2, + "max": 14.0, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "44_Average_Pooling_1D.py": { + "mean": 8.94, + "std": 0.0751, + "min": 8.84, + "max": 9.16, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "45_Average_Pooling_2D.py": { + "mean": 42.0, + "std": 0.909, + "min": 39.7, + "max": 43.1, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "46_Average_Pooling_3D.py": { + "mean": 19.0, + "std": 0.75, + "min": 18.8, + "max": 26.4, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "47_Sum_reduction_over_a_dimension.py": { + "mean": 22.0, + "std": 0.0462, + "min": 21.9, + "max": 22.1, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "48_Mean_reduction_over_a_dimension.py": { + "mean": 21.7, + "std": 0.0532, + "min": 21.6, + "max": 21.9, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "49_Max_reduction_over_a_dimension.py": { + "mean": 22.0, + "std": 0.0813, + "min": 21.8, + "max": 22.2, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "50_conv_standard_2D__square_input__square_kernel.py": { + "mean": 5.42, + "std": 0.127, + "min": 5.27, + "max": 5.76, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "51_Argmax_over_a_dimension.py": { + "mean": 21.5, + "std": 0.0593, + "min": 21.3, + "max": 21.6, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "52_Argmin_over_a_dimension.py": { + "mean": 21.6, + "std": 0.0785, + "min": 21.4, + "max": 21.8, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "53_Min_reduction_over_a_dimension.py": { + "mean": 21.9, + "std": 0.0312, + "min": 21.8, + "max": 22.0, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "54_conv_standard_3D__square_input__square_kernel.py": { + "mean": 11.5, + "std": 0.382, + "min": 10.0, + "max": 13.5, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "55_conv_standard_2D__asymmetric_input__square_kernel.py": { + "mean": 37.5, + "std": 0.0376, + "min": 37.4, + "max": 37.6, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "56_conv_standard_2D__asymmetric_input__asymmetric_kernel.py": { + "mean": 22.2, + "std": 0.195, + "min": 22.1, + "max": 23.4, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "57_conv_transposed_2D__square_input__square_kernel.py": { + "mean": 49.1, + "std": 0.0756, + "min": 49.0, + "max": 49.3, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "58_conv_transposed_3D__asymmetric_input__asymmetric_kernel.py": { + "mean": 17.4, + "std": 0.0674, + "min": 17.4, + "max": 17.7, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "59_conv_standard_3D__asymmetric_input__square_kernel.py": { + "mean": 13.3, + "std": 0.05, + "min": 13.3, + "max": 13.6, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "60_conv_standard_3D__square_input__asymmetric_kernel.py": { + "mean": 45.5, + "std": 0.107, + "min": 45.3, + "max": 45.9, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "61_conv_transposed_3D__square_input__square_kernel.py": { + "mean": 27.5, + "std": 0.093, + "min": 27.4, + "max": 27.9, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "62_conv_standard_2D__square_input__asymmetric_kernel.py": { + "mean": 14.5, + "std": 0.406, + "min": 13.0, + "max": 16.4, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "63_conv_standard_2D__square_input__square_kernel.py": { + "mean": 80.1, + "std": 0.312, + "min": 79.7, + "max": 82.2, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "64_conv_transposed_1D.py": { + "mean": 33.1, + "std": 0.0561, + "min": 33.0, + "max": 33.3, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "65_conv_transposed_2D__square_input__asymmetric_kernel.py": { + "mean": 18.5, + "std": 0.062, + "min": 18.4, + "max": 18.6, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "66_conv_standard_3D__asymmetric_input__asymmetric_kernel.py": { + "mean": 22.6, + "std": 0.124, + "min": 22.4, + "max": 22.9, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "67_conv_standard_1D.py": { + "mean": 12.5, + "std": 0.0701, + "min": 12.5, + "max": 12.8, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "68_conv_transposed_3D__square_input__asymmetric_kernel.py": { + "mean": 393.0, + "std": 0.127, + "min": 393.0, + "max": 393.0, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "69_conv_transposed_2D__asymmetric_input__asymmetric_kernel.py": { + "mean": 22.2, + "std": 0.479, + "min": 21.2, + "max": 23.8, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "70_conv_transposed_3D__asymmetric_input__square_kernel.py": { + "mean": 87.8, + "std": 0.0361, + "min": 87.7, + "max": 88.0, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "71_conv_transposed_2D__asymmetric_input__square_kernel.py": { + "mean": 12.8, + "std": 0.0219, + "min": 12.7, + "max": 12.9, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "72_conv_transposed_3D_asymmetric_input_asymmetric_kernel___strided_padded_grouped_.py": { + "mean": 6.62, + "std": 0.157, + "min": 5.73, + "max": 6.87, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "73_conv_transposed_3D_asymmetric_input_square_kernel__strided_padded__grouped.py": { + "mean": 17.3, + "std": 0.322, + "min": 16.5, + "max": 18.9, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "74_conv_transposed_1D_dilated.py": { + "mean": 5.79, + "std": 0.0773, + "min": 5.46, + "max": 5.88, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.py": { + "mean": 14.3, + "std": 0.048, + "min": 14.3, + "max": 14.6, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "76_conv_standard_1D_dilated_strided__.py": { + "mean": 35.6, + "std": 0.569, + "min": 34.6, + "max": 36.8, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__.py": { + "mean": 6.12, + "std": 0.0542, + "min": 6.08, + "max": 6.51, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__.py": { + "mean": 18.5, + "std": 0.0573, + "min": 18.4, + "max": 18.7, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__.py": { + "mean": 11.0, + "std": 0.13, + "min": 10.8, + "max": 11.7, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__.py": { + "mean": 15.9, + "std": 0.0591, + "min": 15.9, + "max": 16.3, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__.py": { + "mean": 6.4, + "std": 0.138, + "min": 6.28, + "max": 6.82, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "82_conv_depthwise_2D_square_input_square_kernel.py": { + "mean": 14.0, + "std": 0.113, + "min": 14.0, + "max": 15.1, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "83_conv_depthwise_2D_square_input_asymmetric_kernel.py": { + "mean": 32.8, + "std": 0.106, + "min": 32.6, + "max": 33.2, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "84_conv_depthwise_2D_asymmetric_input_square_kernel.py": { + "mean": 55.6, + "std": 0.0411, + "min": 55.6, + "max": 55.8, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "85_conv_depthwise_2D_asymmetric_input_asymmetric_kernel.py": { + "mean": 47.9, + "std": 0.31, + "min": 47.2, + "max": 48.8, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "86_conv_depthwise_separable_2D.py": { + "mean": 25.3, + "std": 0.0247, + "min": 25.3, + "max": 25.4, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "87_conv_pointwise_2D.py": { + "mean": 83.6, + "std": 0.0593, + "min": 83.5, + "max": 83.9, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "88_MinGPTNewGelu.py": { + "mean": 1.16, + "std": 0.0281, + "min": 1.14, + "max": 1.29, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "89_cumsum.py": { + "mean": 17.7, + "std": 0.0877, + "min": 17.6, + "max": 18.0, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "90_cumprod.py": { + "mean": 17.8, + "std": 0.0928, + "min": 17.7, + "max": 18.1, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "91_cumsum_reverse.py": { + "mean": 35.2, + "std": 0.0892, + "min": 35.1, + "max": 35.4, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "92_cumsum_exclusive.py": { + "mean": 17.7, + "std": 0.0365, + "min": 17.7, + "max": 18.1, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "93_masked_cumsum.py": { + "mean": 19.9, + "std": 0.0254, + "min": 19.9, + "max": 20.1, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "94_MSELoss.py": { + "mean": 17.2, + "std": 0.207, + "min": 17.0, + "max": 18.6, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "95_CrossEntropyLoss.py": { + "mean": 1.13, + "std": 0.00513, + "min": 1.12, + "max": 1.15, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "96_HuberLoss.py": { + "mean": 17.1, + "std": 0.0508, + "min": 17.1, + "max": 17.3, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + }, + "97_ScaledDotProductAttention.py": { + "mean": 44.6, + "std": 1.99, + "min": 44.2, + "max": 64.3, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "98_KLDivLoss.py": { + "mean": 4.21, + "std": 0.0217, + "min": 4.2, + "max": 4.32, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "99_TripletMarginLoss.py": { + "mean": 6.35, + "std": 0.00473, + "min": 6.34, + "max": 6.36, + "num_trials": 100, + "hardware": "NVIDIA A10G", + "device": "cuda:0" + }, + "100_HingeLoss.py": { + "mean": 8.55, + "std": 0.505, + "min": 8.4, + "max": 13.5, + "num_trials": 100, + "hardware": "NVIDIA A10", + "device": "cuda:0" + } + } +} \ No newline at end of file diff --git a/results/timing/H100_modal/baseline_time_torch.json b/results/timing/H100_modal/baseline_time_torch.json new file mode 100644 index 00000000..5bdcd393 --- /dev/null +++ b/results/timing/H100_modal/baseline_time_torch.json @@ -0,0 +1,904 @@ +{ + "level1": { + "1_Square_matrix_multiplication_.py": { + "mean": 2.66, + "std": 0.00178, + "min": 2.66, + "max": 2.67, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "2_Standard_matrix_multiplication_.py": { + "mean": 2.64, + "std": 0.0039, + "min": 2.64, + "max": 2.67, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "3_Batched_matrix_multiplication.py": { + "mean": 5.34, + "std": 0.00552, + "min": 5.33, + "max": 5.38, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "4_Matrix_vector_multiplication_.py": { + "mean": 2.78, + "std": 0.00237, + "min": 2.78, + "max": 2.79, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "5_Matrix_scalar_multiplication.py": { + "mean": 2.84, + "std": 0.00653, + "min": 2.83, + "max": 2.87, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "6_Matmul_with_large_K_dimension_.py": { + "mean": 1.32, + "std": 0.00464, + "min": 1.31, + "max": 1.34, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "7_Matmul_with_small_K_dimension_.py": { + "mean": 4.12, + "std": 0.00954, + "min": 4.11, + "max": 4.21, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "8_Matmul_with_irregular_shapes_.py": { + "mean": 6.42, + "std": 0.00544, + "min": 6.41, + "max": 6.45, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "9_Tall_skinny_matrix_multiplication_.py": { + "mean": 2.61, + "std": 0.00374, + "min": 2.61, + "max": 2.63, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "10_3D_tensor_matrix_multiplication.py": { + "mean": 1.05, + "std": 0.00159, + "min": 1.04, + "max": 1.06, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "11_4D_tensor_matrix_multiplication.py": { + "mean": 11.1, + "std": 0.957, + "min": 10.1, + "max": 13.6, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "12_Matmul_with_diagonal_matrices_.py": { + "mean": 2.69, + "std": 0.00425, + "min": 2.68, + "max": 2.7, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "13_Matmul_for_symmetric_matrices.py": { + "mean": 2.65, + "std": 0.0045, + "min": 2.65, + "max": 2.67, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "14_Matmul_for_upper_triangular_matrices.py": { + "mean": 2.71, + "std": 0.00376, + "min": 2.7, + "max": 2.73, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "15_Matmul_for_lower_triangular_matrices.py": { + "mean": 2.71, + "std": 0.00182, + "min": 2.71, + "max": 2.72, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "16_Matmul_with_transposed_A.py": { + "mean": 2.62, + "std": 0.00207, + "min": 2.61, + "max": 2.62, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "17_Matmul_with_transposed_B.py": { + "mean": 2.74, + "std": 0.00801, + "min": 2.71, + "max": 2.76, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "18_Matmul_with_transposed_both.py": { + "mean": 2.78, + "std": 0.00828, + "min": 2.76, + "max": 2.81, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "19_ReLU.py": { + "mean": 4.27, + "std": 0.00845, + "min": 4.26, + "max": 4.35, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "20_LeakyReLU.py": { + "mean": 4.27, + "std": 0.00191, + "min": 4.26, + "max": 4.27, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "21_Sigmoid.py": { + "mean": 4.26, + "std": 0.00198, + "min": 4.26, + "max": 4.27, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "22_Tanh.py": { + "mean": 3.05, + "std": 0.00172, + "min": 3.04, + "max": 3.05, + "num_trials": 100, + "hardware": "NVIDIA H200", + "device": "cuda:0" + }, + "23_Softmax.py": { + "mean": 7.12, + "std": 0.0142, + "min": 7.11, + "max": 7.18, + "num_trials": 100, + "hardware": "NVIDIA H200", + "device": "cuda:0" + }, + "24_LogSoftmax.py": { + "mean": 6.18, + "std": 0.0645, + "min": 6.06, + "max": 6.33, + "num_trials": 100, + "hardware": "NVIDIA H200", + "device": "cuda:0" + }, + "25_Swish.py": { + "mean": 10.6, + "std": 0.00389, + "min": 10.6, + "max": 10.6, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "26_GELU_.py": { + "mean": 4.24, + "std": 0.00177, + "min": 4.23, + "max": 4.24, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "27_SELU_.py": { + "mean": 3.02, + "std": 0.00174, + "min": 3.02, + "max": 3.03, + "num_trials": 100, + "hardware": "NVIDIA H200", + "device": "cuda:0" + }, + "28_HardSigmoid.py": { + "mean": 4.26, + "std": 0.00202, + "min": 4.26, + "max": 4.27, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "29_Softplus.py": { + "mean": 4.23, + "std": 0.00216, + "min": 4.23, + "max": 4.24, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "30_Softsign.py": { + "mean": 10.4, + "std": 0.00375, + "min": 10.4, + "max": 10.4, + "num_trials": 100, + "hardware": "NVIDIA H200", + "device": "cuda:0" + }, + "31_ELU.py": { + "mean": 4.24, + "std": 0.00229, + "min": 4.24, + "max": 4.25, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "32_HardTanh.py": { + "mean": 4.24, + "std": 0.00168, + "min": 4.23, + "max": 4.24, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "33_BatchNorm.py": { + "mean": 8.8, + "std": 0.016, + "min": 8.77, + "max": 8.85, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "34_InstanceNorm.py": { + "mean": 9.57, + "std": 0.0119, + "min": 9.55, + "max": 9.6, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "35_GroupNorm_.py": { + "mean": 9.94, + "std": 0.0103, + "min": 9.93, + "max": 10.0, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "36_RMSNorm_.py": { + "mean": 14.2, + "std": 0.00266, + "min": 14.2, + "max": 14.2, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "37_FrobeniusNorm_.py": { + "mean": 8.42, + "std": 0.00264, + "min": 8.41, + "max": 8.42, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "38_L1Norm_.py": { + "mean": 15.5, + "std": 0.00742, + "min": 15.5, + "max": 15.5, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "39_L2Norm_.py": { + "mean": 10.0, + "std": 0.0024, + "min": 10.0, + "max": 10.0, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "40_LayerNorm.py": { + "mean": 8.12, + "std": 0.00649, + "min": 8.11, + "max": 8.16, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "41_Max_Pooling_1D.py": { + "mean": 10.7, + "std": 0.00997, + "min": 10.7, + "max": 10.7, + "num_trials": 100, + "hardware": "NVIDIA H200", + "device": "cuda:0" + }, + "42_Max_Pooling_2D.py": { + "mean": 10.7, + "std": 0.00823, + "min": 10.7, + "max": 10.8, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "43_Max_Pooling_3D.py": { + "mean": 3.93, + "std": 0.00239, + "min": 3.93, + "max": 3.94, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "44_Average_Pooling_1D.py": { + "mean": 8.02, + "std": 0.0057, + "min": 8.01, + "max": 8.06, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "45_Average_Pooling_2D.py": { + "mean": 6.6, + "std": 0.0259, + "min": 6.55, + "max": 6.7, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "46_Average_Pooling_3D.py": { + "mean": 8.66, + "std": 0.00616, + "min": 8.65, + "max": 8.69, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "47_Sum_reduction_over_a_dimension.py": { + "mean": 2.11, + "std": 0.0154, + "min": 2.08, + "max": 2.16, + "num_trials": 100, + "hardware": "NVIDIA H200", + "device": "cuda:0" + }, + "48_Mean_reduction_over_a_dimension.py": { + "mean": 2.89, + "std": 0.0163, + "min": 2.86, + "max": 2.94, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "49_Max_reduction_over_a_dimension.py": { + "mean": 3.17, + "std": 0.00295, + "min": 3.16, + "max": 3.17, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "50_conv_standard_2D__square_input__square_kernel.py": { + "mean": 2.09, + "std": 0.0172, + "min": 2.08, + "max": 2.15, + "num_trials": 100, + "hardware": "NVIDIA H200", + "device": "cuda:0" + }, + "51_Argmax_over_a_dimension.py": { + "mean": 3.25, + "std": 0.00328, + "min": 3.25, + "max": 3.27, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "52_Argmin_over_a_dimension.py": { + "mean": 3.24, + "std": 0.00269, + "min": 3.24, + "max": 3.25, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "53_Min_reduction_over_a_dimension.py": { + "mean": 3.18, + "std": 0.00328, + "min": 3.17, + "max": 3.19, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "54_conv_standard_3D__square_input__square_kernel.py": { + "mean": 1.36, + "std": 0.00258, + "min": 1.36, + "max": 1.38, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "55_conv_standard_2D__asymmetric_input__square_kernel.py": { + "mean": 4.18, + "std": 0.0626, + "min": 4.0, + "max": 4.27, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "56_conv_standard_2D__asymmetric_input__asymmetric_kernel.py": { + "mean": 3.44, + "std": 0.046, + "min": 3.37, + "max": 3.54, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "57_conv_transposed_2D__square_input__square_kernel.py": { + "mean": 6.56, + "std": 0.0393, + "min": 6.53, + "max": 6.86, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "58_conv_transposed_3D__asymmetric_input__asymmetric_kernel.py": { + "mean": 2.32, + "std": 0.0145, + "min": 2.29, + "max": 2.37, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "59_conv_standard_3D__asymmetric_input__square_kernel.py": { + "mean": 2.09, + "std": 0.0049, + "min": 2.08, + "max": 2.11, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "60_conv_standard_3D__square_input__asymmetric_kernel.py": { + "mean": 5.29, + "std": 0.0129, + "min": 5.26, + "max": 5.31, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "61_conv_transposed_3D__square_input__square_kernel.py": { + "mean": 5.5, + "std": 0.011, + "min": 5.48, + "max": 5.58, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "62_conv_standard_2D__square_input__asymmetric_kernel.py": { + "mean": 3.64, + "std": 0.0747, + "min": 3.56, + "max": 3.85, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "63_conv_standard_2D__square_input__square_kernel.py": { + "mean": 7.05, + "std": 0.0139, + "min": 7.03, + "max": 7.11, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "64_conv_transposed_1D.py": { + "mean": 5.28, + "std": 0.0103, + "min": 5.25, + "max": 5.31, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "65_conv_transposed_2D__square_input__asymmetric_kernel.py": { + "mean": 2.71, + "std": 0.0129, + "min": 2.68, + "max": 2.75, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "66_conv_standard_3D__asymmetric_input__asymmetric_kernel.py": { + "mean": 2.6, + "std": 0.00198, + "min": 2.6, + "max": 2.61, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "67_conv_standard_1D.py": { + "mean": 2.66, + "std": 0.0156, + "min": 2.63, + "max": 2.68, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "68_conv_transposed_3D__square_input__asymmetric_kernel.py": { + "mean": 9.54, + "std": 0.0113, + "min": 9.52, + "max": 9.61, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "69_conv_transposed_2D__asymmetric_input__asymmetric_kernel.py": { + "mean": 2.75, + "std": 0.0209, + "min": 2.72, + "max": 2.81, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "70_conv_transposed_3D__asymmetric_input__square_kernel.py": { + "mean": 9.85, + "std": 0.0364, + "min": 9.83, + "max": 10.1, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "71_conv_transposed_2D__asymmetric_input__square_kernel.py": { + "mean": 1.59, + "std": 0.00375, + "min": 1.58, + "max": 1.6, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "72_conv_transposed_3D_asymmetric_input_asymmetric_kernel___strided_padded_grouped_.py": { + "mean": 2.89, + "std": 0.00619, + "min": 2.88, + "max": 2.91, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "73_conv_transposed_3D_asymmetric_input_square_kernel__strided_padded__grouped.py": { + "mean": 2.08, + "std": 0.00526, + "min": 2.07, + "max": 2.09, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "74_conv_transposed_1D_dilated.py": { + "mean": 1.89, + "std": 0.017, + "min": 1.87, + "max": 2.03, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.py": { + "mean": 6.68, + "std": 0.00872, + "min": 6.67, + "max": 6.75, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "76_conv_standard_1D_dilated_strided__.py": { + "mean": 12.2, + "std": 0.0506, + "min": 12.2, + "max": 12.4, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__.py": { + "mean": 1.95, + "std": 0.0152, + "min": 1.91, + "max": 1.99, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__.py": { + "mean": 2.42, + "std": 0.00491, + "min": 2.41, + "max": 2.43, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__.py": { + "mean": 1.93, + "std": 0.00786, + "min": 1.92, + "max": 1.95, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__.py": { + "mean": 3.53, + "std": 0.00861, + "min": 3.52, + "max": 3.56, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__.py": { + "mean": 1.81, + "std": 0.0117, + "min": 1.78, + "max": 1.83, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "82_conv_depthwise_2D_square_input_square_kernel.py": { + "mean": 2.54, + "std": 0.00128, + "min": 2.54, + "max": 2.54, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "83_conv_depthwise_2D_square_input_asymmetric_kernel.py": { + "mean": 1.47, + "std": 0.00158, + "min": 1.47, + "max": 1.48, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "84_conv_depthwise_2D_asymmetric_input_square_kernel.py": { + "mean": 10.1, + "std": 0.00491, + "min": 10.1, + "max": 10.2, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "85_conv_depthwise_2D_asymmetric_input_asymmetric_kernel.py": { + "mean": 2.31, + "std": 0.00502, + "min": 2.31, + "max": 2.34, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "86_conv_depthwise_separable_2D.py": { + "mean": 3.7, + "std": 0.013, + "min": 3.67, + "max": 3.72, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "87_conv_pointwise_2D.py": { + "mean": 4.67, + "std": 0.00559, + "min": 4.66, + "max": 4.69, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "88_MinGPTNewGelu.py": { + "mean": 1.61, + "std": 0.00118, + "min": 1.6, + "max": 1.61, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "89_cumsum.py": { + "mean": 4.65, + "std": 0.00818, + "min": 4.64, + "max": 4.67, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "90_cumprod.py": { + "mean": 4.64, + "std": 0.00386, + "min": 4.63, + "max": 4.65, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "91_cumsum_reverse.py": { + "mean": 11.4, + "std": 0.0112, + "min": 11.4, + "max": 11.4, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "92_cumsum_exclusive.py": { + "mean": 8.86, + "std": 0.0191, + "min": 8.83, + "max": 8.93, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "93_masked_cumsum.py": { + "mean": 8.67, + "std": 0.00487, + "min": 8.66, + "max": 8.7, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "94_MSELoss.py": { + "mean": 8.43, + "std": 0.00298, + "min": 8.42, + "max": 8.44, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "95_CrossEntropyLoss.py": { + "mean": 1.45, + "std": 0.0022, + "min": 1.44, + "max": 1.45, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "96_HuberLoss.py": { + "mean": 5.52, + "std": 0.00201, + "min": 5.51, + "max": 5.53, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "97_ScaledDotProductAttention.py": { + "mean": 8.23, + "std": 0.193, + "min": 8.01, + "max": 9.38, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "98_KLDivLoss.py": { + "mean": 3.89, + "std": 0.00194, + "min": 3.89, + "max": 3.9, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "99_TripletMarginLoss.py": { + "mean": 4.25, + "std": 0.0129, + "min": 4.24, + "max": 4.37, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "100_HingeLoss.py": { + "mean": 10.4, + "std": 0.00488, + "min": 10.4, + "max": 10.5, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + } + } +} \ No newline at end of file diff --git a/results/timing/H100_modal/baseline_time_torch_compile_inductor_default.json b/results/timing/H100_modal/baseline_time_torch_compile_inductor_default.json new file mode 100644 index 00000000..ee1fb338 --- /dev/null +++ b/results/timing/H100_modal/baseline_time_torch_compile_inductor_default.json @@ -0,0 +1,904 @@ +{ + "level1": { + "1_Square_matrix_multiplication_.py": { + "mean": 2.66, + "std": 0.00503, + "min": 2.66, + "max": 2.68, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "2_Standard_matrix_multiplication_.py": { + "mean": 2.67, + "std": 0.00828, + "min": 2.65, + "max": 2.68, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "3_Batched_matrix_multiplication.py": { + "mean": 5.32, + "std": 0.0181, + "min": 5.3, + "max": 5.37, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "4_Matrix_vector_multiplication_.py": { + "mean": 2.9, + "std": 0.00233, + "min": 2.9, + "max": 2.91, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "5_Matrix_scalar_multiplication.py": { + "mean": 2.88, + "std": 0.0132, + "min": 2.86, + "max": 2.98, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "6_Matmul_with_large_K_dimension_.py": { + "mean": 1.33, + "std": 0.00488, + "min": 1.33, + "max": 1.35, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "7_Matmul_with_small_K_dimension_.py": { + "mean": 4.14, + "std": 0.0273, + "min": 4.12, + "max": 4.35, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "8_Matmul_with_irregular_shapes_.py": { + "mean": 6.44, + "std": 0.00478, + "min": 6.43, + "max": 6.46, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "9_Tall_skinny_matrix_multiplication_.py": { + "mean": 2.63, + "std": 0.0036, + "min": 2.63, + "max": 2.65, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "10_3D_tensor_matrix_multiplication.py": { + "mean": 1.07, + "std": 0.00272, + "min": 1.06, + "max": 1.08, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "11_4D_tensor_matrix_multiplication.py": { + "mean": 10.8, + "std": 1.01, + "min": 10.0, + "max": 13.2, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "12_Matmul_with_diagonal_matrices_.py": { + "mean": 2.69, + "std": 0.0112, + "min": 2.68, + "max": 2.72, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "13_Matmul_for_symmetric_matrices.py": { + "mean": 2.66, + "std": 0.00314, + "min": 2.66, + "max": 2.68, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "14_Matmul_for_upper_triangular_matrices.py": { + "mean": 2.72, + "std": 0.0045, + "min": 2.72, + "max": 2.75, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "15_Matmul_for_lower_triangular_matrices.py": { + "mean": 2.73, + "std": 0.005, + "min": 2.72, + "max": 2.75, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "16_Matmul_with_transposed_A.py": { + "mean": 2.65, + "std": 0.00467, + "min": 2.64, + "max": 2.67, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "17_Matmul_with_transposed_B.py": { + "mean": 2.77, + "std": 0.0083, + "min": 2.76, + "max": 2.8, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "18_Matmul_with_transposed_both.py": { + "mean": 2.79, + "std": 0.0101, + "min": 2.77, + "max": 2.82, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "19_ReLU.py": { + "mean": 4.31, + "std": 0.00483, + "min": 4.29, + "max": 4.33, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "20_LeakyReLU.py": { + "mean": 4.29, + "std": 0.00853, + "min": 4.28, + "max": 4.33, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "21_Sigmoid.py": { + "mean": 4.29, + "std": 0.0126, + "min": 4.27, + "max": 4.39, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "22_Tanh.py": { + "mean": 3.07, + "std": 0.0213, + "min": 3.06, + "max": 3.27, + "num_trials": 100, + "hardware": "NVIDIA H200", + "device": "cuda:0" + }, + "23_Softmax.py": { + "mean": 8.68, + "std": 0.0881, + "min": 8.65, + "max": 9.55, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "24_LogSoftmax.py": { + "mean": 8.65, + "std": 0.0181, + "min": 8.63, + "max": 8.74, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "25_Swish.py": { + "mean": 4.28, + "std": 0.00737, + "min": 4.27, + "max": 4.32, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "26_GELU_.py": { + "mean": 4.27, + "std": 0.00737, + "min": 4.26, + "max": 4.29, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "27_SELU_.py": { + "mean": 4.27, + "std": 0.0127, + "min": 4.26, + "max": 4.37, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "28_HardSigmoid.py": { + "mean": 4.29, + "std": 0.03, + "min": 4.27, + "max": 4.58, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "29_Softplus.py": { + "mean": 4.29, + "std": 0.0411, + "min": 4.27, + "max": 4.58, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "30_Softsign.py": { + "mean": 3.06, + "std": 0.00498, + "min": 3.05, + "max": 3.08, + "num_trials": 100, + "hardware": "NVIDIA H200", + "device": "cuda:0" + }, + "31_ELU.py": { + "mean": 4.28, + "std": 0.0248, + "min": 4.27, + "max": 4.52, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "32_HardTanh.py": { + "mean": 4.29, + "std": 0.0059, + "min": 4.28, + "max": 4.31, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "33_BatchNorm.py": { + "mean": 4.24, + "std": 0.00629, + "min": 4.22, + "max": 4.26, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "34_InstanceNorm.py": { + "mean": 7.66, + "std": 0.027, + "min": 7.64, + "max": 7.9, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "35_GroupNorm_.py": { + "mean": 7.49, + "std": 0.00591, + "min": 7.48, + "max": 7.51, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "36_RMSNorm_.py": { + "mean": 7.8, + "std": 0.00398, + "min": 7.8, + "max": 7.82, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "37_FrobeniusNorm_.py": { + "mean": 7.32, + "std": 0.0091, + "min": 7.31, + "max": 7.36, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "38_L1Norm_.py": { + "mean": 13.0, + "std": 0.0182, + "min": 13.0, + "max": 13.1, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "39_L2Norm_.py": { + "mean": 13.4, + "std": 0.029, + "min": 13.3, + "max": 13.6, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "40_LayerNorm.py": { + "mean": 0.476, + "std": 0.0031, + "min": 0.472, + "max": 0.491, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "41_Max_Pooling_1D.py": { + "mean": 10.7, + "std": 0.0108, + "min": 10.7, + "max": 10.8, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "42_Max_Pooling_2D.py": { + "mean": 4.45, + "std": 0.00646, + "min": 4.44, + "max": 4.49, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "43_Max_Pooling_3D.py": { + "mean": 3.95, + "std": 0.00396, + "min": 3.94, + "max": 3.97, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "44_Average_Pooling_1D.py": { + "mean": 1.89, + "std": 0.00262, + "min": 1.88, + "max": 1.9, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "45_Average_Pooling_2D.py": { + "mean": 6.43, + "std": 0.0493, + "min": 6.36, + "max": 6.79, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "46_Average_Pooling_3D.py": { + "mean": 8.71, + "std": 0.0143, + "min": 8.69, + "max": 8.81, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "47_Sum_reduction_over_a_dimension.py": { + "mean": 3.5, + "std": 0.0161, + "min": 3.45, + "max": 3.54, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "48_Mean_reduction_over_a_dimension.py": { + "mean": 3.39, + "std": 0.0227, + "min": 3.34, + "max": 3.55, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "49_Max_reduction_over_a_dimension.py": { + "mean": 3.49, + "std": 0.0149, + "min": 3.45, + "max": 3.56, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "50_conv_standard_2D__square_input__square_kernel.py": { + "mean": 1.66, + "std": 0.0152, + "min": 1.64, + "max": 1.72, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "51_Argmax_over_a_dimension.py": { + "mean": 2.94, + "std": 0.00652, + "min": 2.93, + "max": 2.96, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "52_Argmin_over_a_dimension.py": { + "mean": 3.07, + "std": 0.00532, + "min": 3.06, + "max": 3.09, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "53_Min_reduction_over_a_dimension.py": { + "mean": 3.48, + "std": 0.0141, + "min": 3.44, + "max": 3.52, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "54_conv_standard_3D__square_input__square_kernel.py": { + "mean": 1.4, + "std": 0.00443, + "min": 1.39, + "max": 1.42, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "55_conv_standard_2D__asymmetric_input__square_kernel.py": { + "mean": 4.63, + "std": 0.006, + "min": 4.61, + "max": 4.64, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "56_conv_standard_2D__asymmetric_input__asymmetric_kernel.py": { + "mean": 3.68, + "std": 0.037, + "min": 3.62, + "max": 3.76, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "57_conv_transposed_2D__square_input__square_kernel.py": { + "mean": 6.56, + "std": 0.00741, + "min": 6.54, + "max": 6.58, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "58_conv_transposed_3D__asymmetric_input__asymmetric_kernel.py": { + "mean": 2.32, + "std": 0.0159, + "min": 2.29, + "max": 2.37, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "59_conv_standard_3D__asymmetric_input__square_kernel.py": { + "mean": 2.11, + "std": 0.00394, + "min": 2.1, + "max": 2.13, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "60_conv_standard_3D__square_input__asymmetric_kernel.py": { + "mean": 5.33, + "std": 0.0602, + "min": 5.32, + "max": 5.79, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "61_conv_transposed_3D__square_input__square_kernel.py": { + "mean": 5.51, + "std": 0.0104, + "min": 5.48, + "max": 5.54, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "62_conv_standard_2D__square_input__asymmetric_kernel.py": { + "mean": 2.68, + "std": 0.0374, + "min": 2.66, + "max": 3.0, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "63_conv_standard_2D__square_input__square_kernel.py": { + "mean": 13.9, + "std": 0.0149, + "min": 13.9, + "max": 14.0, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "64_conv_transposed_1D.py": { + "mean": 5.32, + "std": 0.0562, + "min": 5.29, + "max": 5.72, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "65_conv_transposed_2D__square_input__asymmetric_kernel.py": { + "mean": 2.72, + "std": 0.0112, + "min": 2.68, + "max": 2.74, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "66_conv_standard_3D__asymmetric_input__asymmetric_kernel.py": { + "mean": 2.65, + "std": 0.0043, + "min": 2.65, + "max": 2.68, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "67_conv_standard_1D.py": { + "mean": 2.69, + "std": 0.0361, + "min": 2.65, + "max": 3.01, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "68_conv_transposed_3D__square_input__asymmetric_kernel.py": { + "mean": 9.55, + "std": 0.00915, + "min": 9.53, + "max": 9.58, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "69_conv_transposed_2D__asymmetric_input__asymmetric_kernel.py": { + "mean": 2.74, + "std": 0.0113, + "min": 2.72, + "max": 2.78, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "70_conv_transposed_3D__asymmetric_input__square_kernel.py": { + "mean": 10.0, + "std": 0.0098, + "min": 9.98, + "max": 10.0, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "71_conv_transposed_2D__asymmetric_input__square_kernel.py": { + "mean": 1.59, + "std": 0.00411, + "min": 1.58, + "max": 1.6, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "72_conv_transposed_3D_asymmetric_input_asymmetric_kernel___strided_padded_grouped_.py": { + "mean": 2.93, + "std": 0.00823, + "min": 2.92, + "max": 2.96, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "73_conv_transposed_3D_asymmetric_input_square_kernel__strided_padded__grouped.py": { + "mean": 2.07, + "std": 0.00509, + "min": 2.06, + "max": 2.09, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "74_conv_transposed_1D_dilated.py": { + "mean": 1.93, + "std": 0.00876, + "min": 1.91, + "max": 1.95, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.py": { + "mean": 6.6, + "std": 0.0337, + "min": 6.58, + "max": 6.91, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "76_conv_standard_1D_dilated_strided__.py": { + "mean": 12.4, + "std": 0.0826, + "min": 12.4, + "max": 12.7, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__.py": { + "mean": 1.98, + "std": 0.0146, + "min": 1.95, + "max": 2.03, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__.py": { + "mean": 2.37, + "std": 0.0217, + "min": 2.35, + "max": 2.55, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__.py": { + "mean": 1.93, + "std": 0.00781, + "min": 1.92, + "max": 1.95, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__.py": { + "mean": 2.65, + "std": 0.0159, + "min": 2.63, + "max": 2.68, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__.py": { + "mean": 1.71, + "std": 0.00818, + "min": 1.7, + "max": 1.76, + "num_trials": 100, + "hardware": "NVIDIA H200", + "device": "cuda:0" + }, + "82_conv_depthwise_2D_square_input_square_kernel.py": { + "mean": 2.57, + "std": 0.0708, + "min": 2.53, + "max": 3.09, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "83_conv_depthwise_2D_square_input_asymmetric_kernel.py": { + "mean": 19.0, + "std": 0.273, + "min": 18.8, + "max": 20.3, + "num_trials": 100, + "hardware": "NVIDIA H200", + "device": "cuda:0" + }, + "84_conv_depthwise_2D_asymmetric_input_square_kernel.py": { + "mean": 9.79, + "std": 0.00959, + "min": 9.77, + "max": 9.81, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "85_conv_depthwise_2D_asymmetric_input_asymmetric_kernel.py": { + "mean": 13.9, + "std": 0.00974, + "min": 13.9, + "max": 14.0, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "86_conv_depthwise_separable_2D.py": { + "mean": 3.41, + "std": 0.28, + "min": 3.31, + "max": 5.34, + "num_trials": 100, + "hardware": "NVIDIA H200", + "device": "cuda:0" + }, + "87_conv_pointwise_2D.py": { + "mean": 10.6, + "std": 0.098, + "min": 10.5, + "max": 10.9, + "num_trials": 100, + "hardware": "NVIDIA H200", + "device": "cuda:0" + }, + "88_MinGPTNewGelu.py": { + "mean": 0.161, + "std": 0.00496, + "min": 0.154, + "max": 0.178, + "num_trials": 100, + "hardware": "NVIDIA H200", + "device": "cuda:0" + }, + "89_cumsum.py": { + "mean": 2.69, + "std": 0.0867, + "min": 2.66, + "max": 3.45, + "num_trials": 100, + "hardware": "NVIDIA H200", + "device": "cuda:0" + }, + "90_cumprod.py": { + "mean": 2.64, + "std": 0.0149, + "min": 2.63, + "max": 2.73, + "num_trials": 100, + "hardware": "NVIDIA H200", + "device": "cuda:0" + }, + "91_cumsum_reverse.py": { + "mean": 5.87, + "std": 0.00331, + "min": 5.86, + "max": 5.88, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "92_cumsum_exclusive.py": { + "mean": 5.67, + "std": 0.01, + "min": 5.66, + "max": 5.75, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "93_masked_cumsum.py": { + "mean": 3.31, + "std": 0.0057, + "min": 3.3, + "max": 3.33, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "94_MSELoss.py": { + "mean": 2.78, + "std": 0.00263, + "min": 2.78, + "max": 2.79, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "95_CrossEntropyLoss.py": { + "mean": 0.234, + "std": 0.0313, + "min": 0.223, + "max": 0.541, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "96_HuberLoss.py": { + "mean": 2.79, + "std": 0.00316, + "min": 2.78, + "max": 2.8, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "97_ScaledDotProductAttention.py": { + "mean": 8.25, + "std": 0.228, + "min": 8.1, + "max": 9.25, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "98_KLDivLoss.py": { + "mean": 0.745, + "std": 0.0237, + "min": 0.735, + "max": 0.965, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "99_TripletMarginLoss.py": { + "mean": 1.04, + "std": 0.00274, + "min": 1.04, + "max": 1.05, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + }, + "100_HingeLoss.py": { + "mean": 1.41, + "std": 0.00369, + "min": 1.4, + "max": 1.42, + "num_trials": 100, + "hardware": "NVIDIA H100 80GB HBM3", + "device": "cuda:0" + } + } +} \ No newline at end of file diff --git a/scripts/benchmark_eval_analysis.py b/scripts/benchmark_eval_analysis.py index e2bea005..428e240f 100644 --- a/scripts/benchmark_eval_analysis.py +++ b/scripts/benchmark_eval_analysis.py @@ -40,7 +40,7 @@ def patch(eval_results, dataset): """ Patch the eval results with the dataset """ - for pid in range(1, len(dataset) + 1): + for pid in dataset.get_problem_ids(): if str(pid) not in eval_results: eval_results[str(pid)] = { "sample_id": 0, @@ -136,19 +136,40 @@ def analyze_greedy_eval(run_name, hardware, baseline, level): ) # Extract the speedup values - is_correct = np.array([entry["correctness"] for entry in eval_results.values()]) - baseline_speed = np.array( - [entry["mean"] for entry in baseline_results[f"level{level}"].values()] - ) - actual_speed = np.array([entry["runtime"] for entry in eval_results.values()]) + is_correct_list = [] + baseline_speed_list = [] + actual_speed_list = [] + + # Sort problem IDs to ensure consistent order + sorted_pids = sorted(dataset.get_problem_ids()) + + for pid in sorted_pids: + # Get eval result + if str(pid) not in eval_results: + print(f"Warning: Problem {pid} not found in eval results") + continue + eval_entry = eval_results[str(pid)] + + # Get baseline result + problem_path = dataset.get_problem_by_id(pid) + problem_name = os.path.basename(problem_path) + + if problem_name not in baseline_results[f"level{level}"]: + print(f"Warning: Problem {problem_name} not found in baseline results") + continue + + baseline_entry = baseline_results[f"level{level}"][problem_name] + + is_correct_list.append(eval_entry["correctness"]) + actual_speed_list.append(eval_entry["runtime"]) + baseline_speed_list.append(baseline_entry["mean"]) + + is_correct = np.array(is_correct_list) + baseline_speed = np.array(baseline_speed_list) + actual_speed = np.array(actual_speed_list) n = len(is_correct) - assert ( - len(baseline_speed) == n - ), "Baseline speedup values do not match the number of eval results" - assert ( - len(actual_speed) == n - ), "Actual speedup values do not match the number of eval results" + print(f"Aligned {n} problems for analysis") # Calculate the metrics gmsr_correct = geometric_mean_speed_ratio_correct_only( diff --git a/scripts/eval_from_generations.py b/scripts/eval_from_generations.py index 2e39e3be..a973187f 100644 --- a/scripts/eval_from_generations.py +++ b/scripts/eval_from_generations.py @@ -257,10 +257,7 @@ def fetch_ref_arch_from_problem_id( problem_name = curr_problem_row["name"][0] elif dataset_src == "local": - problem_idx_in_dataset = ( - problem_id - 1 - ) # due to dataset list being 0-indexed locally - ref_arch_path = dataset[problem_idx_in_dataset] + ref_arch_path = dataset.get_problem_by_id(problem_id) problem_name = os.path.basename(ref_arch_path) ref_arch_src = read_file(ref_arch_path) @@ -764,17 +761,18 @@ def main(config: EvalConfig): curr_level_dataset = construct_kernelbench_dataset(config.level) num_problems_in_level = len(curr_level_dataset) + all_problem_ids = curr_level_dataset.get_problem_ids() if config.dataset_src == "local" else list(range(1, num_problems_in_level + 1)) if config.subset == (None, None): - problem_id_range = range(1, num_problems_in_level) + problem_ids_to_run = all_problem_ids else: - assert ( - config.subset[0] >= 1 and config.subset[1] <= num_problems_in_level - ), f"Subset range {config.subset} out of range for Level {config.level}" - problem_id_range = range(config.subset[0], config.subset[1]) + start, end = config.subset + problem_ids_to_run = [pid for pid in all_problem_ids if start <= pid <= end] + if not problem_ids_to_run: + print(f"Warning: No problems found in subset range {config.subset}") print( - f"Evaluating {config.num_samples_per_problem} sample(s) each for level {config.level} problems: {problem_id_range}" + f"Evaluating {config.num_samples_per_problem} sample(s) each for level {config.level} problems: {problem_ids_to_run}" ) run_dir = os.path.join(config.runs_dir, config.run_name) @@ -784,22 +782,19 @@ def main(config: EvalConfig): # single_eval_example(config, curr_level_dataset, run_dir, eval_file_path) total_work = [] - for problem_id in range( - problem_id_range.start, problem_id_range.stop + 1 - ): # end index is inclusive + for problem_id in problem_ids_to_run: for sample_id in range(config.num_samples_per_problem): if not check_if_eval_exists_local(problem_id, sample_id, eval_file_path): total_work.append((problem_id, sample_id)) print( f"Start evaluation on {len(total_work)} unevaluated samples" - f" in range: {problem_id_range}" + f" in range: {problem_ids_to_run}" ) # Build Cache on CPU as that is faster (only for local mode) if config.build_cache and config.eval_mode == "local": compile.batch_compile(total_work, config.to_dict()) - # Batch Eval on multiple GPUs in parallel batch_eval(total_work, config, curr_level_dataset, run_dir, eval_file_path) # Calculate pass@k metrics if multiple samples per problem were evaluated diff --git a/scripts/generate_and_eval_single_sample.py b/scripts/generate_and_eval_single_sample.py index 18fb3c55..0b86964a 100644 --- a/scripts/generate_and_eval_single_sample.py +++ b/scripts/generate_and_eval_single_sample.py @@ -139,10 +139,7 @@ def main(config: EvalConfig): problem_name = curr_problem_row["name"][0] elif config.dataset_src == "local": - problem_idx_in_dataset = ( - config.problem_id - 1 - ) # due to dataset list being 0-indexed locally - ref_arch_path = curr_level_dataset[problem_idx_in_dataset] + ref_arch_path = curr_level_dataset.get_problem_by_id(config.problem_id) problem_name = os.path.basename(ref_arch_path) ref_arch_src = read_file(ref_arch_path) diff --git a/scripts/generate_and_eval_single_sample_modal.py b/scripts/generate_and_eval_single_sample_modal.py index 6962f515..471cee9a 100644 --- a/scripts/generate_and_eval_single_sample_modal.py +++ b/scripts/generate_and_eval_single_sample_modal.py @@ -13,7 +13,7 @@ from datasets import load_dataset -#from src.dataset import construct_kernelbench_dataset +from src.dataset import construct_kernelbench_dataset from src.eval import eval_kernel_against_ref from src.prompt_constructor import prompt_generate_custom_cuda_from_prompt_template from src.prompt_constructor_multilang import get_prompt_for_backend @@ -148,6 +148,8 @@ def main(config: EvalConfig): if config.dataset_src == "huggingface": dataset = load_dataset(config.dataset_name) curr_level_dataset = dataset[f"level_{config.level}"] + elif config.dataset_src == "local": + curr_level_dataset = construct_kernelbench_dataset(config.level) if config.log: os.makedirs(config.logdir, exist_ok=True) @@ -168,8 +170,7 @@ def main(config: EvalConfig): problem_name = curr_problem_row["name"][0] elif config.dataset_src == "local": - problem_idx_in_dataset = config.problem_id - 1 # due to dataset list being 0-indexed locally - ref_arch_path = curr_level_dataset[problem_idx_in_dataset] + ref_arch_path = curr_level_dataset.get_problem_by_id(config.problem_id) problem_name = os.path.basename(ref_arch_path) ref_arch_src = read_file(ref_arch_path) diff --git a/scripts/generate_baseline_time.py b/scripts/generate_baseline_time.py index 5a68ea08..739ffcc1 100644 --- a/scripts/generate_baseline_time.py +++ b/scripts/generate_baseline_time.py @@ -7,7 +7,7 @@ set_seed, fetch_ref_arch_from_problem_id, ) -from src.dataset import construct_problem_dataset_from_problem_dir +from src.dataset import construct_kernelbench_dataset from src.utils import read_file import os import json @@ -46,7 +46,7 @@ TIMING_DIR = os.path.join(REPO_TOP_PATH, "results", "timing") -def fetch_ref_arch_from_dataset(dataset: list[str], +def fetch_ref_arch_from_dataset(dataset, problem_id: int) -> tuple[str, str, str]: """ Fetch the reference architecture from the problem directory @@ -57,14 +57,7 @@ def fetch_ref_arch_from_dataset(dataset: list[str], ref_arch_name: str, the name of the reference architecture ref_arch_src: str, the source code of the reference architecture """ - ref_arch_path = None - - for file in dataset: - if file.split("/")[-1].split("_")[0] == str(problem_id): - ref_arch_path = file - break - if ref_arch_path is None: - raise ValueError(f"No reference architecture found for problem_id {problem_id}") + ref_arch_path = dataset.get_problem_by_id(problem_id) ref_arch_src = read_file(ref_arch_path) @@ -143,12 +136,11 @@ def record_baseline_times(use_torch_compile: bool = False, json_results = {} for level in [1, 2, 3]: - PROBLEM_DIR = os.path.join(KERNEL_BENCH_PATH, "level" + str(level)) - dataset = construct_problem_dataset_from_problem_dir(PROBLEM_DIR) + dataset = construct_kernelbench_dataset(level) json_results[f"level{level}"] = {} num_problems = len(dataset) - for problem_id in tqdm(range(1, num_problems + 1)): + for problem_id in tqdm(dataset.get_problem_ids()): ref_arch_path, ref_arch_name, ref_arch_src = fetch_ref_arch_from_dataset(dataset, problem_id) runtime_stats = measure_program_time( ref_arch_name=ref_arch_name, @@ -174,8 +166,7 @@ def test_measure_particular_program(level_num: int, problem_id: int): """ device = torch.device("cuda:0") - PROBLEM_DIR = os.path.join(KERNEL_BENCH_PATH, "level" + str(level_num)) - dataset = construct_problem_dataset_from_problem_dir(PROBLEM_DIR) + dataset = construct_kernelbench_dataset(level_num) ref_arch_path, ref_arch_name, ref_arch_src = fetch_ref_arch_from_dataset(dataset, problem_id) diff --git a/scripts/generate_baseline_time_modal.py b/scripts/generate_baseline_time_modal.py index a0039193..f89a6a84 100644 --- a/scripts/generate_baseline_time_modal.py +++ b/scripts/generate_baseline_time_modal.py @@ -7,7 +7,7 @@ set_seed, fetch_ref_arch_from_problem_id, ) -from src.dataset import construct_problem_dataset_from_problem_dir +from src.dataset import construct_kernelbench_dataset from src.utils import read_file import os import json @@ -126,7 +126,7 @@ def write_batch_to_json(entries_to_write: list, f_path: str): print(f"[INFO] Wrote {len(entries_to_write)} entries to {f_path}") -def fetch_ref_arch_from_dataset(dataset: list[str], +def fetch_ref_arch_from_dataset(dataset, problem_id: int) -> tuple[str, str, str]: """ Fetch the reference architecture from the problem directory @@ -137,14 +137,7 @@ def fetch_ref_arch_from_dataset(dataset: list[str], ref_arch_name: str, the name of the reference architecture ref_arch_src: str, the source code of the reference architecture """ - ref_arch_path = None - - for file in dataset: - if file.split("/")[-1].split("_")[0] == str(problem_id): - ref_arch_path = file - break - if ref_arch_path is None: - raise ValueError(f"No reference architecture found for problem_id {problem_id}") + ref_arch_path = dataset.get_problem_by_id(problem_id) ref_arch_src = read_file(ref_arch_path) @@ -229,10 +222,9 @@ def record_baseline_times(config: BaselineConfig, json_results = [] level = config.level - PROBLEM_DIR = os.path.join(KERNEL_BENCH_PATH, "level" + str(level)) - dataset = construct_problem_dataset_from_problem_dir(PROBLEM_DIR) + dataset = construct_kernelbench_dataset(level) num_problems = len(dataset) - total_work = [(i, *fetch_ref_arch_from_dataset(dataset, i)) for i in list(range(1, num_problems + 1))] + total_work = [(i, *fetch_ref_arch_from_dataset(dataset, i)) for i in dataset.get_problem_ids()] with tqdm(total=len(total_work), desc="Processing batches") as pbar: while len(total_work) > 0: diff --git a/scripts/generate_samples.py b/scripts/generate_samples.py index 5b476445..c6869c15 100644 --- a/scripts/generate_samples.py +++ b/scripts/generate_samples.py @@ -112,10 +112,7 @@ def generate_sample_single( problem_name = curr_problem_row["name"][0] elif config.dataset_src == "local": - problem_idx_in_dataset = ( - work.problem_id - 1 - ) # due to dataset list being 0-indexed locally - ref_arch_path = dataset[problem_idx_in_dataset] + ref_arch_path = dataset.get_problem_by_id(work.problem_id) problem_name = os.path.basename(ref_arch_path) ref_arch_src = read_file(ref_arch_path) @@ -224,17 +221,18 @@ def main(config: GenerationConfig): curr_level_dataset = construct_kernelbench_dataset(config.level) num_problems_in_level = len(curr_level_dataset) + all_problem_ids = curr_level_dataset.get_problem_ids() if config.dataset_src == "local" else list(range(1, num_problems_in_level + 1)) if config.subset == (None, None): - problem_id_range = range(1, num_problems_in_level) + problem_ids_to_run = all_problem_ids else: - assert ( - config.subset[0] >= 1 and config.subset[1] <= num_problems_in_level - ), f"Subset range {config.subset} out of range for Level {config.level}" - problem_id_range = range(config.subset[0], config.subset[1]) + start, end = config.subset + problem_ids_to_run = [pid for pid in all_problem_ids if start <= pid <= end] + if not problem_ids_to_run: + print(f"Warning: No problems found in subset range {config.subset}") print( - f"Generating {config.num_samples} sample(s) each for level {config.level} problems: {problem_id_range}" + f"Generating {config.num_samples} sample(s) each for level {config.level} problems: {problem_ids_to_run}" ) # set up run directory @@ -253,9 +251,7 @@ def main(config: GenerationConfig): problems_to_run = [] total_problems = 0 already_completed = 0 - for problem_id in range( - problem_id_range.start, problem_id_range.stop + 1 - ): # end index is inclusive + for problem_id in problem_ids_to_run: for sample_id in range(config.num_samples): total_problems += 1 if not check_kernel_exists(run_dir, config.level, problem_id, sample_id): diff --git a/scripts/inspect_baseline.py b/scripts/inspect_baseline.py index e7811f64..29afe0c9 100644 --- a/scripts/inspect_baseline.py +++ b/scripts/inspect_baseline.py @@ -10,7 +10,7 @@ set_seed, fetch_ref_arch_from_problem_id, ) -from src.dataset import construct_problem_dataset_from_problem_dir +from src.dataset import construct_kernelbench_dataset import os, sys import logging import json @@ -93,8 +93,7 @@ def emit(self, record): separator("") def fetch_ref_arch_from_level_problem_id(level_num, problem_id, with_name=False): - PROBLEM_DIR = os.path.join(KERNEL_BENCH_PATH, "level" + str(level_num)) - dataset = construct_problem_dataset_from_problem_dir(PROBLEM_DIR) + dataset = construct_kernelbench_dataset(level_num) return fetch_ref_arch_from_problem_id(problem_id, dataset, with_name) def inspect_torch_compile_triton(level_num, problem_id): diff --git a/scripts/inspect_triton.py b/scripts/inspect_triton.py index 4f13c8af..1e1c5a10 100644 --- a/scripts/inspect_triton.py +++ b/scripts/inspect_triton.py @@ -26,8 +26,9 @@ set_seed, ) -def fetch_ref_arch_from_dataset(dataset: list[str], - problem_id: int) -> tuple[str, str, str]: +from src.dataset import construct_kernelbench_dataset + +def fetch_ref_arch_from_dataset(dataset, problem_id: int) -> tuple[str, str, str]: """ Fetch the reference architecture from the problem directory problem_id should be logical index (1-indexed), matching the problem_id in the problem_name @@ -37,18 +38,9 @@ def fetch_ref_arch_from_dataset(dataset: list[str], ref_arch_name: str, the name of the reference architecture ref_arch_src: str, the source code of the reference architecture """ - ref_arch_path = None - - for file in dataset: - if file.split("/")[-1].split("_")[0] == str(problem_id): - ref_arch_path = file - break - if ref_arch_path is None: - raise ValueError(f"No reference architecture found for problem_id {problem_id}") - + ref_arch_path = dataset.get_problem_by_id(problem_id) ref_arch_src = read_file(ref_arch_path) - - ref_arch_name = ref_arch_path.split("/")[-1] + ref_arch_name = os.path.basename(ref_arch_path) return (ref_arch_path, ref_arch_name, ref_arch_src) @@ -125,7 +117,7 @@ def get_torch_compile_triton(level_num, problem_id): Get the triton code generated by torch compile for a particular problem """ ref_arch_path, ref_arch_name, ref_arch_src = fetch_ref_arch_from_dataset( - dataset, problem_id, with_name=True + dataset, problem_id ) context = {} # import pdb; pdb.set_trace() diff --git a/scripts/run_and_check.py b/scripts/run_and_check.py index 316b96ee..43fb4e81 100644 --- a/scripts/run_and_check.py +++ b/scripts/run_and_check.py @@ -81,6 +81,7 @@ def __init__(self): # ref_origin is local, specify local file path self.ref_arch_src_path = "" # ref_origin is kernelbench, specify level and problem id + self.dataset_src = "huggingface" # either huggingface or local self.dataset_name = "ScalingIntelligence/KernelBench" self.level = "" self.problem_id = "" @@ -240,16 +241,25 @@ def main(config: ScriptConfig): assert config.level != "", "level is required" assert config.problem_id != "", "problem_id is required" - # for now use the HuggingFace dataset - dataset = load_dataset(config.dataset_name) - curr_level_dataset = dataset[f"level_{config.level}"] - - curr_problem_row = curr_level_dataset.filter(lambda x: x["problem_id"] == config.problem_id) - ref_arch_src = curr_problem_row["code"][0] - problem_name = curr_problem_row["name"][0] + if config.dataset_src == "huggingface": + # for now use the HuggingFace dataset + dataset = load_dataset(config.dataset_name) + curr_level_dataset = dataset[f"level_{config.level}"] + + curr_problem_row = curr_level_dataset.filter(lambda x: x["problem_id"] == config.problem_id) + ref_arch_src = curr_problem_row["code"][0] + problem_name = curr_problem_row["name"][0] + elif config.dataset_src == "local": + from src.dataset import construct_kernelbench_dataset + dataset = construct_kernelbench_dataset(config.level) + ref_arch_path = dataset.get_problem_by_id(int(config.problem_id)) + ref_arch_src = read_file(ref_arch_path) + problem_name = os.path.basename(ref_arch_path) + else: + raise ValueError(f"Invalid dataset_src: {config.dataset_src}") problem_number = int(problem_name.split("_")[0]) - assert problem_number == config.problem_id, f"Problem number in filename ({problem_number}) does not match config problem_id ({config.problem_id})" + assert problem_number == int(config.problem_id), f"Problem number in filename ({problem_number}) does not match config problem_id ({config.problem_id})" print(f"Fetched problem {config.problem_id} from KernelBench level {config.level}: {problem_name}") diff --git a/scripts/verify_bench.py b/scripts/verify_bench.py index 5fdc6862..2ad79395 100644 --- a/scripts/verify_bench.py +++ b/scripts/verify_bench.py @@ -71,37 +71,43 @@ def run(Model, NewModel, get_inputs, get_init_inputs, seed=1012): return check_correctness(Model, NewModel, get_inputs, get_init_inputs, seed) -def run_all(directory): - print(f"Running {directory}") +from src.dataset import construct_kernelbench_dataset + +def run_all(level): + print(f"Running Level {level}") + dataset = construct_kernelbench_dataset(level) total = 0 passed = 0 fail_tests = [] - abs_path = os.path.abspath(directory) - for filename in os.listdir(abs_path): - if filename.endswith(".py"): - total += 1 - module_name = filename[:-3] # Remove .py extension - try: - # Dynamically import the module - spec = importlib.util.spec_from_file_location( - module_name, os.path.join(abs_path, filename) - ) - module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(module) - # Get the required attributes from the module - Model = getattr(module, "Model") - get_inputs = getattr(module, "get_inputs") - get_init_inputs = getattr(module, "get_init_inputs") - assert run(Model, Model, get_inputs, get_init_inputs) - passed += 1 - except Exception as e: - fail_tests.append(module_name) - print(f"{directory}: {passed}/{total} passed") + + for problem_id in dataset.get_problem_ids(): + problem_path = dataset.get_problem_by_id(problem_id) + filename = os.path.basename(problem_path) + + total += 1 + module_name = filename[:-3] # Remove .py extension + try: + # Dynamically import the module + spec = importlib.util.spec_from_file_location( + module_name, problem_path + ) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + # Get the required attributes from the module + Model = getattr(module, "Model") + get_inputs = getattr(module, "get_inputs") + get_init_inputs = getattr(module, "get_init_inputs") + assert run(Model, Model, get_inputs, get_init_inputs) + passed += 1 + except Exception as e: + print(f"Failed {module_name}: {e}") + fail_tests.append(module_name) + print(f"Level {level}: {passed}/{total} passed") if len(fail_tests) > 0: print(f"Failed tests: {fail_tests}") if __name__ == "__main__": - run_all(KERNEL_BENCH_PATH + "/level1") - run_all(KERNEL_BENCH_PATH + "/level2") - run_all(KERNEL_BENCH_PATH + "/level3") + run_all(1) + run_all(2) + run_all(3) diff --git a/src/dataset.py b/src/dataset.py index cb429dc1..81d393c2 100644 --- a/src/dataset.py +++ b/src/dataset.py @@ -38,6 +38,51 @@ def get_code_hash(problem_src: str) -> str: return hashlib.md5(cleaned_problem_src.encode()).hexdigest() + +def check_id_matches_name(problem_id: int, problem_name: str): + """Check if the problem_id matches the ID in the problem_name""" + return problem_id == int(os.path.basename(problem_name).split('_')[0]) + + +class KernelBenchDataset(): + def __init__(self, dataset_name: str, level: int, use_subset=False, dataset=[], subset_dataset=[]): + + self.dataset_name = dataset_name + + if use_subset: + self.problems = subset_dataset + else: + self.problems = dataset + + self.level = level + self.use_subset = use_subset + + # print(f"[Initilaize Dataset Object] {self.dataset_name} with level {self.level} and use_subset {self.use_subset}") + + def get_problem_by_id(self, problem_id=int): + "Logical index of problem_id (logical is 1-indexed)" + # Find problem with matching ID in basename + + for problem in self.problems: + if check_id_matches_name(problem_id, problem): + return problem + raise ValueError(f"Problem ID {problem_id} not found in dataset") + + # get the problem_ids + def get_problem_ids(self): + # return self.whol + return [int(os.path.basename(problem).split('_')[0]) for problem in self.problems] + + def __len__(self): + return len(self.problems) + + def __getitem__(self, index): + return self.problems[index] + + def __iter__(self): + return iter(self.problems) + + def construct_problem_dataset_from_problem_dir(problem_dir: str) -> list[str]: """ Construct a list of relative paths to all the python files in the problem directory @@ -57,10 +102,15 @@ def construct_problem_dataset_from_problem_dir(problem_dir: str) -> list[str]: return DATASET -def construct_kernelbench_dataset(level: int) -> list[str]: - return construct_problem_dataset_from_problem_dir( +def construct_kernelbench_dataset(level: int) -> KernelBenchDataset: + dataset_list = construct_problem_dataset_from_problem_dir( os.path.join(KERNEL_BENCH_PATH, f"level{level}") ) + return KernelBenchDataset( + dataset_name=f"KernelBench_Level_{level}", + level=level, + dataset=dataset_list + ) KERNELBENCH_LEVEL_1_DATASET = construct_kernelbench_dataset(level=1) diff --git a/src/eval.py b/src/eval.py index 4a072c89..9f2862a9 100644 --- a/src/eval.py +++ b/src/eval.py @@ -21,7 +21,7 @@ import torch.nn as nn from pydantic import BaseModel -from . import utils +from . import utils, dataset REPO_TOP_PATH = os.path.abspath( os.path.join( @@ -46,7 +46,10 @@ def fetch_ref_arch_from_problem_id(problem_id, problems, with_name=False) -> str if isinstance(problem_id, str): problem_id = int(problem_id) - problem_path = problems[problem_id] + if hasattr(problems, "get_problem_by_id"): + problem_path = problems.get_problem_by_id(problem_id) + else: + problem_path = problems[problem_id] # problem_path = os.path.join(REPO_ROOT_PATH, problem) if not os.path.exists(problem_path): @@ -60,9 +63,8 @@ def fetch_ref_arch_from_problem_id(problem_id, problems, with_name=False) -> str def fetch_ref_arch_from_level_problem_id(level, problem_id, with_name=False): - PROBLEM_DIR = os.path.join(KERNEL_BENCH_PATH, "level" + str(level)) - dataset = utils.construct_problem_dataset_from_problem_dir(PROBLEM_DIR) - return fetch_ref_arch_from_problem_id(problem_id, dataset, with_name) + kb_dataset = dataset.construct_kernelbench_dataset(level) + return fetch_ref_arch_from_problem_id(problem_id, kb_dataset, with_name) def set_seed(seed: int): @@ -884,7 +886,12 @@ def fetch_baseline_time( with open(baseline_time_filepath, "r") as f: baseline_json = json.load(f) - problem_name = dataset[problem_id].split("/")[-1] + if hasattr(dataset, "get_problem_by_id"): + problem_path = dataset.get_problem_by_id(problem_id) + else: + problem_path = dataset[problem_id] + + problem_name = os.path.basename(problem_path) baseline_time = baseline_json[level_name].get(problem_name, None) return baseline_time From c77955fd8a8daeb3e651cbd861fae75189b48aed Mon Sep 17 00:00:00 2001 From: pythonomar22 Date: Thu, 20 Nov 2025 20:24:40 -0800 Subject: [PATCH 2/5] fixing some syntax --- scripts/generate_baseline_time.py | 27 ++++--- scripts/generate_baseline_time_modal.py | 27 ++++--- scripts/inspect_triton.py | 53 ++++++++---- src/dataset.py | 103 +++++++++++++++++++----- 4 files changed, 153 insertions(+), 57 deletions(-) diff --git a/scripts/generate_baseline_time.py b/scripts/generate_baseline_time.py index 739ffcc1..a8abb911 100644 --- a/scripts/generate_baseline_time.py +++ b/scripts/generate_baseline_time.py @@ -7,7 +7,7 @@ set_seed, fetch_ref_arch_from_problem_id, ) -from src.dataset import construct_kernelbench_dataset +from src.dataset import construct_kernelbench_dataset, KernelBenchDataset from src.utils import read_file import os import json @@ -46,22 +46,25 @@ TIMING_DIR = os.path.join(REPO_TOP_PATH, "results", "timing") -def fetch_ref_arch_from_dataset(dataset, - problem_id: int) -> tuple[str, str, str]: - """ - Fetch the reference architecture from the problem directory - problem_id should be logical index (1-indexed), matching the problem_id in the problem_name +def fetch_ref_arch_from_dataset( + dataset: KernelBenchDataset, + problem_id: int +) -> tuple[str, str, str]: + """Fetch the reference architecture from the dataset. + + Args: + dataset: KernelBenchDataset object + problem_id: Logical index (1-indexed), matching the problem_id in the problem_name Returns: - ref_arch_path: str, the path to the reference architecture - ref_arch_name: str, the name of the reference architecture - ref_arch_src: str, the source code of the reference architecture + tuple containing: + - ref_arch_path: Path to the reference architecture + - ref_arch_name: Name of the reference architecture file + - ref_arch_src: Source code of the reference architecture """ ref_arch_path = dataset.get_problem_by_id(problem_id) - ref_arch_src = read_file(ref_arch_path) - - ref_arch_name = ref_arch_path.split("/")[-1] + ref_arch_name = os.path.basename(ref_arch_path) return (ref_arch_path, ref_arch_name, ref_arch_src) diff --git a/scripts/generate_baseline_time_modal.py b/scripts/generate_baseline_time_modal.py index f89a6a84..85fc5e88 100644 --- a/scripts/generate_baseline_time_modal.py +++ b/scripts/generate_baseline_time_modal.py @@ -7,7 +7,7 @@ set_seed, fetch_ref_arch_from_problem_id, ) -from src.dataset import construct_kernelbench_dataset +from src.dataset import construct_kernelbench_dataset, KernelBenchDataset from src.utils import read_file import os import json @@ -126,22 +126,25 @@ def write_batch_to_json(entries_to_write: list, f_path: str): print(f"[INFO] Wrote {len(entries_to_write)} entries to {f_path}") -def fetch_ref_arch_from_dataset(dataset, - problem_id: int) -> tuple[str, str, str]: - """ - Fetch the reference architecture from the problem directory - problem_id should be logical index (1-indexed), matching the problem_id in the problem_name +def fetch_ref_arch_from_dataset( + dataset: KernelBenchDataset, + problem_id: int +) -> tuple[str, str, str]: + """Fetch the reference architecture from the dataset. + + Args: + dataset: KernelBenchDataset object + problem_id: Logical index (1-indexed), matching the problem_id in the problem_name Returns: - ref_arch_path: str, the path to the reference architecture - ref_arch_name: str, the name of the reference architecture - ref_arch_src: str, the source code of the reference architecture + tuple containing: + - ref_arch_path: Path to the reference architecture + - ref_arch_name: Name of the reference architecture file + - ref_arch_src: Source code of the reference architecture """ ref_arch_path = dataset.get_problem_by_id(problem_id) - ref_arch_src = read_file(ref_arch_path) - - ref_arch_name = ref_arch_path.split("/")[-1] + ref_arch_name = os.path.basename(ref_arch_path) return (ref_arch_path, ref_arch_name, ref_arch_src) @app.cls(image=image, scaledown_window=5) diff --git a/scripts/inspect_triton.py b/scripts/inspect_triton.py index 1e1c5a10..6e887c9b 100644 --- a/scripts/inspect_triton.py +++ b/scripts/inspect_triton.py @@ -26,17 +26,23 @@ set_seed, ) -from src.dataset import construct_kernelbench_dataset +from src.dataset import construct_kernelbench_dataset, KernelBenchDataset -def fetch_ref_arch_from_dataset(dataset, problem_id: int) -> tuple[str, str, str]: - """ - Fetch the reference architecture from the problem directory - problem_id should be logical index (1-indexed), matching the problem_id in the problem_name +def fetch_ref_arch_from_dataset( + dataset: KernelBenchDataset, + problem_id: int +) -> tuple[str, str, str]: + """Fetch the reference architecture from the dataset. + + Args: + dataset: KernelBenchDataset object + problem_id: Logical index (1-indexed), matching the problem_id in the problem_name Returns: - ref_arch_path: str, the path to the reference architecture - ref_arch_name: str, the name of the reference architecture - ref_arch_src: str, the source code of the reference architecture + tuple containing: + - ref_arch_path: Path to the reference architecture + - ref_arch_name: Name of the reference architecture file + - ref_arch_src: Source code of the reference architecture """ ref_arch_path = dataset.get_problem_by_id(problem_id) ref_arch_src = read_file(ref_arch_path) @@ -44,10 +50,20 @@ def fetch_ref_arch_from_dataset(dataset, problem_id: int) -> tuple[str, str, str return (ref_arch_path, ref_arch_name, ref_arch_src) -def run_profile_and_save_trace(dataset: list[str], problem_id: int, num_trials=10): - """ - Helper function to get Torch Profile of a problem - # TODO: Fix up this function +def run_profile_and_save_trace( + dataset: KernelBenchDataset, + problem_id: int, + num_trials: int = 10 +) -> None: + """Helper function to get Torch Profile of a problem. + + Args: + dataset: KernelBenchDataset object + problem_id: Problem ID to profile + num_trials: Number of profiling trials to run (default: 10) + + Note: + Saves trace files to 'trace_non_compiled.json' and 'trace_compiled.json' """ ref_arch_path, ref_arch_name, ref_arch_src = fetch_ref_arch_from_dataset( dataset, problem_id @@ -112,10 +128,17 @@ def run_profile_and_save_trace(dataset: list[str], problem_id: int, num_trials=1 # except Exception as e: # print(f"[Eval] Error in Measuring Performance: {e}") -def get_torch_compile_triton(level_num, problem_id): - """ - Get the triton code generated by torch compile for a particular problem +def get_torch_compile_triton(level_num: int, problem_id: int) -> str: + """Get the triton code generated by torch compile for a particular problem. + + Args: + level_num: KernelBench level (1, 2, or 3) + problem_id: Problem ID to inspect + + Returns: + str: Name of the reference architecture """ + dataset = construct_kernelbench_dataset(level_num) ref_arch_path, ref_arch_name, ref_arch_src = fetch_ref_arch_from_dataset( dataset, problem_id ) diff --git a/src/dataset.py b/src/dataset.py index 81d393c2..0eb54815 100644 --- a/src/dataset.py +++ b/src/dataset.py @@ -39,49 +39,116 @@ def get_code_hash(problem_src: str) -> str: -def check_id_matches_name(problem_id: int, problem_name: str): - """Check if the problem_id matches the ID in the problem_name""" - return problem_id == int(os.path.basename(problem_name).split('_')[0]) +def check_id_matches_name(problem_id: int, problem_name: str) -> bool: + """Check if the problem_id matches the ID in the problem_name. + + Args: + problem_id: The problem ID to check + problem_name: Path to the problem file + + Returns: + bool: True if the ID matches the filename prefix + + Raises: + ValueError: If filename doesn't follow the expected format + """ + basename = os.path.basename(problem_name) + parts = basename.split('_') + + if not parts or not parts[0].isdigit(): + raise ValueError( + f"Problem filename '{basename}' doesn't follow expected format '_.py'" + ) + + return problem_id == int(parts[0]) class KernelBenchDataset(): - def __init__(self, dataset_name: str, level: int, use_subset=False, dataset=[], subset_dataset=[]): - + """Dataset object for easy access to problems by IDs and iteration over problems. + + Args: + dataset_name: Name of the dataset + level: KernelBench level (1, 2, or 3) + use_subset: Whether to use the subset_dataset instead of full dataset + dataset: List of problem file paths for the full dataset + subset_dataset: List of problem file paths for a subset + """ + + def __init__( + self, + dataset_name: str, + level: int, + use_subset: bool = False, + dataset: list[str] = None, + subset_dataset: list[str] = None + ): self.dataset_name = dataset_name - + self.level = level + self.use_subset = use_subset + + # Avoid mutable default arguments + if dataset is None: + dataset = [] + if subset_dataset is None: + subset_dataset = [] + if use_subset: self.problems = subset_dataset else: self.problems = dataset - self.level = level - self.use_subset = use_subset + def get_problem_by_id(self, problem_id: int) -> str: + """Get problem path by its ID (1-indexed logical index). - # print(f"[Initilaize Dataset Object] {self.dataset_name} with level {self.level} and use_subset {self.use_subset}") + Args: + problem_id: The problem ID to search for - def get_problem_by_id(self, problem_id=int): - "Logical index of problem_id (logical is 1-indexed)" - # Find problem with matching ID in basename + Returns: + str: Path to the problem file + Raises: + ValueError: If problem ID not found in dataset + """ for problem in self.problems: if check_id_matches_name(problem_id, problem): return problem raise ValueError(f"Problem ID {problem_id} not found in dataset") - # get the problem_ids - def get_problem_ids(self): - # return self.whol + def get_problem_ids(self) -> list[int]: + """Get list of all problem IDs in the dataset. + + Returns: + list[int]: Sorted list of problem IDs extracted from filenames + """ return [int(os.path.basename(problem).split('_')[0]) for problem in self.problems] - def __len__(self): + def __len__(self) -> int: + """Return the number of problems in the dataset.""" return len(self.problems) - def __getitem__(self, index): + def __getitem__(self, index: int) -> str: + """Get problem by index (0-indexed, for backward compatibility). + + Args: + index: Zero-based index into the problems list + + Returns: + str: Path to the problem file + """ return self.problems[index] - + def __iter__(self): + """Iterate over problem paths in the dataset.""" return iter(self.problems) + def __repr__(self) -> str: + """Return string representation of the dataset.""" + subset_str = " (subset)" if self.use_subset else "" + return ( + f"KernelBenchDataset(name='{self.dataset_name}', " + f"level={self.level}, problems={len(self.problems)}{subset_str})" + ) + def construct_problem_dataset_from_problem_dir(problem_dir: str) -> list[str]: """ From 660a47f2cc5dd37ab0192732251ef653fea79dcf Mon Sep 17 00:00:00 2001 From: pythonomar22 Date: Thu, 20 Nov 2025 20:44:08 -0800 Subject: [PATCH 3/5] fixing off by one error after testing --- scripts/generate_baseline_time.py | 26 ++------------- scripts/generate_baseline_time_modal.py | 25 ++------------- scripts/generate_samples.py | 2 +- scripts/inspect_baseline.py | 4 +-- scripts/inspect_triton.py | 23 +------------- src/dataset.py | 42 +++++++++++++++++++++++-- src/eval.py | 3 +- 7 files changed, 49 insertions(+), 76 deletions(-) diff --git a/scripts/generate_baseline_time.py b/scripts/generate_baseline_time.py index a8abb911..95fca7ad 100644 --- a/scripts/generate_baseline_time.py +++ b/scripts/generate_baseline_time.py @@ -7,7 +7,7 @@ set_seed, fetch_ref_arch_from_problem_id, ) -from src.dataset import construct_kernelbench_dataset, KernelBenchDataset +from src.dataset import construct_kernelbench_dataset, KernelBenchDataset, fetch_ref_arch_from_dataset from src.utils import read_file import os import json @@ -46,28 +46,6 @@ TIMING_DIR = os.path.join(REPO_TOP_PATH, "results", "timing") -def fetch_ref_arch_from_dataset( - dataset: KernelBenchDataset, - problem_id: int -) -> tuple[str, str, str]: - """Fetch the reference architecture from the dataset. - - Args: - dataset: KernelBenchDataset object - problem_id: Logical index (1-indexed), matching the problem_id in the problem_name - - Returns: - tuple containing: - - ref_arch_path: Path to the reference architecture - - ref_arch_name: Name of the reference architecture file - - ref_arch_src: Source code of the reference architecture - """ - ref_arch_path = dataset.get_problem_by_id(problem_id) - ref_arch_src = read_file(ref_arch_path) - ref_arch_name = os.path.basename(ref_arch_path) - return (ref_arch_path, ref_arch_name, ref_arch_src) - - def measure_program_time( ref_arch_name: str, ref_arch_src: str, @@ -243,7 +221,7 @@ def get_time_old(level_num, problem_id, num_trials=100, torch_compile=False): ref_arch_name, ref_arch_src = fetch_ref_arch_from_level_problem_id( level_num, problem_id, with_name=True ) - ref_arch_name = ref_arch_name.split("/")[-1] + ref_arch_name = os.path.basename(ref_arch_name) context = {} Model, get_init_inputs, get_inputs = load_original_model_and_inputs( ref_arch_src, context diff --git a/scripts/generate_baseline_time_modal.py b/scripts/generate_baseline_time_modal.py index 85fc5e88..f7a579fa 100644 --- a/scripts/generate_baseline_time_modal.py +++ b/scripts/generate_baseline_time_modal.py @@ -7,7 +7,7 @@ set_seed, fetch_ref_arch_from_problem_id, ) -from src.dataset import construct_kernelbench_dataset, KernelBenchDataset +from src.dataset import construct_kernelbench_dataset, KernelBenchDataset, fetch_ref_arch_from_dataset from src.utils import read_file import os import json @@ -126,27 +126,6 @@ def write_batch_to_json(entries_to_write: list, f_path: str): print(f"[INFO] Wrote {len(entries_to_write)} entries to {f_path}") -def fetch_ref_arch_from_dataset( - dataset: KernelBenchDataset, - problem_id: int -) -> tuple[str, str, str]: - """Fetch the reference architecture from the dataset. - - Args: - dataset: KernelBenchDataset object - problem_id: Logical index (1-indexed), matching the problem_id in the problem_name - - Returns: - tuple containing: - - ref_arch_path: Path to the reference architecture - - ref_arch_name: Name of the reference architecture file - - ref_arch_src: Source code of the reference architecture - """ - ref_arch_path = dataset.get_problem_by_id(problem_id) - ref_arch_src = read_file(ref_arch_path) - ref_arch_name = os.path.basename(ref_arch_path) - return (ref_arch_path, ref_arch_name, ref_arch_src) - @app.cls(image=image, scaledown_window=5) class EvalFunc: @@ -348,7 +327,7 @@ def get_time_old(level_num, problem_id, num_trials=100, torch_compile=False): ref_arch_name, ref_arch_src = fetch_ref_arch_from_level_problem_id( level_num, problem_id, with_name=True ) - ref_arch_name = ref_arch_name.split("/")[-1] + ref_arch_name = os.path.basename(ref_arch_name) context = {} Model, get_init_inputs, get_inputs = load_original_model_and_inputs( ref_arch_src, context diff --git a/scripts/generate_samples.py b/scripts/generate_samples.py index c6869c15..630d6bf7 100644 --- a/scripts/generate_samples.py +++ b/scripts/generate_samples.py @@ -46,7 +46,7 @@ def __init__(self): self.subset = ( None, None, - ) # (problem_id, problem_name), these are the logical index + ) # (start_id, end_id), both inclusive - logical 1-indexed IDs self.run_name = REQUIRED # name of the run diff --git a/scripts/inspect_baseline.py b/scripts/inspect_baseline.py index 29afe0c9..90bd7f2f 100644 --- a/scripts/inspect_baseline.py +++ b/scripts/inspect_baseline.py @@ -100,7 +100,7 @@ def inspect_torch_compile_triton(level_num, problem_id): ref_arch_name, ref_arch_src = fetch_ref_arch_from_level_problem_id( level_num, problem_id, with_name=True ) - ref_arch_name = ref_arch_name.split("/")[-1] + ref_arch_name = os.path.basename(ref_arch_name) context = {} Model, get_init_inputs, get_inputs = load_original_model_and_inputs( ref_arch_src, context @@ -115,7 +115,7 @@ def inspect_baseline_torch_compile(level_num, problem_id): level_num, problem_id, with_name=True ) - ref_arch_name = ref_arch_name.split("/")[-1] + ref_arch_name = os.path.basename(ref_arch_name) context = {} Model, get_init_inputs, get_inputs = load_original_model_and_inputs( ref_arch_src, context diff --git a/scripts/inspect_triton.py b/scripts/inspect_triton.py index 6e887c9b..0170dada 100644 --- a/scripts/inspect_triton.py +++ b/scripts/inspect_triton.py @@ -26,28 +26,7 @@ set_seed, ) -from src.dataset import construct_kernelbench_dataset, KernelBenchDataset - -def fetch_ref_arch_from_dataset( - dataset: KernelBenchDataset, - problem_id: int -) -> tuple[str, str, str]: - """Fetch the reference architecture from the dataset. - - Args: - dataset: KernelBenchDataset object - problem_id: Logical index (1-indexed), matching the problem_id in the problem_name - - Returns: - tuple containing: - - ref_arch_path: Path to the reference architecture - - ref_arch_name: Name of the reference architecture file - - ref_arch_src: Source code of the reference architecture - """ - ref_arch_path = dataset.get_problem_by_id(problem_id) - ref_arch_src = read_file(ref_arch_path) - ref_arch_name = os.path.basename(ref_arch_path) - return (ref_arch_path, ref_arch_name, ref_arch_src) +from src.dataset import construct_kernelbench_dataset, KernelBenchDataset, fetch_ref_arch_from_dataset def run_profile_and_save_trace( diff --git a/src/dataset.py b/src/dataset.py index 0eb54815..a674d8a9 100644 --- a/src/dataset.py +++ b/src/dataset.py @@ -55,12 +55,19 @@ def check_id_matches_name(problem_id: int, problem_name: str) -> bool: basename = os.path.basename(problem_name) parts = basename.split('_') - if not parts or not parts[0].isdigit(): + if len(parts) < 2: raise ValueError( f"Problem filename '{basename}' doesn't follow expected format '_.py'" ) - return problem_id == int(parts[0]) + try: + file_id = int(parts[0]) + except ValueError: + raise ValueError( + f"Problem filename '{basename}' doesn't start with a numeric ID" + ) + + return problem_id == file_id class KernelBenchDataset(): @@ -82,6 +89,9 @@ def __init__( dataset: list[str] = None, subset_dataset: list[str] = None ): + if level not in [1, 2, 3]: + raise ValueError(f"level must be 1, 2, or 3, got {level}") + self.dataset_name = dataset_name self.level = level self.use_subset = use_subset @@ -120,7 +130,7 @@ def get_problem_ids(self) -> list[int]: Returns: list[int]: Sorted list of problem IDs extracted from filenames """ - return [int(os.path.basename(problem).split('_')[0]) for problem in self.problems] + return sorted([int(os.path.basename(problem).split('_')[0]) for problem in self.problems]) def __len__(self) -> int: """Return the number of problems in the dataset.""" @@ -150,6 +160,32 @@ def __repr__(self) -> str: ) +def fetch_ref_arch_from_dataset( + dataset: "KernelBenchDataset", + problem_id: int +) -> tuple[str, str, str]: + """Fetch the reference architecture from the dataset. + + This is a shared utility function to avoid duplication across scripts. + + Args: + dataset: KernelBenchDataset object + problem_id: Logical index (1-indexed), matching the problem_id in the problem_name + + Returns: + tuple containing: + - ref_arch_path: Path to the reference architecture + - ref_arch_name: Name of the reference architecture file + - ref_arch_src: Source code of the reference architecture + """ + from .utils import read_file + + ref_arch_path = dataset.get_problem_by_id(problem_id) + ref_arch_src = read_file(ref_arch_path) + ref_arch_name = os.path.basename(ref_arch_path) + return (ref_arch_path, ref_arch_name, ref_arch_src) + + def construct_problem_dataset_from_problem_dir(problem_dir: str) -> list[str]: """ Construct a list of relative paths to all the python files in the problem directory diff --git a/src/eval.py b/src/eval.py index 9f2862a9..1ae8b83c 100644 --- a/src/eval.py +++ b/src/eval.py @@ -49,7 +49,8 @@ def fetch_ref_arch_from_problem_id(problem_id, problems, with_name=False) -> str if hasattr(problems, "get_problem_by_id"): problem_path = problems.get_problem_by_id(problem_id) else: - problem_path = problems[problem_id] + # Fallback for old list-based API: problem_id is 1-indexed but lists are 0-indexed + problem_path = problems[problem_id - 1] # problem_path = os.path.join(REPO_ROOT_PATH, problem) if not os.path.exists(problem_path): From dfbef11aab7e24ec787c2b127085cdb97688c550 Mon Sep 17 00:00:00 2001 From: pythonomar22 Date: Thu, 20 Nov 2025 20:47:45 -0800 Subject: [PATCH 4/5] Remove timing JSONs from PR and ignore them --- .../A10G_modal/baseline_time_torch.json | 904 ------------------ ...e_time_torch_compile_inductor_default.json | 904 ------------------ .../H100_modal/baseline_time_torch.json | 904 ------------------ ...e_time_torch_compile_inductor_default.json | 904 ------------------ 4 files changed, 3616 deletions(-) delete mode 100644 results/timing/A10G_modal/baseline_time_torch.json delete mode 100644 results/timing/A10G_modal/baseline_time_torch_compile_inductor_default.json delete mode 100644 results/timing/H100_modal/baseline_time_torch.json delete mode 100644 results/timing/H100_modal/baseline_time_torch_compile_inductor_default.json diff --git a/results/timing/A10G_modal/baseline_time_torch.json b/results/timing/A10G_modal/baseline_time_torch.json deleted file mode 100644 index 327a00c2..00000000 --- a/results/timing/A10G_modal/baseline_time_torch.json +++ /dev/null @@ -1,904 +0,0 @@ -{ - "level1": { - "1_Square_matrix_multiplication_.py": { - "mean": 5.78, - "std": 0.0635, - "min": 5.55, - "max": 5.91, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "2_Standard_matrix_multiplication_.py": { - "mean": 8.47, - "std": 0.293, - "min": 6.99, - "max": 9.51, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "3_Batched_matrix_multiplication.py": { - "mean": 14.0, - "std": 0.169, - "min": 13.5, - "max": 14.5, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "4_Matrix_vector_multiplication_.py": { - "mean": 25.5, - "std": 0.157, - "min": 25.0, - "max": 25.9, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "5_Matrix_scalar_multiplication.py": { - "mean": 17.6, - "std": 0.0154, - "min": 17.6, - "max": 17.7, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "6_Matmul_with_large_K_dimension_.py": { - "mean": 3.3, - "std": 0.0495, - "min": 3.1, - "max": 3.34, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "7_Matmul_with_small_K_dimension_.py": { - "mean": 14.9, - "std": 1.19, - "min": 13.3, - "max": 23.3, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "8_Matmul_with_irregular_shapes_.py": { - "mean": 21.0, - "std": 0.68, - "min": 20.5, - "max": 25.7, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "9_Tall_skinny_matrix_multiplication_.py": { - "mean": 10.9, - "std": 0.0388, - "min": 10.9, - "max": 11.1, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "10_3D_tensor_matrix_multiplication.py": { - "mean": 2.4, - "std": 0.0551, - "min": 2.25, - "max": 2.45, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "11_4D_tensor_matrix_multiplication.py": { - "mean": 22.2, - "std": 0.224, - "min": 21.8, - "max": 22.8, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "12_Matmul_with_diagonal_matrices_.py": { - "mean": 9.62, - "std": 0.537, - "min": 7.46, - "max": 11.6, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "13_Matmul_for_symmetric_matrices.py": { - "mean": 10.4, - "std": 0.889, - "min": 8.12, - "max": 15.0, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "14_Matmul_for_upper_triangular_matrices.py": { - "mean": 5.75, - "std": 0.0515, - "min": 5.71, - "max": 6.06, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "15_Matmul_for_lower_triangular_matrices.py": { - "mean": 5.71, - "std": 0.0196, - "min": 5.7, - "max": 5.88, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "16_Matmul_with_transposed_A.py": { - "mean": 8.29, - "std": 0.228, - "min": 6.84, - "max": 8.54, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "17_Matmul_with_transposed_B.py": { - "mean": 11.9, - "std": 0.674, - "min": 10.1, - "max": 15.5, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "18_Matmul_with_transposed_both.py": { - "mean": 8.78, - "std": 0.415, - "min": 7.41, - "max": 10.7, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "19_ReLU.py": { - "mean": 26.6, - "std": 0.0288, - "min": 26.5, - "max": 26.7, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "20_LeakyReLU.py": { - "mean": 26.4, - "std": 0.0369, - "min": 26.4, - "max": 26.5, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "21_Sigmoid.py": { - "mean": 26.5, - "std": 0.0341, - "min": 26.4, - "max": 26.6, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "22_Tanh.py": { - "mean": 26.6, - "std": 0.0275, - "min": 26.5, - "max": 26.6, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "23_Softmax.py": { - "mean": 51.4, - "std": 0.0335, - "min": 51.3, - "max": 51.5, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "24_LogSoftmax.py": { - "mean": 51.4, - "std": 0.0255, - "min": 51.3, - "max": 51.5, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "25_Swish.py": { - "mean": 65.9, - "std": 0.033, - "min": 65.8, - "max": 66.0, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "26_GELU_.py": { - "mean": 26.5, - "std": 0.0244, - "min": 26.5, - "max": 26.6, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "27_SELU_.py": { - "mean": 26.4, - "std": 0.014, - "min": 26.3, - "max": 26.4, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "28_HardSigmoid.py": { - "mean": 26.6, - "std": 0.0332, - "min": 26.5, - "max": 26.6, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "29_Softplus.py": { - "mean": 26.5, - "std": 0.0349, - "min": 26.5, - "max": 26.6, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "30_Softsign.py": { - "mean": 92.3, - "std": 0.0474, - "min": 92.2, - "max": 92.4, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "31_ELU.py": { - "mean": 26.4, - "std": 0.0291, - "min": 26.4, - "max": 26.5, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "32_HardTanh.py": { - "mean": 26.4, - "std": 0.0333, - "min": 26.4, - "max": 26.5, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "33_BatchNorm.py": { - "mean": 28.3, - "std": 0.0373, - "min": 28.2, - "max": 28.5, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "34_InstanceNorm.py": { - "mean": 47.4, - "std": 0.0383, - "min": 47.3, - "max": 47.5, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "35_GroupNorm_.py": { - "mean": 46.6, - "std": 0.0375, - "min": 46.5, - "max": 46.7, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "36_RMSNorm_.py": { - "mean": 80.9, - "std": 0.0425, - "min": 80.8, - "max": 81.0, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "37_FrobeniusNorm_.py": { - "mean": 45.5, - "std": 0.0466, - "min": 45.4, - "max": 45.7, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "38_L1Norm_.py": { - "mean": 88.2, - "std": 0.0341, - "min": 88.1, - "max": 88.3, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "39_L2Norm_.py": { - "mean": 53.0, - "std": 0.105, - "min": 52.9, - "max": 53.9, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "40_LayerNorm.py": { - "mean": 8.53, - "std": 0.0196, - "min": 8.52, - "max": 8.65, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "41_Max_Pooling_1D.py": { - "mean": 27.0, - "std": 0.0402, - "min": 26.9, - "max": 27.2, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "42_Max_Pooling_2D.py": { - "mean": 30.9, - "std": 1.31, - "min": 30.1, - "max": 40.5, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "43_Max_Pooling_3D.py": { - "mean": 12.9, - "std": 0.16, - "min": 12.2, - "max": 13.1, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "44_Average_Pooling_1D.py": { - "mean": 18.9, - "std": 0.497, - "min": 18.5, - "max": 21.5, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "45_Average_Pooling_2D.py": { - "mean": 44.0, - "std": 0.0797, - "min": 43.8, - "max": 44.2, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "46_Average_Pooling_3D.py": { - "mean": 18.8, - "std": 0.0448, - "min": 18.7, - "max": 18.9, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "47_Sum_reduction_over_a_dimension.py": { - "mean": 21.1, - "std": 0.0911, - "min": 20.8, - "max": 21.4, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "48_Mean_reduction_over_a_dimension.py": { - "mean": 21.0, - "std": 0.0635, - "min": 20.9, - "max": 21.2, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "49_Max_reduction_over_a_dimension.py": { - "mean": 20.1, - "std": 0.0694, - "min": 19.9, - "max": 20.3, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "50_conv_standard_2D__square_input__square_kernel.py": { - "mean": 16.1, - "std": 0.0699, - "min": 16.0, - "max": 16.2, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "51_Argmax_over_a_dimension.py": { - "mean": 20.9, - "std": 0.0814, - "min": 20.7, - "max": 21.1, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "52_Argmin_over_a_dimension.py": { - "mean": 20.9, - "std": 0.0777, - "min": 20.7, - "max": 21.1, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "53_Min_reduction_over_a_dimension.py": { - "mean": 20.9, - "std": 0.0826, - "min": 20.8, - "max": 21.2, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "54_conv_standard_3D__square_input__square_kernel.py": { - "mean": 14.4, - "std": 0.0301, - "min": 14.3, - "max": 14.5, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "55_conv_standard_2D__asymmetric_input__square_kernel.py": { - "mean": 83.4, - "std": 2.16, - "min": 81.5, - "max": 101.0, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "56_conv_standard_2D__asymmetric_input__asymmetric_kernel.py": { - "mean": 26.2, - "std": 1.92, - "min": 25.5, - "max": 43.0, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "57_conv_transposed_2D__square_input__square_kernel.py": { - "mean": 39.5, - "std": 0.0511, - "min": 39.3, - "max": 39.6, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "58_conv_transposed_3D__asymmetric_input__asymmetric_kernel.py": { - "mean": 17.4, - "std": 0.0441, - "min": 17.3, - "max": 17.5, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "59_conv_standard_3D__asymmetric_input__square_kernel.py": { - "mean": 13.3, - "std": 0.0313, - "min": 13.2, - "max": 13.4, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "60_conv_standard_3D__square_input__asymmetric_kernel.py": { - "mean": 31.5, - "std": 0.15, - "min": 31.3, - "max": 31.6, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "61_conv_transposed_3D__square_input__square_kernel.py": { - "mean": 27.5, - "std": 0.0604, - "min": 27.4, - "max": 27.7, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "62_conv_standard_2D__square_input__asymmetric_kernel.py": { - "mean": 15.1, - "std": 0.0803, - "min": 14.9, - "max": 15.4, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "63_conv_standard_2D__square_input__square_kernel.py": { - "mean": 43.2, - "std": 0.142, - "min": 43.1, - "max": 44.5, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "64_conv_transposed_1D.py": { - "mean": 32.8, - "std": 0.033, - "min": 32.7, - "max": 32.9, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "65_conv_transposed_2D__square_input__asymmetric_kernel.py": { - "mean": 16.0, - "std": 0.049, - "min": 15.9, - "max": 16.3, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "66_conv_standard_3D__asymmetric_input__asymmetric_kernel.py": { - "mean": 22.5, - "std": 0.0621, - "min": 22.4, - "max": 22.6, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "67_conv_standard_1D.py": { - "mean": 12.5, - "std": 0.0419, - "min": 12.5, - "max": 12.7, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "68_conv_transposed_3D__square_input__asymmetric_kernel.py": { - "mean": 393.0, - "std": 0.128, - "min": 393.0, - "max": 394.0, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "69_conv_transposed_2D__asymmetric_input__asymmetric_kernel.py": { - "mean": 23.3, - "std": 0.0538, - "min": 23.2, - "max": 23.5, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "70_conv_transposed_3D__asymmetric_input__square_kernel.py": { - "mean": 87.8, - "std": 0.0543, - "min": 87.6, - "max": 87.9, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "71_conv_transposed_2D__asymmetric_input__square_kernel.py": { - "mean": 9.27, - "std": 0.697, - "min": 8.88, - "max": 13.9, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "72_conv_transposed_3D_asymmetric_input_asymmetric_kernel___strided_padded_grouped_.py": { - "mean": 5.16, - "std": 0.0349, - "min": 5.14, - "max": 5.44, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "73_conv_transposed_3D_asymmetric_input_square_kernel__strided_padded__grouped.py": { - "mean": 17.8, - "std": 0.0655, - "min": 17.7, - "max": 18.0, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "74_conv_transposed_1D_dilated.py": { - "mean": 8.32, - "std": 0.285, - "min": 6.68, - "max": 8.93, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.py": { - "mean": 16.9, - "std": 0.306, - "min": 16.5, - "max": 18.5, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "76_conv_standard_1D_dilated_strided__.py": { - "mean": 57.7, - "std": 2.91, - "min": 55.0, - "max": 76.9, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__.py": { - "mean": 6.14, - "std": 0.0596, - "min": 6.05, - "max": 6.34, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__.py": { - "mean": 16.2, - "std": 0.143, - "min": 16.1, - "max": 17.3, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__.py": { - "mean": 10.9, - "std": 0.0716, - "min": 10.8, - "max": 11.1, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__.py": { - "mean": 14.7, - "std": 0.0183, - "min": 14.7, - "max": 14.8, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__.py": { - "mean": 6.15, - "std": 0.0244, - "min": 6.11, - "max": 6.27, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "82_conv_depthwise_2D_square_input_square_kernel.py": { - "mean": 8.39, - "std": 0.455, - "min": 6.91, - "max": 11.7, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "83_conv_depthwise_2D_square_input_asymmetric_kernel.py": { - "mean": 3.72, - "std": 0.0183, - "min": 3.71, - "max": 3.84, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "84_conv_depthwise_2D_asymmetric_input_square_kernel.py": { - "mean": 24.2, - "std": 0.0117, - "min": 24.2, - "max": 24.2, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "85_conv_depthwise_2D_asymmetric_input_asymmetric_kernel.py": { - "mean": 5.31, - "std": 0.0288, - "min": 5.3, - "max": 5.51, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "86_conv_depthwise_separable_2D.py": { - "mean": 13.0, - "std": 0.0413, - "min": 12.9, - "max": 13.1, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "87_conv_pointwise_2D.py": { - "mean": 27.0, - "std": 0.0308, - "min": 27.0, - "max": 27.2, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "88_MinGPTNewGelu.py": { - "mean": 9.99, - "std": 0.0291, - "min": 9.95, - "max": 10.1, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "89_cumsum.py": { - "mean": 20.3, - "std": 0.0552, - "min": 20.3, - "max": 20.9, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "90_cumprod.py": { - "mean": 19.9, - "std": 0.0804, - "min": 19.9, - "max": 20.3, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "91_cumsum_reverse.py": { - "mean": 56.1, - "std": 0.0691, - "min": 56.0, - "max": 56.5, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "92_cumsum_exclusive.py": { - "mean": 45.0, - "std": 0.0489, - "min": 44.9, - "max": 45.2, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "93_masked_cumsum.py": { - "mean": 40.5, - "std": 0.173, - "min": 40.5, - "max": 42.2, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "94_MSELoss.py": { - "mean": 52.7, - "std": 0.0446, - "min": 52.6, - "max": 52.8, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "95_CrossEntropyLoss.py": { - "mean": 3.09, - "std": 0.0109, - "min": 3.08, - "max": 3.19, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "96_HuberLoss.py": { - "mean": 34.7, - "std": 0.0217, - "min": 34.7, - "max": 34.8, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "97_ScaledDotProductAttention.py": { - "mean": 44.1, - "std": 0.0406, - "min": 44.0, - "max": 44.3, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "98_KLDivLoss.py": { - "mean": 24.3, - "std": 0.0332, - "min": 24.2, - "max": 24.4, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "99_TripletMarginLoss.py": { - "mean": 26.4, - "std": 0.0708, - "min": 26.3, - "max": 27.0, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "100_HingeLoss.py": { - "mean": 61.9, - "std": 0.0296, - "min": 61.8, - "max": 62.0, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - } - } -} \ No newline at end of file diff --git a/results/timing/A10G_modal/baseline_time_torch_compile_inductor_default.json b/results/timing/A10G_modal/baseline_time_torch_compile_inductor_default.json deleted file mode 100644 index 596eb088..00000000 --- a/results/timing/A10G_modal/baseline_time_torch_compile_inductor_default.json +++ /dev/null @@ -1,904 +0,0 @@ -{ - "level1": { - "1_Square_matrix_multiplication_.py": { - "mean": 5.85, - "std": 0.0617, - "min": 5.55, - "max": 6.08, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "2_Standard_matrix_multiplication_.py": { - "mean": 5.93, - "std": 0.0839, - "min": 5.62, - "max": 6.17, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "3_Batched_matrix_multiplication.py": { - "mean": 22.0, - "std": 0.964, - "min": 21.2, - "max": 28.1, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "4_Matrix_vector_multiplication_.py": { - "mean": 25.5, - "std": 0.156, - "min": 25.1, - "max": 25.9, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "5_Matrix_scalar_multiplication.py": { - "mean": 17.6, - "std": 0.0187, - "min": 17.5, - "max": 17.7, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "6_Matmul_with_large_K_dimension_.py": { - "mean": 3.32, - "std": 0.0394, - "min": 3.15, - "max": 3.37, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "7_Matmul_with_small_K_dimension_.py": { - "mean": 15.1, - "std": 1.06, - "min": 13.1, - "max": 22.0, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "8_Matmul_with_irregular_shapes_.py": { - "mean": 20.6, - "std": 0.559, - "min": 20.0, - "max": 23.4, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "9_Tall_skinny_matrix_multiplication_.py": { - "mean": 11.1, - "std": 0.12, - "min": 10.9, - "max": 11.4, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "10_3D_tensor_matrix_multiplication.py": { - "mean": 2.44, - "std": 0.0532, - "min": 2.27, - "max": 2.58, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "11_4D_tensor_matrix_multiplication.py": { - "mean": 22.2, - "std": 0.279, - "min": 21.6, - "max": 23.4, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "12_Matmul_with_diagonal_matrices_.py": { - "mean": 9.59, - "std": 0.655, - "min": 7.75, - "max": 13.1, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "13_Matmul_for_symmetric_matrices.py": { - "mean": 10.4, - "std": 0.737, - "min": 8.74, - "max": 14.0, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "14_Matmul_for_upper_triangular_matrices.py": { - "mean": 5.93, - "std": 0.0786, - "min": 5.82, - "max": 6.16, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "15_Matmul_for_lower_triangular_matrices.py": { - "mean": 5.82, - "std": 0.0385, - "min": 5.81, - "max": 6.19, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "16_Matmul_with_transposed_A.py": { - "mean": 8.51, - "std": 0.245, - "min": 6.84, - "max": 8.95, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "17_Matmul_with_transposed_B.py": { - "mean": 7.43, - "std": 0.226, - "min": 6.94, - "max": 8.94, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "18_Matmul_with_transposed_both.py": { - "mean": 6.05, - "std": 0.167, - "min": 5.8, - "max": 7.13, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "19_ReLU.py": { - "mean": 26.5, - "std": 0.0308, - "min": 26.4, - "max": 26.6, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "20_LeakyReLU.py": { - "mean": 26.5, - "std": 0.0313, - "min": 26.4, - "max": 26.7, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "21_Sigmoid.py": { - "mean": 26.5, - "std": 0.0288, - "min": 26.4, - "max": 26.6, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "22_Tanh.py": { - "mean": 26.6, - "std": 0.516, - "min": 26.4, - "max": 30.0, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "23_Softmax.py": { - "mean": 52.8, - "std": 0.104, - "min": 52.6, - "max": 53.1, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "24_LogSoftmax.py": { - "mean": 52.9, - "std": 0.0791, - "min": 52.8, - "max": 53.2, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "25_Swish.py": { - "mean": 26.6, - "std": 0.0973, - "min": 26.5, - "max": 27.0, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "26_GELU_.py": { - "mean": 26.4, - "std": 0.104, - "min": 26.2, - "max": 26.8, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "27_SELU_.py": { - "mean": 26.6, - "std": 0.077, - "min": 26.4, - "max": 27.0, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "28_HardSigmoid.py": { - "mean": 26.5, - "std": 0.0566, - "min": 26.4, - "max": 26.7, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "29_Softplus.py": { - "mean": 26.2, - "std": 0.0483, - "min": 26.2, - "max": 26.5, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "30_Softsign.py": { - "mean": 26.5, - "std": 0.0414, - "min": 26.4, - "max": 26.8, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "31_ELU.py": { - "mean": 26.4, - "std": 0.0325, - "min": 26.3, - "max": 26.6, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "32_HardTanh.py": { - "mean": 26.5, - "std": 0.0774, - "min": 26.4, - "max": 26.8, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "33_BatchNorm.py": { - "mean": 26.0, - "std": 0.0405, - "min": 25.9, - "max": 26.2, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "34_InstanceNorm.py": { - "mean": 47.1, - "std": 0.068, - "min": 46.9, - "max": 47.2, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "35_GroupNorm_.py": { - "mean": 45.7, - "std": 0.0425, - "min": 45.6, - "max": 45.8, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "36_RMSNorm_.py": { - "mean": 48.9, - "std": 0.0773, - "min": 48.8, - "max": 49.2, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "37_FrobeniusNorm_.py": { - "mean": 45.8, - "std": 0.0738, - "min": 45.7, - "max": 46.0, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "38_L1Norm_.py": { - "mean": 72.5, - "std": 0.238, - "min": 72.2, - "max": 74.6, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "39_L2Norm_.py": { - "mean": 74.7, - "std": 0.106, - "min": 74.4, - "max": 74.9, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "40_LayerNorm.py": { - "mean": 2.79, - "std": 0.0655, - "min": 2.75, - "max": 3.1, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "41_Max_Pooling_1D.py": { - "mean": 31.0, - "std": 1.19, - "min": 30.4, - "max": 39.9, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "42_Max_Pooling_2D.py": { - "mean": 9.84, - "std": 0.21, - "min": 9.71, - "max": 10.4, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "43_Max_Pooling_3D.py": { - "mean": 12.9, - "std": 0.168, - "min": 12.2, - "max": 14.0, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "44_Average_Pooling_1D.py": { - "mean": 8.94, - "std": 0.0751, - "min": 8.84, - "max": 9.16, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "45_Average_Pooling_2D.py": { - "mean": 42.0, - "std": 0.909, - "min": 39.7, - "max": 43.1, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "46_Average_Pooling_3D.py": { - "mean": 19.0, - "std": 0.75, - "min": 18.8, - "max": 26.4, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "47_Sum_reduction_over_a_dimension.py": { - "mean": 22.0, - "std": 0.0462, - "min": 21.9, - "max": 22.1, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "48_Mean_reduction_over_a_dimension.py": { - "mean": 21.7, - "std": 0.0532, - "min": 21.6, - "max": 21.9, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "49_Max_reduction_over_a_dimension.py": { - "mean": 22.0, - "std": 0.0813, - "min": 21.8, - "max": 22.2, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "50_conv_standard_2D__square_input__square_kernel.py": { - "mean": 5.42, - "std": 0.127, - "min": 5.27, - "max": 5.76, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "51_Argmax_over_a_dimension.py": { - "mean": 21.5, - "std": 0.0593, - "min": 21.3, - "max": 21.6, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "52_Argmin_over_a_dimension.py": { - "mean": 21.6, - "std": 0.0785, - "min": 21.4, - "max": 21.8, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "53_Min_reduction_over_a_dimension.py": { - "mean": 21.9, - "std": 0.0312, - "min": 21.8, - "max": 22.0, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "54_conv_standard_3D__square_input__square_kernel.py": { - "mean": 11.5, - "std": 0.382, - "min": 10.0, - "max": 13.5, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "55_conv_standard_2D__asymmetric_input__square_kernel.py": { - "mean": 37.5, - "std": 0.0376, - "min": 37.4, - "max": 37.6, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "56_conv_standard_2D__asymmetric_input__asymmetric_kernel.py": { - "mean": 22.2, - "std": 0.195, - "min": 22.1, - "max": 23.4, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "57_conv_transposed_2D__square_input__square_kernel.py": { - "mean": 49.1, - "std": 0.0756, - "min": 49.0, - "max": 49.3, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "58_conv_transposed_3D__asymmetric_input__asymmetric_kernel.py": { - "mean": 17.4, - "std": 0.0674, - "min": 17.4, - "max": 17.7, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "59_conv_standard_3D__asymmetric_input__square_kernel.py": { - "mean": 13.3, - "std": 0.05, - "min": 13.3, - "max": 13.6, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "60_conv_standard_3D__square_input__asymmetric_kernel.py": { - "mean": 45.5, - "std": 0.107, - "min": 45.3, - "max": 45.9, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "61_conv_transposed_3D__square_input__square_kernel.py": { - "mean": 27.5, - "std": 0.093, - "min": 27.4, - "max": 27.9, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "62_conv_standard_2D__square_input__asymmetric_kernel.py": { - "mean": 14.5, - "std": 0.406, - "min": 13.0, - "max": 16.4, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "63_conv_standard_2D__square_input__square_kernel.py": { - "mean": 80.1, - "std": 0.312, - "min": 79.7, - "max": 82.2, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "64_conv_transposed_1D.py": { - "mean": 33.1, - "std": 0.0561, - "min": 33.0, - "max": 33.3, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "65_conv_transposed_2D__square_input__asymmetric_kernel.py": { - "mean": 18.5, - "std": 0.062, - "min": 18.4, - "max": 18.6, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "66_conv_standard_3D__asymmetric_input__asymmetric_kernel.py": { - "mean": 22.6, - "std": 0.124, - "min": 22.4, - "max": 22.9, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "67_conv_standard_1D.py": { - "mean": 12.5, - "std": 0.0701, - "min": 12.5, - "max": 12.8, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "68_conv_transposed_3D__square_input__asymmetric_kernel.py": { - "mean": 393.0, - "std": 0.127, - "min": 393.0, - "max": 393.0, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "69_conv_transposed_2D__asymmetric_input__asymmetric_kernel.py": { - "mean": 22.2, - "std": 0.479, - "min": 21.2, - "max": 23.8, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "70_conv_transposed_3D__asymmetric_input__square_kernel.py": { - "mean": 87.8, - "std": 0.0361, - "min": 87.7, - "max": 88.0, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "71_conv_transposed_2D__asymmetric_input__square_kernel.py": { - "mean": 12.8, - "std": 0.0219, - "min": 12.7, - "max": 12.9, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "72_conv_transposed_3D_asymmetric_input_asymmetric_kernel___strided_padded_grouped_.py": { - "mean": 6.62, - "std": 0.157, - "min": 5.73, - "max": 6.87, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "73_conv_transposed_3D_asymmetric_input_square_kernel__strided_padded__grouped.py": { - "mean": 17.3, - "std": 0.322, - "min": 16.5, - "max": 18.9, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "74_conv_transposed_1D_dilated.py": { - "mean": 5.79, - "std": 0.0773, - "min": 5.46, - "max": 5.88, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.py": { - "mean": 14.3, - "std": 0.048, - "min": 14.3, - "max": 14.6, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "76_conv_standard_1D_dilated_strided__.py": { - "mean": 35.6, - "std": 0.569, - "min": 34.6, - "max": 36.8, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__.py": { - "mean": 6.12, - "std": 0.0542, - "min": 6.08, - "max": 6.51, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__.py": { - "mean": 18.5, - "std": 0.0573, - "min": 18.4, - "max": 18.7, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__.py": { - "mean": 11.0, - "std": 0.13, - "min": 10.8, - "max": 11.7, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__.py": { - "mean": 15.9, - "std": 0.0591, - "min": 15.9, - "max": 16.3, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__.py": { - "mean": 6.4, - "std": 0.138, - "min": 6.28, - "max": 6.82, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "82_conv_depthwise_2D_square_input_square_kernel.py": { - "mean": 14.0, - "std": 0.113, - "min": 14.0, - "max": 15.1, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "83_conv_depthwise_2D_square_input_asymmetric_kernel.py": { - "mean": 32.8, - "std": 0.106, - "min": 32.6, - "max": 33.2, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "84_conv_depthwise_2D_asymmetric_input_square_kernel.py": { - "mean": 55.6, - "std": 0.0411, - "min": 55.6, - "max": 55.8, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "85_conv_depthwise_2D_asymmetric_input_asymmetric_kernel.py": { - "mean": 47.9, - "std": 0.31, - "min": 47.2, - "max": 48.8, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "86_conv_depthwise_separable_2D.py": { - "mean": 25.3, - "std": 0.0247, - "min": 25.3, - "max": 25.4, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "87_conv_pointwise_2D.py": { - "mean": 83.6, - "std": 0.0593, - "min": 83.5, - "max": 83.9, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "88_MinGPTNewGelu.py": { - "mean": 1.16, - "std": 0.0281, - "min": 1.14, - "max": 1.29, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "89_cumsum.py": { - "mean": 17.7, - "std": 0.0877, - "min": 17.6, - "max": 18.0, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "90_cumprod.py": { - "mean": 17.8, - "std": 0.0928, - "min": 17.7, - "max": 18.1, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "91_cumsum_reverse.py": { - "mean": 35.2, - "std": 0.0892, - "min": 35.1, - "max": 35.4, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "92_cumsum_exclusive.py": { - "mean": 17.7, - "std": 0.0365, - "min": 17.7, - "max": 18.1, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "93_masked_cumsum.py": { - "mean": 19.9, - "std": 0.0254, - "min": 19.9, - "max": 20.1, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "94_MSELoss.py": { - "mean": 17.2, - "std": 0.207, - "min": 17.0, - "max": 18.6, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "95_CrossEntropyLoss.py": { - "mean": 1.13, - "std": 0.00513, - "min": 1.12, - "max": 1.15, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "96_HuberLoss.py": { - "mean": 17.1, - "std": 0.0508, - "min": 17.1, - "max": 17.3, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - }, - "97_ScaledDotProductAttention.py": { - "mean": 44.6, - "std": 1.99, - "min": 44.2, - "max": 64.3, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "98_KLDivLoss.py": { - "mean": 4.21, - "std": 0.0217, - "min": 4.2, - "max": 4.32, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "99_TripletMarginLoss.py": { - "mean": 6.35, - "std": 0.00473, - "min": 6.34, - "max": 6.36, - "num_trials": 100, - "hardware": "NVIDIA A10G", - "device": "cuda:0" - }, - "100_HingeLoss.py": { - "mean": 8.55, - "std": 0.505, - "min": 8.4, - "max": 13.5, - "num_trials": 100, - "hardware": "NVIDIA A10", - "device": "cuda:0" - } - } -} \ No newline at end of file diff --git a/results/timing/H100_modal/baseline_time_torch.json b/results/timing/H100_modal/baseline_time_torch.json deleted file mode 100644 index 5bdcd393..00000000 --- a/results/timing/H100_modal/baseline_time_torch.json +++ /dev/null @@ -1,904 +0,0 @@ -{ - "level1": { - "1_Square_matrix_multiplication_.py": { - "mean": 2.66, - "std": 0.00178, - "min": 2.66, - "max": 2.67, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "2_Standard_matrix_multiplication_.py": { - "mean": 2.64, - "std": 0.0039, - "min": 2.64, - "max": 2.67, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "3_Batched_matrix_multiplication.py": { - "mean": 5.34, - "std": 0.00552, - "min": 5.33, - "max": 5.38, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "4_Matrix_vector_multiplication_.py": { - "mean": 2.78, - "std": 0.00237, - "min": 2.78, - "max": 2.79, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "5_Matrix_scalar_multiplication.py": { - "mean": 2.84, - "std": 0.00653, - "min": 2.83, - "max": 2.87, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "6_Matmul_with_large_K_dimension_.py": { - "mean": 1.32, - "std": 0.00464, - "min": 1.31, - "max": 1.34, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "7_Matmul_with_small_K_dimension_.py": { - "mean": 4.12, - "std": 0.00954, - "min": 4.11, - "max": 4.21, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "8_Matmul_with_irregular_shapes_.py": { - "mean": 6.42, - "std": 0.00544, - "min": 6.41, - "max": 6.45, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "9_Tall_skinny_matrix_multiplication_.py": { - "mean": 2.61, - "std": 0.00374, - "min": 2.61, - "max": 2.63, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "10_3D_tensor_matrix_multiplication.py": { - "mean": 1.05, - "std": 0.00159, - "min": 1.04, - "max": 1.06, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "11_4D_tensor_matrix_multiplication.py": { - "mean": 11.1, - "std": 0.957, - "min": 10.1, - "max": 13.6, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "12_Matmul_with_diagonal_matrices_.py": { - "mean": 2.69, - "std": 0.00425, - "min": 2.68, - "max": 2.7, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "13_Matmul_for_symmetric_matrices.py": { - "mean": 2.65, - "std": 0.0045, - "min": 2.65, - "max": 2.67, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "14_Matmul_for_upper_triangular_matrices.py": { - "mean": 2.71, - "std": 0.00376, - "min": 2.7, - "max": 2.73, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "15_Matmul_for_lower_triangular_matrices.py": { - "mean": 2.71, - "std": 0.00182, - "min": 2.71, - "max": 2.72, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "16_Matmul_with_transposed_A.py": { - "mean": 2.62, - "std": 0.00207, - "min": 2.61, - "max": 2.62, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "17_Matmul_with_transposed_B.py": { - "mean": 2.74, - "std": 0.00801, - "min": 2.71, - "max": 2.76, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "18_Matmul_with_transposed_both.py": { - "mean": 2.78, - "std": 0.00828, - "min": 2.76, - "max": 2.81, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "19_ReLU.py": { - "mean": 4.27, - "std": 0.00845, - "min": 4.26, - "max": 4.35, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "20_LeakyReLU.py": { - "mean": 4.27, - "std": 0.00191, - "min": 4.26, - "max": 4.27, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "21_Sigmoid.py": { - "mean": 4.26, - "std": 0.00198, - "min": 4.26, - "max": 4.27, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "22_Tanh.py": { - "mean": 3.05, - "std": 0.00172, - "min": 3.04, - "max": 3.05, - "num_trials": 100, - "hardware": "NVIDIA H200", - "device": "cuda:0" - }, - "23_Softmax.py": { - "mean": 7.12, - "std": 0.0142, - "min": 7.11, - "max": 7.18, - "num_trials": 100, - "hardware": "NVIDIA H200", - "device": "cuda:0" - }, - "24_LogSoftmax.py": { - "mean": 6.18, - "std": 0.0645, - "min": 6.06, - "max": 6.33, - "num_trials": 100, - "hardware": "NVIDIA H200", - "device": "cuda:0" - }, - "25_Swish.py": { - "mean": 10.6, - "std": 0.00389, - "min": 10.6, - "max": 10.6, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "26_GELU_.py": { - "mean": 4.24, - "std": 0.00177, - "min": 4.23, - "max": 4.24, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "27_SELU_.py": { - "mean": 3.02, - "std": 0.00174, - "min": 3.02, - "max": 3.03, - "num_trials": 100, - "hardware": "NVIDIA H200", - "device": "cuda:0" - }, - "28_HardSigmoid.py": { - "mean": 4.26, - "std": 0.00202, - "min": 4.26, - "max": 4.27, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "29_Softplus.py": { - "mean": 4.23, - "std": 0.00216, - "min": 4.23, - "max": 4.24, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "30_Softsign.py": { - "mean": 10.4, - "std": 0.00375, - "min": 10.4, - "max": 10.4, - "num_trials": 100, - "hardware": "NVIDIA H200", - "device": "cuda:0" - }, - "31_ELU.py": { - "mean": 4.24, - "std": 0.00229, - "min": 4.24, - "max": 4.25, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "32_HardTanh.py": { - "mean": 4.24, - "std": 0.00168, - "min": 4.23, - "max": 4.24, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "33_BatchNorm.py": { - "mean": 8.8, - "std": 0.016, - "min": 8.77, - "max": 8.85, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "34_InstanceNorm.py": { - "mean": 9.57, - "std": 0.0119, - "min": 9.55, - "max": 9.6, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "35_GroupNorm_.py": { - "mean": 9.94, - "std": 0.0103, - "min": 9.93, - "max": 10.0, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "36_RMSNorm_.py": { - "mean": 14.2, - "std": 0.00266, - "min": 14.2, - "max": 14.2, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "37_FrobeniusNorm_.py": { - "mean": 8.42, - "std": 0.00264, - "min": 8.41, - "max": 8.42, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "38_L1Norm_.py": { - "mean": 15.5, - "std": 0.00742, - "min": 15.5, - "max": 15.5, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "39_L2Norm_.py": { - "mean": 10.0, - "std": 0.0024, - "min": 10.0, - "max": 10.0, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "40_LayerNorm.py": { - "mean": 8.12, - "std": 0.00649, - "min": 8.11, - "max": 8.16, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "41_Max_Pooling_1D.py": { - "mean": 10.7, - "std": 0.00997, - "min": 10.7, - "max": 10.7, - "num_trials": 100, - "hardware": "NVIDIA H200", - "device": "cuda:0" - }, - "42_Max_Pooling_2D.py": { - "mean": 10.7, - "std": 0.00823, - "min": 10.7, - "max": 10.8, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "43_Max_Pooling_3D.py": { - "mean": 3.93, - "std": 0.00239, - "min": 3.93, - "max": 3.94, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "44_Average_Pooling_1D.py": { - "mean": 8.02, - "std": 0.0057, - "min": 8.01, - "max": 8.06, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "45_Average_Pooling_2D.py": { - "mean": 6.6, - "std": 0.0259, - "min": 6.55, - "max": 6.7, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "46_Average_Pooling_3D.py": { - "mean": 8.66, - "std": 0.00616, - "min": 8.65, - "max": 8.69, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "47_Sum_reduction_over_a_dimension.py": { - "mean": 2.11, - "std": 0.0154, - "min": 2.08, - "max": 2.16, - "num_trials": 100, - "hardware": "NVIDIA H200", - "device": "cuda:0" - }, - "48_Mean_reduction_over_a_dimension.py": { - "mean": 2.89, - "std": 0.0163, - "min": 2.86, - "max": 2.94, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "49_Max_reduction_over_a_dimension.py": { - "mean": 3.17, - "std": 0.00295, - "min": 3.16, - "max": 3.17, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "50_conv_standard_2D__square_input__square_kernel.py": { - "mean": 2.09, - "std": 0.0172, - "min": 2.08, - "max": 2.15, - "num_trials": 100, - "hardware": "NVIDIA H200", - "device": "cuda:0" - }, - "51_Argmax_over_a_dimension.py": { - "mean": 3.25, - "std": 0.00328, - "min": 3.25, - "max": 3.27, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "52_Argmin_over_a_dimension.py": { - "mean": 3.24, - "std": 0.00269, - "min": 3.24, - "max": 3.25, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "53_Min_reduction_over_a_dimension.py": { - "mean": 3.18, - "std": 0.00328, - "min": 3.17, - "max": 3.19, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "54_conv_standard_3D__square_input__square_kernel.py": { - "mean": 1.36, - "std": 0.00258, - "min": 1.36, - "max": 1.38, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "55_conv_standard_2D__asymmetric_input__square_kernel.py": { - "mean": 4.18, - "std": 0.0626, - "min": 4.0, - "max": 4.27, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "56_conv_standard_2D__asymmetric_input__asymmetric_kernel.py": { - "mean": 3.44, - "std": 0.046, - "min": 3.37, - "max": 3.54, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "57_conv_transposed_2D__square_input__square_kernel.py": { - "mean": 6.56, - "std": 0.0393, - "min": 6.53, - "max": 6.86, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "58_conv_transposed_3D__asymmetric_input__asymmetric_kernel.py": { - "mean": 2.32, - "std": 0.0145, - "min": 2.29, - "max": 2.37, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "59_conv_standard_3D__asymmetric_input__square_kernel.py": { - "mean": 2.09, - "std": 0.0049, - "min": 2.08, - "max": 2.11, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "60_conv_standard_3D__square_input__asymmetric_kernel.py": { - "mean": 5.29, - "std": 0.0129, - "min": 5.26, - "max": 5.31, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "61_conv_transposed_3D__square_input__square_kernel.py": { - "mean": 5.5, - "std": 0.011, - "min": 5.48, - "max": 5.58, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "62_conv_standard_2D__square_input__asymmetric_kernel.py": { - "mean": 3.64, - "std": 0.0747, - "min": 3.56, - "max": 3.85, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "63_conv_standard_2D__square_input__square_kernel.py": { - "mean": 7.05, - "std": 0.0139, - "min": 7.03, - "max": 7.11, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "64_conv_transposed_1D.py": { - "mean": 5.28, - "std": 0.0103, - "min": 5.25, - "max": 5.31, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "65_conv_transposed_2D__square_input__asymmetric_kernel.py": { - "mean": 2.71, - "std": 0.0129, - "min": 2.68, - "max": 2.75, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "66_conv_standard_3D__asymmetric_input__asymmetric_kernel.py": { - "mean": 2.6, - "std": 0.00198, - "min": 2.6, - "max": 2.61, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "67_conv_standard_1D.py": { - "mean": 2.66, - "std": 0.0156, - "min": 2.63, - "max": 2.68, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "68_conv_transposed_3D__square_input__asymmetric_kernel.py": { - "mean": 9.54, - "std": 0.0113, - "min": 9.52, - "max": 9.61, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "69_conv_transposed_2D__asymmetric_input__asymmetric_kernel.py": { - "mean": 2.75, - "std": 0.0209, - "min": 2.72, - "max": 2.81, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "70_conv_transposed_3D__asymmetric_input__square_kernel.py": { - "mean": 9.85, - "std": 0.0364, - "min": 9.83, - "max": 10.1, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "71_conv_transposed_2D__asymmetric_input__square_kernel.py": { - "mean": 1.59, - "std": 0.00375, - "min": 1.58, - "max": 1.6, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "72_conv_transposed_3D_asymmetric_input_asymmetric_kernel___strided_padded_grouped_.py": { - "mean": 2.89, - "std": 0.00619, - "min": 2.88, - "max": 2.91, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "73_conv_transposed_3D_asymmetric_input_square_kernel__strided_padded__grouped.py": { - "mean": 2.08, - "std": 0.00526, - "min": 2.07, - "max": 2.09, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "74_conv_transposed_1D_dilated.py": { - "mean": 1.89, - "std": 0.017, - "min": 1.87, - "max": 2.03, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.py": { - "mean": 6.68, - "std": 0.00872, - "min": 6.67, - "max": 6.75, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "76_conv_standard_1D_dilated_strided__.py": { - "mean": 12.2, - "std": 0.0506, - "min": 12.2, - "max": 12.4, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__.py": { - "mean": 1.95, - "std": 0.0152, - "min": 1.91, - "max": 1.99, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__.py": { - "mean": 2.42, - "std": 0.00491, - "min": 2.41, - "max": 2.43, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__.py": { - "mean": 1.93, - "std": 0.00786, - "min": 1.92, - "max": 1.95, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__.py": { - "mean": 3.53, - "std": 0.00861, - "min": 3.52, - "max": 3.56, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__.py": { - "mean": 1.81, - "std": 0.0117, - "min": 1.78, - "max": 1.83, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "82_conv_depthwise_2D_square_input_square_kernel.py": { - "mean": 2.54, - "std": 0.00128, - "min": 2.54, - "max": 2.54, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "83_conv_depthwise_2D_square_input_asymmetric_kernel.py": { - "mean": 1.47, - "std": 0.00158, - "min": 1.47, - "max": 1.48, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "84_conv_depthwise_2D_asymmetric_input_square_kernel.py": { - "mean": 10.1, - "std": 0.00491, - "min": 10.1, - "max": 10.2, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "85_conv_depthwise_2D_asymmetric_input_asymmetric_kernel.py": { - "mean": 2.31, - "std": 0.00502, - "min": 2.31, - "max": 2.34, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "86_conv_depthwise_separable_2D.py": { - "mean": 3.7, - "std": 0.013, - "min": 3.67, - "max": 3.72, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "87_conv_pointwise_2D.py": { - "mean": 4.67, - "std": 0.00559, - "min": 4.66, - "max": 4.69, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "88_MinGPTNewGelu.py": { - "mean": 1.61, - "std": 0.00118, - "min": 1.6, - "max": 1.61, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "89_cumsum.py": { - "mean": 4.65, - "std": 0.00818, - "min": 4.64, - "max": 4.67, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "90_cumprod.py": { - "mean": 4.64, - "std": 0.00386, - "min": 4.63, - "max": 4.65, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "91_cumsum_reverse.py": { - "mean": 11.4, - "std": 0.0112, - "min": 11.4, - "max": 11.4, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "92_cumsum_exclusive.py": { - "mean": 8.86, - "std": 0.0191, - "min": 8.83, - "max": 8.93, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "93_masked_cumsum.py": { - "mean": 8.67, - "std": 0.00487, - "min": 8.66, - "max": 8.7, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "94_MSELoss.py": { - "mean": 8.43, - "std": 0.00298, - "min": 8.42, - "max": 8.44, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "95_CrossEntropyLoss.py": { - "mean": 1.45, - "std": 0.0022, - "min": 1.44, - "max": 1.45, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "96_HuberLoss.py": { - "mean": 5.52, - "std": 0.00201, - "min": 5.51, - "max": 5.53, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "97_ScaledDotProductAttention.py": { - "mean": 8.23, - "std": 0.193, - "min": 8.01, - "max": 9.38, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "98_KLDivLoss.py": { - "mean": 3.89, - "std": 0.00194, - "min": 3.89, - "max": 3.9, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "99_TripletMarginLoss.py": { - "mean": 4.25, - "std": 0.0129, - "min": 4.24, - "max": 4.37, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "100_HingeLoss.py": { - "mean": 10.4, - "std": 0.00488, - "min": 10.4, - "max": 10.5, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - } - } -} \ No newline at end of file diff --git a/results/timing/H100_modal/baseline_time_torch_compile_inductor_default.json b/results/timing/H100_modal/baseline_time_torch_compile_inductor_default.json deleted file mode 100644 index ee1fb338..00000000 --- a/results/timing/H100_modal/baseline_time_torch_compile_inductor_default.json +++ /dev/null @@ -1,904 +0,0 @@ -{ - "level1": { - "1_Square_matrix_multiplication_.py": { - "mean": 2.66, - "std": 0.00503, - "min": 2.66, - "max": 2.68, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "2_Standard_matrix_multiplication_.py": { - "mean": 2.67, - "std": 0.00828, - "min": 2.65, - "max": 2.68, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "3_Batched_matrix_multiplication.py": { - "mean": 5.32, - "std": 0.0181, - "min": 5.3, - "max": 5.37, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "4_Matrix_vector_multiplication_.py": { - "mean": 2.9, - "std": 0.00233, - "min": 2.9, - "max": 2.91, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "5_Matrix_scalar_multiplication.py": { - "mean": 2.88, - "std": 0.0132, - "min": 2.86, - "max": 2.98, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "6_Matmul_with_large_K_dimension_.py": { - "mean": 1.33, - "std": 0.00488, - "min": 1.33, - "max": 1.35, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "7_Matmul_with_small_K_dimension_.py": { - "mean": 4.14, - "std": 0.0273, - "min": 4.12, - "max": 4.35, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "8_Matmul_with_irregular_shapes_.py": { - "mean": 6.44, - "std": 0.00478, - "min": 6.43, - "max": 6.46, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "9_Tall_skinny_matrix_multiplication_.py": { - "mean": 2.63, - "std": 0.0036, - "min": 2.63, - "max": 2.65, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "10_3D_tensor_matrix_multiplication.py": { - "mean": 1.07, - "std": 0.00272, - "min": 1.06, - "max": 1.08, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "11_4D_tensor_matrix_multiplication.py": { - "mean": 10.8, - "std": 1.01, - "min": 10.0, - "max": 13.2, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "12_Matmul_with_diagonal_matrices_.py": { - "mean": 2.69, - "std": 0.0112, - "min": 2.68, - "max": 2.72, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "13_Matmul_for_symmetric_matrices.py": { - "mean": 2.66, - "std": 0.00314, - "min": 2.66, - "max": 2.68, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "14_Matmul_for_upper_triangular_matrices.py": { - "mean": 2.72, - "std": 0.0045, - "min": 2.72, - "max": 2.75, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "15_Matmul_for_lower_triangular_matrices.py": { - "mean": 2.73, - "std": 0.005, - "min": 2.72, - "max": 2.75, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "16_Matmul_with_transposed_A.py": { - "mean": 2.65, - "std": 0.00467, - "min": 2.64, - "max": 2.67, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "17_Matmul_with_transposed_B.py": { - "mean": 2.77, - "std": 0.0083, - "min": 2.76, - "max": 2.8, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "18_Matmul_with_transposed_both.py": { - "mean": 2.79, - "std": 0.0101, - "min": 2.77, - "max": 2.82, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "19_ReLU.py": { - "mean": 4.31, - "std": 0.00483, - "min": 4.29, - "max": 4.33, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "20_LeakyReLU.py": { - "mean": 4.29, - "std": 0.00853, - "min": 4.28, - "max": 4.33, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "21_Sigmoid.py": { - "mean": 4.29, - "std": 0.0126, - "min": 4.27, - "max": 4.39, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "22_Tanh.py": { - "mean": 3.07, - "std": 0.0213, - "min": 3.06, - "max": 3.27, - "num_trials": 100, - "hardware": "NVIDIA H200", - "device": "cuda:0" - }, - "23_Softmax.py": { - "mean": 8.68, - "std": 0.0881, - "min": 8.65, - "max": 9.55, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "24_LogSoftmax.py": { - "mean": 8.65, - "std": 0.0181, - "min": 8.63, - "max": 8.74, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "25_Swish.py": { - "mean": 4.28, - "std": 0.00737, - "min": 4.27, - "max": 4.32, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "26_GELU_.py": { - "mean": 4.27, - "std": 0.00737, - "min": 4.26, - "max": 4.29, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "27_SELU_.py": { - "mean": 4.27, - "std": 0.0127, - "min": 4.26, - "max": 4.37, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "28_HardSigmoid.py": { - "mean": 4.29, - "std": 0.03, - "min": 4.27, - "max": 4.58, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "29_Softplus.py": { - "mean": 4.29, - "std": 0.0411, - "min": 4.27, - "max": 4.58, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "30_Softsign.py": { - "mean": 3.06, - "std": 0.00498, - "min": 3.05, - "max": 3.08, - "num_trials": 100, - "hardware": "NVIDIA H200", - "device": "cuda:0" - }, - "31_ELU.py": { - "mean": 4.28, - "std": 0.0248, - "min": 4.27, - "max": 4.52, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "32_HardTanh.py": { - "mean": 4.29, - "std": 0.0059, - "min": 4.28, - "max": 4.31, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "33_BatchNorm.py": { - "mean": 4.24, - "std": 0.00629, - "min": 4.22, - "max": 4.26, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "34_InstanceNorm.py": { - "mean": 7.66, - "std": 0.027, - "min": 7.64, - "max": 7.9, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "35_GroupNorm_.py": { - "mean": 7.49, - "std": 0.00591, - "min": 7.48, - "max": 7.51, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "36_RMSNorm_.py": { - "mean": 7.8, - "std": 0.00398, - "min": 7.8, - "max": 7.82, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "37_FrobeniusNorm_.py": { - "mean": 7.32, - "std": 0.0091, - "min": 7.31, - "max": 7.36, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "38_L1Norm_.py": { - "mean": 13.0, - "std": 0.0182, - "min": 13.0, - "max": 13.1, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "39_L2Norm_.py": { - "mean": 13.4, - "std": 0.029, - "min": 13.3, - "max": 13.6, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "40_LayerNorm.py": { - "mean": 0.476, - "std": 0.0031, - "min": 0.472, - "max": 0.491, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "41_Max_Pooling_1D.py": { - "mean": 10.7, - "std": 0.0108, - "min": 10.7, - "max": 10.8, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "42_Max_Pooling_2D.py": { - "mean": 4.45, - "std": 0.00646, - "min": 4.44, - "max": 4.49, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "43_Max_Pooling_3D.py": { - "mean": 3.95, - "std": 0.00396, - "min": 3.94, - "max": 3.97, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "44_Average_Pooling_1D.py": { - "mean": 1.89, - "std": 0.00262, - "min": 1.88, - "max": 1.9, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "45_Average_Pooling_2D.py": { - "mean": 6.43, - "std": 0.0493, - "min": 6.36, - "max": 6.79, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "46_Average_Pooling_3D.py": { - "mean": 8.71, - "std": 0.0143, - "min": 8.69, - "max": 8.81, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "47_Sum_reduction_over_a_dimension.py": { - "mean": 3.5, - "std": 0.0161, - "min": 3.45, - "max": 3.54, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "48_Mean_reduction_over_a_dimension.py": { - "mean": 3.39, - "std": 0.0227, - "min": 3.34, - "max": 3.55, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "49_Max_reduction_over_a_dimension.py": { - "mean": 3.49, - "std": 0.0149, - "min": 3.45, - "max": 3.56, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "50_conv_standard_2D__square_input__square_kernel.py": { - "mean": 1.66, - "std": 0.0152, - "min": 1.64, - "max": 1.72, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "51_Argmax_over_a_dimension.py": { - "mean": 2.94, - "std": 0.00652, - "min": 2.93, - "max": 2.96, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "52_Argmin_over_a_dimension.py": { - "mean": 3.07, - "std": 0.00532, - "min": 3.06, - "max": 3.09, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "53_Min_reduction_over_a_dimension.py": { - "mean": 3.48, - "std": 0.0141, - "min": 3.44, - "max": 3.52, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "54_conv_standard_3D__square_input__square_kernel.py": { - "mean": 1.4, - "std": 0.00443, - "min": 1.39, - "max": 1.42, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "55_conv_standard_2D__asymmetric_input__square_kernel.py": { - "mean": 4.63, - "std": 0.006, - "min": 4.61, - "max": 4.64, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "56_conv_standard_2D__asymmetric_input__asymmetric_kernel.py": { - "mean": 3.68, - "std": 0.037, - "min": 3.62, - "max": 3.76, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "57_conv_transposed_2D__square_input__square_kernel.py": { - "mean": 6.56, - "std": 0.00741, - "min": 6.54, - "max": 6.58, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "58_conv_transposed_3D__asymmetric_input__asymmetric_kernel.py": { - "mean": 2.32, - "std": 0.0159, - "min": 2.29, - "max": 2.37, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "59_conv_standard_3D__asymmetric_input__square_kernel.py": { - "mean": 2.11, - "std": 0.00394, - "min": 2.1, - "max": 2.13, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "60_conv_standard_3D__square_input__asymmetric_kernel.py": { - "mean": 5.33, - "std": 0.0602, - "min": 5.32, - "max": 5.79, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "61_conv_transposed_3D__square_input__square_kernel.py": { - "mean": 5.51, - "std": 0.0104, - "min": 5.48, - "max": 5.54, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "62_conv_standard_2D__square_input__asymmetric_kernel.py": { - "mean": 2.68, - "std": 0.0374, - "min": 2.66, - "max": 3.0, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "63_conv_standard_2D__square_input__square_kernel.py": { - "mean": 13.9, - "std": 0.0149, - "min": 13.9, - "max": 14.0, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "64_conv_transposed_1D.py": { - "mean": 5.32, - "std": 0.0562, - "min": 5.29, - "max": 5.72, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "65_conv_transposed_2D__square_input__asymmetric_kernel.py": { - "mean": 2.72, - "std": 0.0112, - "min": 2.68, - "max": 2.74, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "66_conv_standard_3D__asymmetric_input__asymmetric_kernel.py": { - "mean": 2.65, - "std": 0.0043, - "min": 2.65, - "max": 2.68, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "67_conv_standard_1D.py": { - "mean": 2.69, - "std": 0.0361, - "min": 2.65, - "max": 3.01, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "68_conv_transposed_3D__square_input__asymmetric_kernel.py": { - "mean": 9.55, - "std": 0.00915, - "min": 9.53, - "max": 9.58, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "69_conv_transposed_2D__asymmetric_input__asymmetric_kernel.py": { - "mean": 2.74, - "std": 0.0113, - "min": 2.72, - "max": 2.78, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "70_conv_transposed_3D__asymmetric_input__square_kernel.py": { - "mean": 10.0, - "std": 0.0098, - "min": 9.98, - "max": 10.0, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "71_conv_transposed_2D__asymmetric_input__square_kernel.py": { - "mean": 1.59, - "std": 0.00411, - "min": 1.58, - "max": 1.6, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "72_conv_transposed_3D_asymmetric_input_asymmetric_kernel___strided_padded_grouped_.py": { - "mean": 2.93, - "std": 0.00823, - "min": 2.92, - "max": 2.96, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "73_conv_transposed_3D_asymmetric_input_square_kernel__strided_padded__grouped.py": { - "mean": 2.07, - "std": 0.00509, - "min": 2.06, - "max": 2.09, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "74_conv_transposed_1D_dilated.py": { - "mean": 1.93, - "std": 0.00876, - "min": 1.91, - "max": 1.95, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "75_conv_transposed_2D_asymmetric_input_asymmetric_kernel_strided__grouped____padded____dilated__.py": { - "mean": 6.6, - "std": 0.0337, - "min": 6.58, - "max": 6.91, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "76_conv_standard_1D_dilated_strided__.py": { - "mean": 12.4, - "std": 0.0826, - "min": 12.4, - "max": 12.7, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "77_conv_transposed_3D_square_input_square_kernel___padded____dilated____strided__.py": { - "mean": 1.98, - "std": 0.0146, - "min": 1.95, - "max": 2.03, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "78_conv_transposed_2D_asymmetric_input_asymmetric_kernel___padded__.py": { - "mean": 2.37, - "std": 0.0217, - "min": 2.35, - "max": 2.55, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "79_conv_transposed_1D_asymmetric_input_square_kernel___padded____strided____dilated__.py": { - "mean": 1.93, - "std": 0.00781, - "min": 1.92, - "max": 1.95, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "80_conv_standard_2D_square_input_asymmetric_kernel___dilated____padded__.py": { - "mean": 2.65, - "std": 0.0159, - "min": 2.63, - "max": 2.68, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "81_conv_transposed_2D_asymmetric_input_square_kernel___dilated____padded____strided__.py": { - "mean": 1.71, - "std": 0.00818, - "min": 1.7, - "max": 1.76, - "num_trials": 100, - "hardware": "NVIDIA H200", - "device": "cuda:0" - }, - "82_conv_depthwise_2D_square_input_square_kernel.py": { - "mean": 2.57, - "std": 0.0708, - "min": 2.53, - "max": 3.09, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "83_conv_depthwise_2D_square_input_asymmetric_kernel.py": { - "mean": 19.0, - "std": 0.273, - "min": 18.8, - "max": 20.3, - "num_trials": 100, - "hardware": "NVIDIA H200", - "device": "cuda:0" - }, - "84_conv_depthwise_2D_asymmetric_input_square_kernel.py": { - "mean": 9.79, - "std": 0.00959, - "min": 9.77, - "max": 9.81, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "85_conv_depthwise_2D_asymmetric_input_asymmetric_kernel.py": { - "mean": 13.9, - "std": 0.00974, - "min": 13.9, - "max": 14.0, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "86_conv_depthwise_separable_2D.py": { - "mean": 3.41, - "std": 0.28, - "min": 3.31, - "max": 5.34, - "num_trials": 100, - "hardware": "NVIDIA H200", - "device": "cuda:0" - }, - "87_conv_pointwise_2D.py": { - "mean": 10.6, - "std": 0.098, - "min": 10.5, - "max": 10.9, - "num_trials": 100, - "hardware": "NVIDIA H200", - "device": "cuda:0" - }, - "88_MinGPTNewGelu.py": { - "mean": 0.161, - "std": 0.00496, - "min": 0.154, - "max": 0.178, - "num_trials": 100, - "hardware": "NVIDIA H200", - "device": "cuda:0" - }, - "89_cumsum.py": { - "mean": 2.69, - "std": 0.0867, - "min": 2.66, - "max": 3.45, - "num_trials": 100, - "hardware": "NVIDIA H200", - "device": "cuda:0" - }, - "90_cumprod.py": { - "mean": 2.64, - "std": 0.0149, - "min": 2.63, - "max": 2.73, - "num_trials": 100, - "hardware": "NVIDIA H200", - "device": "cuda:0" - }, - "91_cumsum_reverse.py": { - "mean": 5.87, - "std": 0.00331, - "min": 5.86, - "max": 5.88, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "92_cumsum_exclusive.py": { - "mean": 5.67, - "std": 0.01, - "min": 5.66, - "max": 5.75, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "93_masked_cumsum.py": { - "mean": 3.31, - "std": 0.0057, - "min": 3.3, - "max": 3.33, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "94_MSELoss.py": { - "mean": 2.78, - "std": 0.00263, - "min": 2.78, - "max": 2.79, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "95_CrossEntropyLoss.py": { - "mean": 0.234, - "std": 0.0313, - "min": 0.223, - "max": 0.541, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "96_HuberLoss.py": { - "mean": 2.79, - "std": 0.00316, - "min": 2.78, - "max": 2.8, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "97_ScaledDotProductAttention.py": { - "mean": 8.25, - "std": 0.228, - "min": 8.1, - "max": 9.25, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "98_KLDivLoss.py": { - "mean": 0.745, - "std": 0.0237, - "min": 0.735, - "max": 0.965, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "99_TripletMarginLoss.py": { - "mean": 1.04, - "std": 0.00274, - "min": 1.04, - "max": 1.05, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - }, - "100_HingeLoss.py": { - "mean": 1.41, - "std": 0.00369, - "min": 1.4, - "max": 1.42, - "num_trials": 100, - "hardware": "NVIDIA H100 80GB HBM3", - "device": "cuda:0" - } - } -} \ No newline at end of file From 3e31530662468e82dd3e30365777f766b41f0d9b Mon Sep 17 00:00:00 2001 From: pythonomar22 Date: Thu, 20 Nov 2025 21:00:22 -0800 Subject: [PATCH 5/5] fallback correcting --- src/eval.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/eval.py b/src/eval.py index 1ae8b83c..7e73479e 100644 --- a/src/eval.py +++ b/src/eval.py @@ -890,7 +890,8 @@ def fetch_baseline_time( if hasattr(dataset, "get_problem_by_id"): problem_path = dataset.get_problem_by_id(problem_id) else: - problem_path = dataset[problem_id] + # Fallback for old list-based API: problem_id is 1-indexed but lists are 0-indexed + problem_path = dataset[problem_id - 1] problem_name = os.path.basename(problem_path) baseline_time = baseline_json[level_name].get(problem_name, None)