diff --git a/scripts/benchmark_eval_analysis.py b/scripts/benchmark_eval_analysis.py
index e2bea005..3eb8f831 100644
--- a/scripts/benchmark_eval_analysis.py
+++ b/scripts/benchmark_eval_analysis.py
@@ -19,7 +19,15 @@
 ```
 python3 scripts/benchmark_eval_analysis.py run_name=<run_name> level=<level> hardware=<hardware> baseline=<baseline>
 ```
-hardware + baseline should correspond to the results/timing/hardware/baseline.json file
+hardware + baseline should correspond to the results/timing/{hardware}/{baseline}.json file
+
+Optional path overrides (for external tools like leaderboards):
+```
+python3 scripts/benchmark_eval_analysis.py run_name=<run_name> level=<level> hardware=<hardware> baseline=<baseline> \
+    baseline_file=/path/to/baseline.json \
+    eval_results_dir=/path/to/runs \
+    output_file=/path/to/output.json
+```
 """
@@ -32,6 +40,11 @@ def __init__(self):
         self.hardware = REQUIRED  # hardware to evaluate
         self.baseline = REQUIRED  # baseline to compare against

+        # Optional path overrides (default to the standard KernelBench paths)
+        self.baseline_file = None  # Override: direct path to baseline JSON
+        self.eval_results_dir = None  # Override: path to runs directory
+        self.output_file = None  # Write JSON output to this file
+
     def __repr__(self):
         return f"AnalysisConfig({self.to_dict()})"
@@ -54,24 +67,36 @@ def patch(eval_results, dataset):
     return eval_results


-def analyze_greedy_eval(run_name, hardware, baseline, level):
+def analyze_greedy_eval(run_name, hardware, baseline, level,
+                        baseline_file=None, eval_results_dir=None) -> dict:
     """
-    Analyze the greedy eval results for a run of a particular level
+    Analyze the greedy eval results for a run of a particular level.
+
+    Returns a dict with all computed metrics.
     """
     dataset = construct_kernelbench_dataset(level)

-    # load json
-    eval_file_path = f"runs/{run_name}/eval_results.json"
+    # Resolve eval results paths (use override if provided)
+    if eval_results_dir:
+        eval_file_path = os.path.join(eval_results_dir, run_name, "eval_results.json")
+        pass_at_k_file_path = os.path.join(eval_results_dir, run_name, "pass_at_k_results.json")
+    else:
+        eval_file_path = f"runs/{run_name}/eval_results.json"
+        pass_at_k_file_path = f"runs/{run_name}/pass_at_k_results.json"
+
     assert os.path.exists(
         eval_file_path
     ), f"Eval file does not exist at {eval_file_path}"

-    # Check if pass@k results exist
-    pass_at_k_file_path = f"runs/{run_name}/pass_at_k_results.json"
     has_pass_at_k_results = os.path.exists(pass_at_k_file_path)

-    baseline_file_path = f"results/timing/{hardware}/{baseline}.json"
+    # Resolve baseline path (use override if provided)
+    if baseline_file:
+        baseline_file_path = baseline_file
+    else:
+        baseline_file_path = f"results/timing/{hardware}/{baseline}.json"
+
     assert os.path.exists(
         baseline_file_path
     ), f"Baseline file does not exist at {baseline_file_path}"
@@ -157,7 +182,7 @@ def analyze_greedy_eval(run_name, hardware, baseline, level):
     # list of speedup thresholds p
     p_values = [0.0, 0.5, 0.8, 1.0, 1.5, 2.0]

-    results = [
+    fast_p_results = [
         [p, fastp(is_correct, baseline_speed, actual_speed, n, p)] for p in p_values
     ]
@@ -169,7 +194,7 @@ def analyze_greedy_eval(run_name, hardware, baseline, level):
     print("\nFast_p Results:")
     print(
         tabulate(
-            results, headers=["Speedup Threshold (p)", "Fast_p Score"], tablefmt="grid"
+            fast_p_results, headers=["Speedup Threshold (p)", "Fast_p Score"], tablefmt="grid"
         )
     )
@@ -193,10 +218,49 @@ def analyze_greedy_eval(run_name, hardware, baseline, level):
     avg_table = [[k, v] for k, v in averages.items()]
     print(tabulate(avg_table, headers=["Metric", "Value"], tablefmt="grid"))

+    # Build and return results dict
+    results = {
+        "run_name": run_name,
+        "level": level,
+        "hardware": hardware,
+        "total_count": total_count,
+        "total_eval": total_eval,
+        "compiled_count": compiled_count,
+        "correct_count": correct_count,
+        "compilation_rate": compiled_count / total_count if total_count > 0 else 0.0,
+        "correctness_rate": correct_count / total_count if total_count > 0 else 0.0,
+        "geo_mean_speedup": float(gmsr_correct),
+        "fast_p": {str(p): float(score) for p, score in fast_p_results},
+    }
+
+    # Include pass@k if available
+    if has_pass_at_k_results:
+        results["pass_at_k"] = {
+            "metadata": pass_at_k_results.get("metadata", {}),
+            "averages": pass_at_k_results.get("averages", {}),
+        }
+
+    return results
+

 @pydra.main(base=AnalysisConfig)
 def main(config: AnalysisConfig):
-    analyze_greedy_eval(config.run_name, config.hardware, config.baseline, config.level)
+    results = analyze_greedy_eval(
+        config.run_name,
+        config.hardware,
+        config.baseline,
+        config.level,
+        baseline_file=config.baseline_file,
+        eval_results_dir=config.eval_results_dir,
+    )
+
+    # Write JSON output if requested
+    if config.output_file:
+        with open(config.output_file, "w") as f:
+            json.dump(results, f, indent=2)
+        print(f"\nResults written to: {config.output_file}")
+
+    return results


 if __name__ == "__main__":
     main()
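For downstream consumers, here is a minimal sketch of reading the JSON written via `output_file=`. The keys follow the `results` dict assembled in `analyze_greedy_eval` above; the helper name `load_run_summary` and the file path are hypothetical, not part of this patch:

```python
import json


def load_run_summary(path: str) -> dict:
    """Load one run's metrics as written by benchmark_eval_analysis.py (hypothetical helper)."""
    with open(path) as f:
        results = json.load(f)
    # These keys are always present in the results dict built by the patch above.
    row = {
        "run": results["run_name"],
        "level": results["level"],
        "hardware": results["hardware"],
        "correctness_rate": results["correctness_rate"],
        "geo_mean_speedup": results["geo_mean_speedup"],
        # fast_p keys are stringified thresholds, e.g. "1.0" for str(1.0)
        "fast_1": results["fast_p"].get("1.0"),
    }
    # pass@k is only included when pass_at_k_results.json existed for the run.
    if "pass_at_k" in results:
        row["pass_at_k_averages"] = results["pass_at_k"]["averages"]
    return row


if __name__ == "__main__":
    print(load_run_summary("/path/to/output.json"))
```

The `fast_p` mapping uses `str(p)` keys because JSON object keys must be strings, so consumers should look thresholds up by their stringified form rather than by float.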