From 75b946566119cda2977182370da5e31aa3def9dd Mon Sep 17 00:00:00 2001
From: Arav Agarwal
Date: Fri, 20 Feb 2026 12:49:21 -0500
Subject: [PATCH] Update summarizer to use id_json_path for public ids

---
 .../result_summarizer/result_summarizer.py   | 63 ++++++++++++++++++-
 1 file changed, 62 insertions(+), 1 deletion(-)

diff --git a/mlperf_logging/result_summarizer/result_summarizer.py b/mlperf_logging/result_summarizer/result_summarizer.py
index 1cd3fb7d..4cdcec8c 100644
--- a/mlperf_logging/result_summarizer/result_summarizer.py
+++ b/mlperf_logging/result_summarizer/result_summarizer.py
@@ -13,6 +13,7 @@
 import itertools
 import pandas as pd
 import yaml
+import hashlib
 
 from ..compliance_checker import mlp_compliance
 from ..compliance_checker.mlp_compliance import usage_choices, rule_choices
@@ -691,6 +692,61 @@ def _fill_empty_benchmark_scores(
             benchmark_scores[benchmark] = None
 
 
+def _add_id_to_summary(summary, id_json_path):
+    """Add public ids to the summary based on the json at id_json_path.
+
+    The json file holds a list of sha256 hashes; a system's public id
+    is its 1-based position in that list. If id_json_path is specified
+    but the file does not exist, it is created from scratch.
+
+    Args:
+        summary (pd.DataFrame): Summary dataframe.
+        id_json_path (str): Path to the json file of known hashes.
+
+    Returns:
+        pd.DataFrame: The summary with 'hash' and 'id' columns added.
+    """
+    id_json = []
+    if os.path.exists(id_json_path):
+        with open(id_json_path, 'r') as f:
+            id_json = json.load(f)
+
+    def get_hash(row):
+        # Hash the columns that identify a unique submission system.
+        columns_for_hashing = [
+            'division',
+            'submitter',
+            'system',
+            'number_of_nodes',
+            'host_processor_model_name',
+            'host_processors_count',
+            'accelerator_model_name',
+            'accelerators_count',
+            'framework',
+        ]
+        to_hash = ''.join(str(row[c]) for c in columns_for_hashing)
+        return hashlib.sha256(to_hash.encode('utf-8')).hexdigest()
+
+    summary['hash'] = summary.apply(get_hash, axis=1)
+
+    # Map each hash to its 1-based position in id_json, appending
+    # hashes that have not been seen before.
+    id_list = []
+    for elem in summary['hash']:
+        if elem in id_json:
+            id_list.append(id_json.index(elem) + 1)
+        else:
+            id_json.append(elem)
+            id_list.append(len(id_json))
+
+    summary['id'] = id_list
+
+    with open(id_json_path, 'w') as f:
+        json.dump(id_json, f, indent=4)
+
+    return summary
+
+
 def summarize_results(folder, usage, ruleset, csv_file=None, **kwargs):
     """Summarizes a set of results.
 
@@ -857,6 +913,9 @@ def get_parser():
                         type=str,
                         choices=rule_choices(),
                         help='the ruleset such as 0.6.0, 0.7.0, or 1.0.0')
+    parser.add_argument('--id_json_path',
+                        type=str,
+                        help='Path to the id_json file used to map runs to public ids. If specified but the file does not exist, it is created from scratch.')
     parser.add_argument('--werror',
                         action='store_true',
                         help='Treat warnings as errors')
@@ -1042,7 +1101,9 @@ def _print_and_write(summaries, weak_scaling=False, mode='w', power = False):
 
     # Sort rows by their values
     summaries = summaries.sort_values(by=cols)
-    print(summaries)
+    if args.id_json_path is not None:
+        summaries = _add_id_to_summary(summaries, args.id_json_path)
+
     if args.csv is not None:
         csv = args.csv
         assert csv.endswith(".csv")
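
For reviewers, a standalone sketch of the id scheme this patch implements; it assumes only pandas is installed, and the names assign_public_ids and ids.json are illustrative, not part of the patch:

    import hashlib
    import json
    import os

    import pandas as pd

    # Columns mirroring those hashed in _add_id_to_summary.
    COLUMNS_FOR_HASHING = [
        'division', 'submitter', 'system', 'number_of_nodes',
        'host_processor_model_name', 'host_processors_count',
        'accelerator_model_name', 'accelerators_count', 'framework',
    ]

    def assign_public_ids(summary, id_json_path):
        # Load the known hashes; a missing file means no ids assigned yet.
        known = []
        if os.path.exists(id_json_path):
            with open(id_json_path) as f:
                known = json.load(f)

        def row_hash(row):
            payload = ''.join(str(row[c]) for c in COLUMNS_FOR_HASHING)
            return hashlib.sha256(payload.encode('utf-8')).hexdigest()

        summary['hash'] = summary.apply(row_hash, axis=1)

        # A system's public id is the 1-based position of its hash in the
        # list, so ids stay stable across reruns as long as the json file
        # is preserved.
        ids = []
        for h in summary['hash']:
            if h not in known:
                known.append(h)
            ids.append(known.index(h) + 1)
        summary['id'] = ids

        with open(id_json_path, 'w') as f:
            json.dump(known, f, indent=4)
        return summary

    if __name__ == '__main__':
        df = pd.DataFrame([{c: 'x' for c in COLUMNS_FOR_HASHING}])
        print(assign_public_ids(df, 'ids.json')[['hash', 'id']])

On the command line the new flag rides along with the existing positional arguments, e.g. (the folder, usage, and ruleset values below are illustrative):

    python3 -m mlperf_logging.result_summarizer my_submission training 1.0.0 --id_json_path ids.json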