diff --git a/scripts/retrieve_ci_failures.py b/scripts/retrieve_ci_failures.py index f03fd18207..c7086af658 100644 --- a/scripts/retrieve_ci_failures.py +++ b/scripts/retrieve_ci_failures.py @@ -282,6 +282,48 @@ def diff_failure_vs_fix(repo, failure_commits, fix_commits): return None +def process_diff(bug_id, obj, upload, repo_path): + diff_path = os.path.join("data", "ci_failures_diffs", f"{bug_id}.diff") + diff_zst_path = f"{diff_path}.zst" + + if os.path.exists(diff_path) or os.path.exists(diff_zst_path): + return (0, 0) + + if upload and utils.exists_s3(diff_zst_path): + return (0, 0) + + try: + diff = diff_failure_vs_fix( + repo_path, + [ + utils.hg2git(commit["node"]) + for commit in obj["commits"] + if commit["backedoutby"] + ], + [ + utils.hg2git(commit["node"]) + for commit in obj["commits"] + if not commit["backedoutby"] and not commit["backsout"] + ], + ) + except requests.exceptions.HTTPError as e: + logger.error("Failure mapping hg commit hash to git commit hash %s", e) + return (1, 0) + + if diff is not None and len(diff) > 0: + with open(diff_path, "wb") as f: + f.write(diff) + + utils.zstd_compress(diff_path) + + os.remove(diff_path) + + if upload: + utils.upload_s3([diff_zst_path]) + else: + return (0, 1) + + def generate_diffs(repo_url, repo_path, fixed_by_commit_pushes, upload): if not os.path.exists(repo_path): for attempt in tenacity.Retrying( @@ -298,48 +340,18 @@ def generate_diffs(repo_url, repo_path, fixed_by_commit_pushes, upload): diff_errors = 0 mapping_errors = 0 - for bug_id, obj in tqdm( - fixed_by_commit_pushes.items(), total=len(fixed_by_commit_pushes) - ): - diff_path = os.path.join("data", "ci_failures_diffs", f"{bug_id}.diff") - diff_zst_path = f"{diff_path}.zst" - if os.path.exists(diff_path) or os.path.exists(diff_zst_path): - continue + diffs = [(bug_id, obj) for bug_id, obj in fixed_by_commit_pushes.items()] - if upload and utils.exists_s3(diff_zst_path): - continue - - try: - diff = diff_failure_vs_fix( - repo_path, - [ - utils.hg2git(commit["node"]) - for commit in obj["commits"] - if commit["backedoutby"] - ], - [ - utils.hg2git(commit["node"]) - for commit in obj["commits"] - if not commit["backedoutby"] and not commit["backsout"] - ], - ) - except requests.exceptions.HTTPError as e: - logger.error("Failure mapping hg commit hash to git commit hash %s", e) - mapping_errors += 1 - continue - - if diff is not None and len(diff) > 0: - with open(diff_path, "wb") as f: - f.write(diff) - - utils.zstd_compress(diff_path) - - os.remove(diff_path) + with ThreadPoolExecutor() as executor: + futures = [ + executor.submit(process_diff, bug_id, obj, upload, repo_path) + for bug_id, obj in diffs + ] - if upload: - utils.upload_s3([diff_zst_path]) - else: - diff_errors += 1 + for future in tqdm(as_completed(futures), totla=len(futures)): + mapping_error, diff_error = future.result() + mapping_errors += mapping_error + diff_errors += diff_error logger.info(f"Failed mapping {mapping_errors} hashes") logger.info(f"Failed generating {diff_errors} diffs")