diff --git a/openjury/generate_and_evaluate.py b/openjury/generate_and_evaluate.py
index 420c6f8..47a9c84 100644
--- a/openjury/generate_and_evaluate.py
+++ b/openjury/generate_and_evaluate.py
@@ -29,9 +29,12 @@ class CliArgs:
     n_instructions: int | None = None
     provide_explanation: bool = False
+    swap_mode: str = "fixed"
     ignore_cache: bool = False
     use_tqdm: bool = False
+    result_folder: str = "results"
+
     @classmethod
     def parse_args(cls):
         parser = argparse.ArgumentParser(
@@ -69,6 +72,15 @@ def parse_args(cls):
             help="If specified, judge will provide explanation before making a judgement. Does not necessarily improve"
             "the accuracy of the judge but enables some result interpretation.",
         )
+        parser.add_argument(
+            "--swap_mode",
+            type=str,
+            choices=["fixed", "both"],
+            default="fixed",
+            help="Model comparison order. 'fixed': always evaluate with model order A-B. 'both': evaluate each "
+            "instruction twice, once as A-B and once as B-A, and average the two scores to correct for judge "
+            "position bias. Defaults to 'fixed'.",
+        )
         parser.add_argument(
             "--ignore_cache",
             action="store_true",
@@ -79,7 +91,14 @@ def parse_args(cls):
             action="store_true",
             help="If specified, use tqdm, does not work with all model providers, vLLM in particular.",
         )
-
+        parser.add_argument(
+            "--result_folder",
+            type=str,
+            required=False,
+            default="results",
+            help="The folder to save results to. Evaluation results are written to "
+            "`[result_folder]/[evaluation_name]`. Defaults to `results`.",
+        )
         args = parser.parse_args()

         return cls(
@@ -89,6 +108,8 @@
             judge_model=args.judge_model,
             n_instructions=args.n_instructions,
             provide_explanation=args.provide_explanation,
+            swap_mode=args.swap_mode,
+            result_folder=args.result_folder,
             ignore_cache=args.ignore_cache,
             use_tqdm=args.use_tqdm,
         )
@@ -216,11 +237,27 @@ def main(args: CliArgs):
         use_tqdm=args.use_tqdm,
     )

-    name = f"{args.dataset}-{args.model_A}-{args.model_B}-{args.judge_model}".replace(
+    if args.swap_mode == "both":
+        print("Position-bias correction enabled: evaluating each battle in both model orders.")
+        print(f"Re-evaluating completions with model positions swapped, judged by {args.judge_model}.")
+        annotations_reversed = annotate_battles(
+            judge_chat_model=judge_chat_model,
+            instructions=instructions.head(n_instructions).tolist(),
+            completions_A=completions_B.head(n_instructions).tolist(),
+            completions_B=completions_A.head(n_instructions).tolist(),
+            provide_explanation=args.provide_explanation,
+            system_prompt=system_prompt,
+            max_len=200,
+            use_tqdm=args.use_tqdm,
+        )
+
+    name = f"{args.dataset}-{args.model_A}-{args.model_B}-{args.judge_model}"
+    name += f"-{args.swap_mode}"
+    name = name.replace(
         "/", "_"
     )
-    res_folder = Path("results") / name
+    res_folder = Path(args.result_folder) / name
     res_folder.mkdir(parents=True, exist_ok=True)

     # save argument for results analysis
@@ -233,6 +270,15 @@ def main(args: CliArgs):
     df["model_A"] = args.model_A
     df["model_B"] = args.model_B
     df["judge"] = args.judge_model
+
+    if args.swap_mode == "both":
+        df_reversed = pd.DataFrame(annotations_reversed)
+        df_reversed["instruction_index"] = instructions.head(n_instructions).index.tolist()
+        df_reversed["model_A"] = args.model_B
+        df_reversed["model_B"] = args.model_A
+        df_reversed["judge"] = args.judge_model
+        df = pd.concat([df, df_reversed])
+
     df.to_csv(res_folder / f"{name}-annotations.csv", index=False)

     # compute preferences between A and B
@@ -244,6 +290,15 @@
         ]
     )

+    if args.swap_mode == "both":
+        prefs_reversed = pd.Series(
+            [
+                score_parser.parse_model_raw(annotation.judge_completion)
+                for annotation in annotations_reversed
+            ]
+        )
+        prefs = (prefs + (1 - prefs_reversed)) / 2.0
+
     # compute and report statistics
     num_wins = sum(prefs < 0.5)
     num_losses = sum(prefs > 0.5)
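
For intuition on the 'both' swap mode: assuming score_parser.parse_model_raw returns a score in [0, 1] where 0.5 is a tie and swapping the two completions turns a true score s into 1 - s, averaging the A-B score with the flipped B-A score cancels an additive position bias. A minimal sketch with made-up numbers (the bias value and scores below are illustrative, not real outputs):

    import pandas as pd

    # Hypothetical judge with a +0.05 bias toward the first position;
    # the true preferences are [0.15, 0.65, 0.35].
    prefs = pd.Series([0.20, 0.70, 0.40])           # battles scored in A-B order
    prefs_reversed = pd.Series([0.90, 0.40, 0.70])  # same battles scored in B-A order
    combined = (prefs + (1 - prefs_reversed)) / 2.0
    print(combined.round(2).tolist())  # [0.15, 0.65, 0.35]; the position bias cancels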