openjury/generate_and_evaluate.py (60 changes: 57 additions & 3 deletions)
@@ -29,9 +29,12 @@ class CliArgs:

n_instructions: int | None = None
provide_explanation: bool = False
swap_mode: bool = False
ignore_cache: bool = False
use_tqdm: bool = False

result_folder: str = "results"

@classmethod
def parse_args(cls):
parser = argparse.ArgumentParser(
@@ -69,6 +72,15 @@ def parse_args(cls):
help="If specified, judge will provide explanation before making a judgement. Does not necessarily improve"
"the accuracy of the judge but enables some result interpretation.",
)
parser.add_argument(
"--swap_mode",
type=str,
choices=["fixed", "both"],
default="fixed",
help="Model comparison order mode. 'fixed': always use model order A-B. 'both': correct for model order "
"bias by evaluating each instruction twice, once as A-B and once as B-A, and average. This helps account "
"for judge position bias. Default is 'fixed'.",
)
parser.add_argument(
"--ignore_cache",
action="store_true",
@@ -79,7 +91,14 @@
action="store_true",
help="If specified, use tqdm, does not work with all model providers, vLLM in particular.",
)

parser.add_argument(
"--result_folder",
type=str,
required=False,
default="results",
help="The folder to save the results. Defaults to `results`. Evaluation results will be saved in"
" `[result_folder]/[evaluation_name]`.",
)
args = parser.parse_args()

return cls(
@@ -89,6 +108,7 @@
judge_model=args.judge_model,
n_instructions=args.n_instructions,
provide_explanation=args.provide_explanation,
swap_mode=args.swap_mode,
Collaborator:
At this point args.swap_mode is a string, but we use it as a boolean; shouldn't we rather test args.swap_mode == "fixed"?

Contributor Author:
Yes, you are right. This got lost in the change from correct_order_bias (bool) to swap_mode (str).
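
A minimal sketch of the truthiness issue discussed in this thread; `swap_mode` below stands in for the parsed `args.swap_mode`, and the values are hypothetical:

swap_mode = "fixed"  # hypothetical parsed value of --swap_mode

# Any non-empty string is truthy, so `if swap_mode:` triggers the reversed
# evaluation even in the default "fixed" mode:
if swap_mode:
    print("reversed evaluation runs here, even for 'fixed'")

# The explicit test suggested above only swaps in "both" mode:
if swap_mode == "both":
    print("reversed evaluation runs only for 'both'")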

ignore_cache=args.ignore_cache,
use_tqdm=args.use_tqdm,
)
@@ -216,11 +236,27 @@ def main(args: CliArgs):
use_tqdm=args.use_tqdm,
)

name = f"{args.dataset}-{args.model_A}-{args.model_B}-{args.judge_model}".replace(
if args.swap_mode:
print("Correction for judge bias towards a certain model position is set.")
print(f"Evaluating completions with models reversed with judge {args.judge_model}.")
annotations_reversed = annotate_battles(
judge_chat_model=judge_chat_model,
instructions=instructions.head(n_instructions).tolist(),
completions_A=completions_B.head(n_instructions).tolist(),
completions_B=completions_A.head(n_instructions).tolist(),
provide_explanation=args.provide_explanation,
system_prompt=system_prompt,
max_len=200,
use_tqdm=args.use_tqdm,
)

name = f"{args.dataset}-{args.model_A}-{args.model_B}-{args.judge_model}"
name += f"-{args.swap_mode}"
name = name.replace(
"/", "_"
)

res_folder = Path("results") / name
res_folder = Path(args.result_folder) / name
res_folder.mkdir(parents=True, exist_ok=True)

# save argument for results analysis
@@ -233,6 +269,15 @@
df["model_A"] = args.model_A
df["model_B"] = args.model_B
df["judge"] = args.judge_model

if args.swap_mode:
df_reversed = pd.DataFrame(annotations_reversed)
df_reversed["instruction_index"] = instructions.head(n_instructions).index.tolist()
df_reversed["model_A"] = args.model_B
df_reversed["model_B"] = args.model_A
df_reversed["judge"] = args.judge_model
df = pd.concat([df, df_reversed])

df.to_csv(res_folder / f"{name}-annotations.csv", index=False)

# compute preferences between A and B
@@ -244,6 +289,15 @@
]
)

if args.swap_mode:
prefs_reversed = pd.Series(
[
score_parser.parse_model_raw(annotation.judge_completion)
for annotation in annotations_reversed
]
)
prefs = (prefs + (1 - prefs_reversed)) / 2.0
Collaborator:
Does it work in case of NaNs? Seems like returning 2n battles would be cleaner?

Contributor Author:
Yes, it does:

>>> pd.Series([1, 2, 3, None]) + pd.Series([1, 2, None, 4])
0    2.0
1    4.0
2    NaN
3    NaN
dtype: float64

Nonetheless, I think your suggestion is generally cleaner. How about the following?

prefs = pd.concat([prefs, (1 - prefs_reversed)]).reset_index(drop=True)

Collaborator:
That sounds good to me.
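
A small self-contained comparison of the two options from this thread, with hypothetical preference values (NaN standing in for a judgement that could not be parsed):

import pandas as pd

prefs = pd.Series([1.0, 0.0, None])           # hypothetical A-B scores
prefs_reversed = pd.Series([0.0, None, 1.0])  # hypothetical B-A scores

# Averaging: a NaN in either order drops the whole battle.
averaged = (prefs + (1 - prefs_reversed)) / 2.0  # 1.0, NaN, NaN

# Concatenating, as agreed above: keeps 2n battles and loses
# only the individual unparsable judgements.
combined = pd.concat([prefs, 1 - prefs_reversed]).reset_index(drop=True)
# 1.0, 0.0, NaN, 1.0, NaN, 0.0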


# compute and report statistics
num_wins = sum(prefs < 0.5)
num_losses = sum(prefs > 0.5)
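
The diff is truncated below this point, so as a sketch only: on a toy series these counts behave as follows, assuming a score of exactly 0.5 denotes a tie (an assumption, since the tie handling is not visible here):

import pandas as pd

prefs = pd.Series([0.0, 1.0, 0.5, 0.0, None])  # hypothetical parsed preferences

num_wins = sum(prefs < 0.5)    # 2; NaN compares as False and is excluded
num_losses = sum(prefs > 0.5)  # 1
num_ties = sum(prefs == 0.5)   # 1 (assumed tie convention)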