7 changes: 7 additions & 0 deletions .env.example
@@ -0,0 +1,7 @@
YOU_API_KEY=
OPENAI_API_KEY=
EXA_API_KEY=
PARALLEL_API_KEY=
PERPLEXITY_API_KEY=
SERP_API_KEY=
TAVILY_API_KEY=
3 changes: 2 additions & 1 deletion .gitignore
@@ -6,7 +6,8 @@ venv/
venv*

# Files
src/simpleqa/results/*
src/evals/results/*
tests/results/*

# Environment Variables
.env
22 changes: 11 additions & 11 deletions README.md
@@ -12,40 +12,40 @@ If you would like to reproduce the numbers or add new samplers, follow the instr
cd evals
```

2. Install the required dependencies:
2. Create a virtual environment with the tool of your choice, then install the required dependencies:
```bash
# create and activate virtual environment
pip install -r requirements.txt
pip install -e .
```

3. Set up environment variables as environment variables or an .env file:
3. Set up your `.env` file and insert the appropriate API keys:
```bash
export OPENAI_API_KEY=your_openai_api_key
export YOU_API_KEY=your_you_api_key
export TAVILY_API_KEY=your_you_api_key
export EXA_API_KEY=your_you_api_key
export SERP_API_KEY=your_you_api_key
cp .env.example .env
```

## Running a SimpleQA evaluation
To run a SimpleQA evaluation, simply run the `simpleqa_runner.py` file with your desired arguments.

View available arguments and samplers
```bash
python src/simpleqa/simpleqa_runner.py --help
python src/evals/eval_runner.py --help
```

Run the SimpleQA evaluation on the entire problem set for all available samplers with default settings
```bash
python src/simpleqa/simpleqa_runner.py
python src/evals/eval_runner.py
```

Run the SimpleQA evaluation on just You.com for 5 random problems
```bash
python src/simpleqa/simpleqa_runner.py --samplers you --limit 5
python src/evals/eval_runner.py --samplers you_unified_search --limit 5
```

## Interpreting Results
Results files will be placed in `simpleqa/results` after a successful run of SimpleQA. Files following the pattern
`raw_results_{sampler}.csv` are the raw results for each individual sampler. The file `simpleqa_results.csv` contains
aggregated results with various metrics useful for analysis.

Please note that latency numbers include the total time it takes to run the API request on your machine, so your network
speed will affect the reported numbers, which may fluctuate between runs.
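
For quick analysis, the aggregated CSV can be loaded with pandas, which is already a project dependency. This is a minimal sketch only: the column names are assumptions rather than a documented schema, and the path assumes results land under `src/evals/results/` as the updated `.gitignore` suggests; adjust both to match the files your own run produces.

```python
# Minimal sketch of inspecting the aggregated results; the path and the
# "sampler" / "accuracy" column names are assumptions, not guaranteed by the repo.
import pandas as pd

results = pd.read_csv("src/evals/results/simpleqa_results.csv")
print(results.head())

if {"sampler", "accuracy"}.issubset(results.columns):
    print(results.sort_values("accuracy", ascending=False)[["sampler", "accuracy"]])
```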
825 changes: 825 additions & 0 deletions data/frames_full_dataset.csv

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions pyproject.toml
@@ -15,3 +15,6 @@ classifiers = [

[project.urls]
Homepage = "https://github.com/youdotcom-oss/evals.git"

[tool.pytest.ini_options]
asyncio_default_fixture_loop_scope = "function"
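
The new `[tool.pytest.ini_options]` block sets the default event-loop scope for pytest-asyncio fixtures, which avoids the configuration warning recent pytest-asyncio releases emit when the option is unset. A minimal sketch of the kind of async test this applies to; the `fake_search` helper is illustrative only and not part of this repository.

```python
# Illustrative async test; fake_search stands in for a real sampler call.
import asyncio

import pytest


async def fake_search(query: str) -> str:
    await asyncio.sleep(0)  # pretend to do async I/O
    return f"results for {query}"


@pytest.mark.asyncio
async def test_fake_search_returns_text():
    assert "France" in await fake_search("capital of France")
```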
12 changes: 10 additions & 2 deletions requirements.txt
@@ -1,6 +1,14 @@
aiohttp==3.12.15
exa-py==2.4.0
openai==1.78.1
pydantic==2.11.4
pandas==2.2.3
tqdm==4.67.1
parallel-web==0.4.1
perplexityai==0.29.1
pydantic==2.11.4
pytest==8.3.4
pytest-asyncio==0.24.0
python-dotenv==1.0.1
retry==0.9.2
tavily-python==0.7.21
tqdm==4.67.1
youdotcom==2.2.0
3 changes: 3 additions & 0 deletions src/evals/__init__.py
@@ -0,0 +1,3 @@
import dotenv

dotenv.load_dotenv()
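
Because `load_dotenv()` runs at import time, any module that imports the `evals` package can read keys from `.env` with plain `os.getenv`, as the sampler config further down does. A minimal usage sketch:

```python
# Sketch only: importing the package runs dotenv.load_dotenv(), after which
# keys copied into .env from .env.example are visible to os.getenv.
import os

import evals  # noqa: F401  # import side effect loads .env

print("YOU_API_KEY set:", os.getenv("YOU_API_KEY") is not None)
```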
33 changes: 33 additions & 0 deletions src/evals/configs/datasets.py
@@ -0,0 +1,33 @@
from dataclasses import dataclass
from collections.abc import Callable

import pandas as pd

from evals.processing.evaluate_answer import AnswerGrader


evaluator = AnswerGrader()


@dataclass
class Dataset:
dataset_name: str
csv_path: str
grader: Callable
df: pd.DataFrame | None


DATASETS = [
Dataset(
dataset_name="frames",
csv_path="data/frames_full_dataset.csv",
grader=evaluator.evaluate_single_frames,
df=None,
),
Dataset(
dataset_name="simpleqa",
csv_path="data/simpleqa_full_dataset.csv",
grader=evaluator.evaluate_single_simpleqa,
df=None,
),
]
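
Each `Dataset` entry starts with `df=None`, which suggests the runner loads the CSV lazily before grading. How that loading happens is not shown in this diff; the sketch below is one plausible shape, not the actual `eval_runner` code.

```python
# Hypothetical lazy-loading step; the real eval_runner may differ.
import pandas as pd

from evals.configs.datasets import DATASETS

for dataset in DATASETS:
    if dataset.df is None:
        dataset.df = pd.read_csv(dataset.csv_path)
    print(f"{dataset.dataset_name}: {len(dataset.df)} rows, graded by {dataset.grader.__name__}")
```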
44 changes: 44 additions & 0 deletions src/evals/configs/samplers.py
@@ -0,0 +1,44 @@
import os

from evals.samplers.applied_samplers.exa_sampler import ExaSampler
from evals.samplers.applied_samplers.parallel_sampler import ParallelSearchSampler
from evals.samplers.applied_samplers.perplexity_sampler import PerplexitySearchSampler
from evals.samplers.applied_samplers.tavily_sampler import TavilySampler
from evals.samplers.applied_samplers.you_livecrawl_sampler import YouLivecrawlSampler
from evals.samplers.applied_samplers.you_search_sampler import YouSearchSnippetsSampler


SAMPLERS = [
YouLivecrawlSampler(
sampler_name="you_search_livecrawl",
api_key=os.getenv("YOU_API_KEY"),
),
YouSearchSnippetsSampler(
sampler_name="you_search_snippets",
api_key=os.getenv("YOU_API_KEY"),
),
ExaSampler(
sampler_name="exa_search_with_contents",
api_key=os.getenv("EXA_API_KEY"),
text=True,
),
ParallelSearchSampler(
sampler_name="parallel_fast",
api_key=os.getenv("PARALLEL_API_KEY"),
mode="fast",
),
PerplexitySearchSampler(
sampler_name="perplexity_search",
api_key=os.getenv("PERPLEXITY_API_KEY"),
),
TavilySampler(
sampler_name="tavily_basic",
api_key=os.getenv("TAVILY_API_KEY"),
search_depth="basic",
),
TavilySampler(
sampler_name="tavily_advanced",
api_key=os.getenv("TAVILY_API_KEY"),
search_depth="advanced",
),
]
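
The README's `--samplers` flag presumably selects entries from this list by their `sampler_name`. The filtering below is a hypothetical sketch (it assumes the sampler base class stores `sampler_name` as an attribute); the actual CLI handling lives in `eval_runner.py`.

```python
# Hypothetical selection by name; assumes samplers expose a sampler_name attribute.
from evals.configs.samplers import SAMPLERS

requested = {"you_search_snippets", "tavily_basic"}
selected = [s for s in SAMPLERS if s.sampler_name in requested]
print([s.sampler_name for s in selected])
```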
148 changes: 49 additions & 99 deletions src/simpleqa/processing/evaluate_answer.py → src/evals/constants.py
@@ -1,21 +1,18 @@
"""
This class is used to evaluate the correctness of the response. No changes have been made to the grading prompt.

To view or edit the model used for grading, see evals.simpleqa.constants
"""
# We used a weaker model for synthesis and a stronger model for grading to ensure fairness.
SYNTHESIS_MODEL = "gpt-4o-mini"
GRADER_MODEL = "gpt-4.1"

import asyncio
import logging
import os
import re
from typing import Dict, Any
# Maximum tokens available for search results (leaving room for prompt and response)
MAX_SEARCH_RESULT_TOKENS = 127750

import httpx

from simpleqa import constants
SYNTHESIS_PROMPT = """
You are an AI assistant that answers questions using search results.
Read the provided search snippets carefully and answer based only on information found in the snippets.
Keep your response clear and concise.
"""

# Prompt is from OpenAI's simple-evals repository https://github.com/openai/simple-evals/blob/ee3b0318d8d1d9d72755a4120879be65f7c07e9e/simpleqa_eval.py#L13
ANSWER_GRADER_TEMPLATE = """
SIMPLEQA_ANSWER_GRADER_TEMPLATE = """
Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"].
First, I will give examples of each grade, and then you will grade a new example.

@@ -96,89 +93,42 @@
Just return the letters "A", "B", or "C", with no text around it.
""".strip()

FRAMES_ANSWER_GRADER_TEMPLATE = """
===Task===


I need your help in evaluating an answer provided by an LLM against a ground truth
answer. Your task is to determine if the ground truth answer is present in the LLM’s response.
Please analyze the provided data and make a decision.


===Instructions===


1. Carefully compare the "Predicted Answer" with the "Ground Truth Answer".
2. Consider the substance of the answers – look for equivalent information or correct answers. Do
not focus on exact wording unless the exact wording is crucial to the meaning.
3. Your final decision should be based on whether the meaning and the vital facts of the "Ground
Truth Answer" are present in the "Predicted Answer:"


class AnswerGrader:
def __init__(self, model: str = constants.GRADER_MODEL, max_retries: int = 3):
self.logger = logging.getLogger(self.__class__.__name__)

self.model = model
self.max_retries = max_retries
self.api_key = os.getenv("OPENAI_API_KEY")
self.headers = {
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json",
}

async def call_openai_async(self, client: httpx.AsyncClient, prompt: str) -> str:
"""Make async call to OpenAI API"""
for trial in range(self.max_retries + 1):
try:
payload = {
"model": self.model,
"messages": [{"role": "user", "content": prompt}],
"temperature": 0.0,
"max_tokens": 1024,
}

response = await client.post(
"https://api.openai.com/v1/chat/completions",
headers=self.headers,
json=payload,
)

if response.status_code == 200:
result = response.json()
content = result["choices"][0]["message"]["content"]
if content is None:
raise ValueError("OpenAI API returned empty response")
return content
else:
raise Exception(
f"API error {response.status_code}: {response.text}"
)

except Exception as e:
if trial >= self.max_retries:
self.logger.error(f"Failed after {self.max_retries} retries: {e}")
raise

backoff = 2**trial
self.logger.warning(f"Evaluation retry {trial + 1} in {backoff}s: {e}")
await asyncio.sleep(backoff)

raise ValueError("Failed to call OpenAI API")

async def evaluate_single(
self, question: str, target: str, predicted_answer: str
) -> Dict[str, Any]:
"""Evaluate a single response asynchronously"""
grader_prompt = ANSWER_GRADER_TEMPLATE.format(
question=question,
target=target,
predicted_answer=predicted_answer,
)

async with httpx.AsyncClient(timeout=30.0) as client:
grading_response = await self.call_openai_async(client, grader_prompt)

# Parse the grade
match = re.search(r"(A|B|C)", grading_response)
grade_letter = match.group(0) if match else "C"

# Convert to readable format
score_name = {"A": "is_correct", "B": "is_incorrect", "C": "is_not_attempted"}[
grade_letter
]

is_correct = grade_letter == "A"
is_incorrect = grade_letter == "B"
is_not_attempted = grade_letter == "C"

return {
"grade": grade_letter,
"score_name": score_name,
"is_correct": is_correct,
"is_incorrect": is_incorrect,
"is_not_attempted": is_not_attempted,
"score": is_correct,
}
===Input Data===


- Question: {question}


- Predicted Answer: {predicted_answer}


- Ground Truth Answer: {target}


===Output Format===


Provide your final evaluation in the following format:
"Explanation:" (How you made the decision?)
"Decision:" ("TRUE" or "FALSE")
Please proceed with the evaluation.
"""