7 changes: 7 additions & 0 deletions .env.example
@@ -0,0 +1,7 @@
YOU_API_KEY=
OPENAI_API_KEY=
EXA_API_KEY=
PARALLEL_API_KEY=
PERPLEXITY_API_KEY=
SERP_API_KEY=
TAVILY_API_KEY=
3 changes: 2 additions & 1 deletion .gitignore
@@ -6,7 +6,8 @@ venv/
venv*

# Files
src/simpleqa/results/*
src/evals/results/*
tests/results/*

# Environment Variables
.env
22 changes: 11 additions & 11 deletions README.md
@@ -12,40 +12,40 @@ If you would like to reproduce the numbers or add new samplers, follow the instr
cd evals
```

2. Install the required dependencies:
2. Create a virtual environment with the tool of your choice, then install the required dependencies:
```bash
# create and activate virtual environment
pip install -r requirements.txt
pip install -e .
```

3. Set up environment variables as environment variables or an .env file:
3. Set up your `.env` file and insert the appropriate API keys:
```bash
export OPENAI_API_KEY=your_openai_api_key
export YOU_API_KEY=your_you_api_key
export TAVILY_API_KEY=your_you_api_key
export EXA_API_KEY=your_you_api_key
export SERP_API_KEY=your_you_api_key
cp .env.example .env
```

## Running a SimpleQA evaluation
To run a SimpleQA evaluation, simply run the `simpleqa_runner.py` file with your desired arguments.

View available arguments and samplers
```bash
python src/simpleqa/simpleqa_runner.py --help
python src/evals/eval_runner.py --help
```

Run the SimpleQA evaluation on the entire problem set for all available samplers with default settings
```bash
python src/simpleqa/simpleqa_runner.py
python src/evals/eval_runner.py
```

Run the SimpleQA evaluation on just You.com for 5 random problems
```bash
python src/simpleqa/simpleqa_runner.py --samplers you --limit 5
python src/evals/eval_runner.py --samplers you_unified_search --limit 5
```

## Interpreting Results
Results files will be placed in `simpleqa/results` after a successful run of SimpleQA. Files following the pattern
`raw_results_{sampler}.csv` are the raw results for each individual sampler. The file `simpleqa_results.csv` contains
aggregated results with various metrics useful for analysis.

Please note that latency numbers include the total time it takes to run the API request on your machine, so your network
speed will affect the reported numbers, which may fluctuate between runs.
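
For quick analysis, the aggregated CSV can be loaded with pandas, which is already a project dependency. This is a minimal sketch only: the column names are assumptions rather than a documented schema, and the path assumes results land under `src/evals/results/` as the updated `.gitignore` suggests; adjust both to match the files your own run produces.

```python
# Minimal sketch of inspecting the aggregated results; the path and the
# "sampler" / "accuracy" column names are assumptions, not guaranteed by the repo.
import pandas as pd

results = pd.read_csv("src/evals/results/simpleqa_results.csv")
print(results.head())

if {"sampler", "accuracy"}.issubset(results.columns):
    print(results.sort_values("accuracy", ascending=False)[["sampler", "accuracy"]])
```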
825 changes: 825 additions & 0 deletions data/frames_full_dataset.csv

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions pyproject.toml
@@ -15,3 +15,6 @@ classifiers = [

[project.urls]
Homepage = "https://github.com/youdotcom-oss/evals.git"

[tool.pytest.ini_options]
asyncio_default_fixture_loop_scope = "function"
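
The new `[tool.pytest.ini_options]` block sets the default event-loop scope for pytest-asyncio fixtures, which avoids the configuration warning recent pytest-asyncio releases emit when the option is unset. A minimal sketch of the kind of async test this applies to; the `fake_search` helper is illustrative only and not part of this repository.

```python
# Illustrative async test; fake_search stands in for a real sampler call.
import asyncio

import pytest


async def fake_search(query: str) -> str:
    await asyncio.sleep(0)  # pretend to do async I/O
    return f"results for {query}"


@pytest.mark.asyncio
async def test_fake_search_returns_text():
    assert "France" in await fake_search("capital of France")
```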
12 changes: 10 additions & 2 deletions requirements.txt
@@ -1,6 +1,14 @@
aiohttp==3.12.15
exa-py==2.4.0
openai==1.78.1
pydantic==2.11.4
pandas==2.2.3
tqdm==4.67.1
parallel-web==0.4.1
perplexityai==0.29.1
pydantic==2.11.4
pytest==8.3.4
pytest-asyncio==0.24.0
python-dotenv==1.0.1
retry==0.9.2
tavily-python==0.7.21
tqdm==4.67.1
youdotcom==2.2.0
3 changes: 3 additions & 0 deletions src/evals/__init__.py
@@ -0,0 +1,3 @@
import dotenv

dotenv.load_dotenv()
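
Because `load_dotenv()` runs at import time, any module that imports the `evals` package can read keys from `.env` with plain `os.getenv`, as the sampler config further down does. A minimal usage sketch:

```python
# Sketch only: importing the package runs dotenv.load_dotenv(), after which
# keys copied into .env from .env.example are visible to os.getenv.
import os

import evals  # noqa: F401  # import side effect loads .env

print("YOU_API_KEY set:", os.getenv("YOU_API_KEY") is not None)
```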
33 changes: 33 additions & 0 deletions src/evals/configs/datasets.py
@@ -0,0 +1,33 @@
from dataclasses import dataclass
from collections.abc import Callable

import pandas as pd

from evals.processing.evaluate_answer import AnswerGrader


evaluator = AnswerGrader()


@dataclass
class Dataset:
dataset_name: str
csv_path: str
grader: Callable
df: pd.DataFrame | None


DATASETS = [
Dataset(
dataset_name="frames",
csv_path="data/frames_full_dataset.csv",
grader=evaluator.evaluate_single_frames,
df=None,
),
Dataset(
dataset_name="simpleqa",
csv_path="data/simpleqa_full_dataset.csv",
grader=evaluator.evaluate_single_simpleqa,
df=None,
),
]
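
Each `Dataset` entry starts with `df=None`, which suggests the runner loads the CSV lazily before grading. How that loading happens is not shown in this diff; the sketch below is one plausible shape, not the actual `eval_runner` code.

```python
# Hypothetical lazy-loading step; the real eval_runner may differ.
import pandas as pd

from evals.configs.datasets import DATASETS

for dataset in DATASETS:
    if dataset.df is None:
        dataset.df = pd.read_csv(dataset.csv_path)
    print(f"{dataset.dataset_name}: {len(dataset.df)} rows, graded by {dataset.grader.__name__}")
```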
44 changes: 44 additions & 0 deletions src/evals/configs/samplers.py
@@ -0,0 +1,44 @@
import os

from evals.samplers.applied_samplers.exa_sampler import ExaSampler
from evals.samplers.applied_samplers.parallel_sampler import ParallelSearchSampler
from evals.samplers.applied_samplers.perplexity_sampler import PerplexitySearchSampler
from evals.samplers.applied_samplers.tavily_sampler import TavilySampler
from evals.samplers.applied_samplers.you_livecrawl_sampler import YouLivecrawlSampler
from evals.samplers.applied_samplers.you_search_sampler import YouSearchSnippetsSampler


SAMPLERS = [
YouLivecrawlSampler(
sampler_name="you_search_livecrawl",
api_key=os.getenv("YOU_API_KEY"),
),
YouSearchSnippetsSampler(
sampler_name="you_search_snippets",
api_key=os.getenv("YOU_API_KEY"),
),
ExaSampler(
sampler_name="exa_search_with_contents",
api_key=os.getenv("EXA_API_KEY"),
text=True,
),
ParallelSearchSampler(
sampler_name="parallel_fast",
api_key=os.getenv("PARALLEL_API_KEY"),
mode="fast",
),
PerplexitySearchSampler(
sampler_name="perplexity_search",
api_key=os.getenv("PERPLEXITY_API_KEY"),
),
TavilySampler(
sampler_name="tavily_basic",
api_key=os.getenv("TAVILY_API_KEY"),
search_depth="basic",
),
TavilySampler(
sampler_name="tavily_advanced",
api_key=os.getenv("TAVILY_API_KEY"),
search_depth="advanced",
),
]
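
The README's `--samplers` flag presumably selects entries from this list by their `sampler_name`. The filtering below is a hypothetical sketch (it assumes the sampler base class stores `sampler_name` as an attribute); the actual CLI handling lives in `eval_runner.py`.

```python
# Hypothetical selection by name; assumes samplers expose a sampler_name attribute.
from evals.configs.samplers import SAMPLERS

requested = {"you_search_snippets", "tavily_basic"}
selected = [s for s in SAMPLERS if s.sampler_name in requested]
print([s.sampler_name for s in selected])
```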
148 changes: 49 additions & 99 deletions src/simpleqa/processing/evaluate_answer.py → src/evals/constants.py
@@ -1,21 +1,18 @@
"""
This class is used to evaluate the correctness of the response. No changes have been made to the grading prompt.

To view or edit the model used for grading, see evals.simpleqa.constants
"""
# We used a weaker model for synthesis and a stronger model for grading to ensure fairness.
SYNTHESIS_MODEL = "gpt-4o-mini"
GRADER_MODEL = "gpt-4.1"

import asyncio
import logging
import os
import re
from typing import Dict, Any
# Maximum tokens available for search results (leaving room for prompt and response)
MAX_SEARCH_RESULT_TOKENS = 127750

import httpx

from simpleqa import constants
SYNTHESIS_PROMPT = """
You are an AI assistant that answers questions using search results.
Read the provided search snippets carefully and answer based only on information found in the snippets.
Keep your response clear and concise.
"""

# Prompt is from OpenAI's simple-evals repository https://github.com/openai/simple-evals/blob/ee3b0318d8d1d9d72755a4120879be65f7c07e9e/simpleqa_eval.py#L13
ANSWER_GRADER_TEMPLATE = """
SIMPLEQA_ANSWER_GRADER_TEMPLATE = """
Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"].
First, I will give examples of each grade, and then you will grade a new example.

@@ -96,89 +93,42 @@
Just return the letters "A", "B", or "C", with no text around it.
""".strip()

FRAMES_ANSWER_GRADER_TEMPLATE = """
===Task===


I need your help in evaluating an answer provided by an LLM against a ground truth
answer. Your task is to determine if the ground truth answer is present in the LLM’s response.
Please analyze the provided data and make a decision.


===Instructions===


1. Carefully compare the "Predicted Answer" with the "Ground Truth Answer".
2. Consider the substance of the answers – look for equivalent information or correct answers. Do
not focus on exact wording unless the exact wording is crucial to the meaning.
3. Your final decision should be based on whether the meaning and the vital facts of the "Ground
Truth Answer" are present in the "Predicted Answer:"


class AnswerGrader:
def __init__(self, model: str = constants.GRADER_MODEL, max_retries: int = 3):
self.logger = logging.getLogger(self.__class__.__name__)

self.model = model
self.max_retries = max_retries
self.api_key = os.getenv("OPENAI_API_KEY")
self.headers = {
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json",
}

async def call_openai_async(self, client: httpx.AsyncClient, prompt: str) -> str:
"""Make async call to OpenAI API"""
for trial in range(self.max_retries + 1):
try:
payload = {
"model": self.model,
"messages": [{"role": "user", "content": prompt}],
"temperature": 0.0,
"max_tokens": 1024,
}

response = await client.post(
"https://api.openai.com/v1/chat/completions",
headers=self.headers,
json=payload,
)

if response.status_code == 200:
result = response.json()
content = result["choices"][0]["message"]["content"]
if content is None:
raise ValueError("OpenAI API returned empty response")
return content
else:
raise Exception(
f"API error {response.status_code}: {response.text}"
)

except Exception as e:
if trial >= self.max_retries:
self.logger.error(f"Failed after {self.max_retries} retries: {e}")
raise

backoff = 2**trial
self.logger.warning(f"Evaluation retry {trial + 1} in {backoff}s: {e}")
await asyncio.sleep(backoff)

raise ValueError("Failed to call OpenAI API")

async def evaluate_single(
self, question: str, target: str, predicted_answer: str
) -> Dict[str, Any]:
"""Evaluate a single response asynchronously"""
grader_prompt = ANSWER_GRADER_TEMPLATE.format(
question=question,
target=target,
predicted_answer=predicted_answer,
)

async with httpx.AsyncClient(timeout=30.0) as client:
grading_response = await self.call_openai_async(client, grader_prompt)

# Parse the grade
match = re.search(r"(A|B|C)", grading_response)
grade_letter = match.group(0) if match else "C"

# Convert to readable format
score_name = {"A": "is_correct", "B": "is_incorrect", "C": "is_not_attempted"}[
grade_letter
]

is_correct = grade_letter == "A"
is_incorrect = grade_letter == "B"
is_not_attempted = grade_letter == "C"

return {
"grade": grade_letter,
"score_name": score_name,
"is_correct": is_correct,
"is_incorrect": is_incorrect,
"is_not_attempted": is_not_attempted,
"score": is_correct,
}
===Input Data===


- Question: {question}


- Predicted Answer: {predicted_answer}


- Ground Truth Answer: {target}


===Output Format===


Provide your final evaluation in the following format:
"Explanation:" (How you made the decision?)
"Decision:" ("TRUE" or "FALSE")
Please proceed with the evaluation.
"""