From deee5eee25fec1fb9895dabf17a35a7653391753 Mon Sep 17 00:00:00 2001 From: eddyn-you Date: Thu, 29 Jan 2026 14:24:41 -0800 Subject: [PATCH 01/23] Migrate YDC to SDK --- .env.example | 7 + .gitignore | 2 +- README.md | 11 +- .../simple_qa.csv | 0 pyproject.toml | 3 + requirements.txt | 10 +- src/{simpleqa => evals}/constants.py | 0 .../processing/evaluate_answer.py | 2 +- .../processing/synthesize_answer.py | 2 +- .../samplers/applied_samplers}/exa_sampler.py | 2 +- .../serp_api_google_sampler.py | 2 +- .../applied_samplers}/tavily_sampler.py | 2 +- .../samplers/applied_samplers/you_sampler.py | 55 ++++ .../samplers/base_samplers/base_sampler.py | 95 +++++++ .../base_samplers/base_sdk_sampler.py | 190 +++++++++++++ src/{simpleqa => evals}/simpleqa_runner.py | 63 +++-- src/simpleqa/sampler/base_sampler.py | 255 ------------------ src/simpleqa/sampler/you_sampler.py | 68 ----- tests/test_simpleqa.py | 138 ++++++++++ 19 files changed, 541 insertions(+), 366 deletions(-) create mode 100644 .env.example rename src/simpleqa/data/simple_qa_test_set.csv => data/simple_qa.csv (100%) rename src/{simpleqa => evals}/constants.py (100%) rename src/{simpleqa => evals}/processing/evaluate_answer.py (99%) rename src/{simpleqa => evals}/processing/synthesize_answer.py (99%) rename src/{simpleqa/sampler => evals/samplers/applied_samplers}/exa_sampler.py (96%) rename src/{simpleqa/sampler => evals/samplers/applied_samplers}/serp_api_google_sampler.py (96%) rename src/{simpleqa/sampler => evals/samplers/applied_samplers}/tavily_sampler.py (96%) create mode 100644 src/evals/samplers/applied_samplers/you_sampler.py create mode 100644 src/evals/samplers/base_samplers/base_sampler.py create mode 100644 src/evals/samplers/base_samplers/base_sdk_sampler.py rename src/{simpleqa => evals}/simpleqa_runner.py (81%) delete mode 100644 src/simpleqa/sampler/base_sampler.py delete mode 100644 src/simpleqa/sampler/you_sampler.py create mode 100644 tests/test_simpleqa.py diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..0fb7487 --- /dev/null +++ b/.env.example @@ -0,0 +1,7 @@ +YOU_API_KEY= +OPENAI_API_KEY= +EXA_API_KEY= +PERPLEXITY_API_KEY= +PARALLEL_API_KEY= +SERP_API_KEY= +TAVILY_API_KEY= diff --git a/.gitignore b/.gitignore index 30448e5..ae7b630 100644 --- a/.gitignore +++ b/.gitignore @@ -6,7 +6,7 @@ venv/ venv* # Files -src/simpleqa/results/* +src/evals/results/* # Environment Variables .env diff --git a/README.md b/README.md index 7af3913..ae11e07 100644 --- a/README.md +++ b/README.md @@ -12,19 +12,16 @@ If you would like to reproduce the numbers or add new samplers, follow the instr cd evals ``` -2. Install the required dependencies: +2. Create a virtual environment with the tool of your choice, then install the required dependencies: ```bash + # create and activate virtual environment pip install -r requirements.txt pip install -e . ``` -3. Set up environment variables as environment variables or an .env file: +3. Set up your `.env` file and insert the appropriate API keys: ```bash - export OPENAI_API_KEY=your_openai_api_key - export YOU_API_KEY=your_you_api_key - export TAVILY_API_KEY=your_you_api_key - export EXA_API_KEY=your_you_api_key - export SERP_API_KEY=your_you_api_key + cp .env.example .env ``` ## Running a SimpleQA evaluation diff --git a/src/simpleqa/data/simple_qa_test_set.csv b/data/simple_qa.csv similarity index 100% rename from src/simpleqa/data/simple_qa_test_set.csv rename to data/simple_qa.csv diff --git a/pyproject.toml b/pyproject.toml index c52bbc7..584b344 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,3 +15,6 @@ classifiers = [ [project.urls] Homepage = "https://github.com/youdotcom-oss/evals.git" + +[tool.pytest.ini_options] +asyncio_default_fixture_loop_scope = "function" diff --git a/requirements.txt b/requirements.txt index 8cb74a0..bb29995 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,12 @@ aiohttp==3.12.15 +exa-py==2.2.0 openai==1.78.1 -pydantic==2.11.4 pandas==2.2.3 -tqdm==4.67.1 +pydantic==2.11.4 +pytest==8.3.4 +pytest-asyncio==0.24.0 +python-dotenv==1.0.1 retry==0.9.2 +tavily-python==0.7.20 +tqdm==4.67.1 +youdotcom==2.1.0 diff --git a/src/simpleqa/constants.py b/src/evals/constants.py similarity index 100% rename from src/simpleqa/constants.py rename to src/evals/constants.py diff --git a/src/simpleqa/processing/evaluate_answer.py b/src/evals/processing/evaluate_answer.py similarity index 99% rename from src/simpleqa/processing/evaluate_answer.py rename to src/evals/processing/evaluate_answer.py index bdf9bbb..8b75bac 100644 --- a/src/simpleqa/processing/evaluate_answer.py +++ b/src/evals/processing/evaluate_answer.py @@ -12,7 +12,7 @@ import httpx -from simpleqa import constants +from evals import constants # Prompt is from OpenAI's simple-evals repository https://github.com/openai/simple-evals/blob/ee3b0318d8d1d9d72755a4120879be65f7c07e9e/simpleqa_eval.py#L13 ANSWER_GRADER_TEMPLATE = """ diff --git a/src/simpleqa/processing/synthesize_answer.py b/src/evals/processing/synthesize_answer.py similarity index 99% rename from src/simpleqa/processing/synthesize_answer.py rename to src/evals/processing/synthesize_answer.py index f697f88..6ca43fb 100644 --- a/src/simpleqa/processing/synthesize_answer.py +++ b/src/evals/processing/synthesize_answer.py @@ -14,7 +14,7 @@ import httpx -from simpleqa import constants +from evals import constants @dataclass diff --git a/src/simpleqa/sampler/exa_sampler.py b/src/evals/samplers/applied_samplers/exa_sampler.py similarity index 96% rename from src/simpleqa/sampler/exa_sampler.py rename to src/evals/samplers/applied_samplers/exa_sampler.py index d7fcee0..c2adede 100644 --- a/src/simpleqa/sampler/exa_sampler.py +++ b/src/evals/samplers/applied_samplers/exa_sampler.py @@ -1,7 +1,7 @@ import os from typing import Any, Dict -from simpleqa.sampler.base_sampler import BaseSampler +from evals.samplers.base_samplers.base_sampler import BaseSampler class ExaSampler(BaseSampler): diff --git a/src/simpleqa/sampler/serp_api_google_sampler.py b/src/evals/samplers/applied_samplers/serp_api_google_sampler.py similarity index 96% rename from src/simpleqa/sampler/serp_api_google_sampler.py rename to src/evals/samplers/applied_samplers/serp_api_google_sampler.py index 2061b3c..b10c9cd 100644 --- a/src/simpleqa/sampler/serp_api_google_sampler.py +++ b/src/evals/samplers/applied_samplers/serp_api_google_sampler.py @@ -1,7 +1,7 @@ import os from typing import Any, Dict -from simpleqa.sampler.base_sampler import BaseSampler +from evals.samplers.base_samplers.base_sampler import BaseSampler class SerpApiGoogleSampler(BaseSampler): diff --git a/src/simpleqa/sampler/tavily_sampler.py b/src/evals/samplers/applied_samplers/tavily_sampler.py similarity index 96% rename from src/simpleqa/sampler/tavily_sampler.py rename to src/evals/samplers/applied_samplers/tavily_sampler.py index c5bf008..bc3c762 100644 --- a/src/simpleqa/sampler/tavily_sampler.py +++ b/src/evals/samplers/applied_samplers/tavily_sampler.py @@ -1,7 +1,7 @@ import os from typing import Any, Dict -from simpleqa.sampler.base_sampler import BaseSampler +from evals.samplers.base_samplers.base_sampler import BaseSampler class TavilySampler(BaseSampler): diff --git a/src/evals/samplers/applied_samplers/you_sampler.py b/src/evals/samplers/applied_samplers/you_sampler.py new file mode 100644 index 0000000..762f960 --- /dev/null +++ b/src/evals/samplers/applied_samplers/you_sampler.py @@ -0,0 +1,55 @@ +"""Run evals using the you.com Search SDK https://docs.you.com/api-reference/search/v1-search""" +import os +from typing import Any, Dict + +from youdotcom import You + +from evals.samplers.base_samplers.base_sdk_sampler import BaseSDKSampler + + +class YouSampler(BaseSDKSampler): + def __init__( + self, + sampler_name: str, + api_key: str = None, + timeout: float = 60.0, + max_retries: int = 3, + num_results: int = 5, + max_concurrency: int = 10, + needs_synthesis: bool = True, + custom_args: Dict[str, Any] | None = None, + ): + super().__init__( + sampler_name=sampler_name, + api_key=api_key, + max_retries=max_retries, + timeout=timeout, + num_results=num_results, + max_concurrency=max_concurrency, + needs_synthesis=needs_synthesis, + custom_args=custom_args, + ) + + def _initialize_client(self): + self.client = You(self.api_key) + + def _get_search_results_impl(self, query: str) -> Any: + return self.client.search.unified( + query=query, + count=self.num_results, + ) + + def format_results(self, results: Any) -> str: + formatted_results = [] + if results.results and results.results.web: + for result in results.results.web: + title = getattr(result, "title", "") + url = getattr(result, "url", "") + description = getattr(result, "description", "") + snippets = getattr(result, "snippets", "") + if snippets and isinstance(snippets, list): + snippets = " ".join(snippets) + formatted_results.append( + f"[{title}]({url})\n snippets: {snippets}\n description: {description}" + ) + return "\n---\n".join(formatted_results) diff --git a/src/evals/samplers/base_samplers/base_sampler.py b/src/evals/samplers/base_samplers/base_sampler.py new file mode 100644 index 0000000..8df0626 --- /dev/null +++ b/src/evals/samplers/base_samplers/base_sampler.py @@ -0,0 +1,95 @@ +from abc import ABC, abstractmethod +from typing import Any, Dict + +import httpx + +from evals.processing.synthesize_answer import SynthesizeAnswer + + +class BaseSampler(ABC): + """Base class for all samplers with common functionality""" + + def __init__( + self, + sampler_name: str, + api_key: str = None, + timeout: float = 60.0, + max_retries: int = 3, + num_results: int = 5, + max_concurrency: int = 10, + needs_synthesis: bool = True, + custom_args=None, + ): + self.sampler_name = sampler_name + self.timeout = timeout + self.max_retries = max_retries + self.num_results = num_results + self.max_concurrency = max_concurrency + self.needs_synthesis = needs_synthesis + self.custom_args = custom_args + + if api_key: + self.api_key = api_key + else: + # You do not want to raise an error here, or else you can not run an eval without ALL env variables + print(f'API Key for sampler "{sampler_name}" is not set') + self.api_key = None + + @abstractmethod + def get_search_results(self, query: str) -> Any: + """ + Get raw search results from the API or SDK. + + Args: + query: The search query string + + Returns: + Raw search results in provider-specific format + """ + pass + + @abstractmethod + def format_results(self, results: Any) -> str: + """ + Format search results. + + Args: + results: Raw search results from get_search_results + + Returns: + tuple: (formatted_results) where formatted_results is either: + - str: Already synthesized answer (no further synthesis needed) + - list[str]: List of individual search results (needs synthesis) + """ + pass + + @staticmethod + def __extract_query_from_messages__(message_list: list[dict]) -> str: + """Extract query from message list""" + if isinstance(message_list, list) and len(message_list) > 0: + last_message = message_list[-1] + if isinstance(last_message, dict) and "content" in last_message: + return last_message["content"] + return str(message_list) + + + async def __synthesize_response(self, query: str, formatted_context: str) -> str: + """ + Private method for synthesizing responses from search results using OpenAI + """ + answer_synthesizer = SynthesizeAnswer(max_retries=3) + async with httpx.AsyncClient(timeout=30.0) as client: + result = await answer_synthesizer.process_single( + client, query, formatted_context + ) + return result.response_text if result else f"Synthesis failed for: {query}" + + @staticmethod + async def __evaluate_response( + query: str, ground_truth: str, generated_answer: str + ) -> Dict[str, Any]: + """Evaluate the generated response against ground truth""" + from evals.processing.evaluate_answer import AnswerGrader + + evaluator = AnswerGrader() + return await evaluator.evaluate_single(query, ground_truth, generated_answer) diff --git a/src/evals/samplers/base_samplers/base_sdk_sampler.py b/src/evals/samplers/base_samplers/base_sdk_sampler.py new file mode 100644 index 0000000..93402cd --- /dev/null +++ b/src/evals/samplers/base_samplers/base_sdk_sampler.py @@ -0,0 +1,190 @@ +from abc import abstractmethod +import asyncio +import logging +import sys +import time +import traceback +from concurrent.futures import ThreadPoolExecutor +from typing import Any, Dict + +from evals.samplers.base_samplers.base_sampler import BaseSampler + + +class BaseSDKSampler(BaseSampler): + """Base class for SDK-based samplers that use provider SDKs""" + + def __init__( + self, + sampler_name: str, + api_key: str = None, + timeout: float = 60.0, + max_retries: int = 3, + num_results: int = 5, + max_concurrency: int = 10, + needs_synthesis: bool = True, + custom_args=None, + ): + super().__init__( + sampler_name=sampler_name, + api_key=api_key, + max_retries=max_retries, + timeout=timeout, + num_results=num_results, + max_concurrency=max_concurrency, + needs_synthesis=needs_synthesis, + custom_args=custom_args, + ) + self.client = None + if self.api_key: + self._initialize_client() + else: + raise ValueError("API key not provided") + + @abstractmethod + def _initialize_client(self): + """ + Initialize the SDK client with the API key. + + Returns: + Initialized SDK client instance + """ + pass + + @abstractmethod + def _get_search_results_impl(self, query: str) -> Any: + """ + Implementation of getting raw search results using the SDK client. + This method should be implemented by derived classes. + + Args: + query: The search query string + + Returns: + Raw search results in provider-specific format + """ + pass + + def get_search_results(self, query: str) -> Any: + """ + Get raw search results using the SDK client. + This method wraps _get_search_results_impl with error handling and timeout. + + Args: + query: The search query string + + Returns: + Raw search results in provider-specific format + + Raises: + TimeoutError: If the search operation exceeds the timeout + Exception: Re-raises any exception encountered during search + """ + try: + with ThreadPoolExecutor(max_workers=1) as executor: + future = executor.submit(self._get_search_results_impl, query) + return future.result(timeout=self.timeout) + except TimeoutError: + error_msg = f"{self.sampler_name} timed out after {self.timeout} seconds" + logging.error(error_msg) + raise TimeoutError(error_msg) + except Exception as e: + logging.error(f"{self.sampler_name} failed with error {e}") + raise e + + async def _retry_with_backoff_async(self, func, *args, **kwargs): + """Generic async retry logic with exponential backoff""" + trial = 0 + while True: + try: + return await func(*args, **kwargs) + except Exception as e: + _, _, traceback_ = sys.exc_info() + if trial >= self.max_retries: + logging.error(f"Failed after {self.max_retries} retries: {str(e)}") + raise + + trial += 1 + backoff_time = 2**trial + logging.warning( + f"Attempt {trial}/{self.max_retries} failed: {traceback.print_tb(traceback_)}. Retrying in {backoff_time}s..." + ) + await asyncio.sleep(backoff_time) + + async def __call__( + self, query_input, ground_truth: str = "", overwrite: bool = False + ) -> Dict[str, Any]: + """Main execution pipeline""" + + if isinstance(query_input, list): + query = self.__extract_query_from_messages__(query_input) + else: + query = str(query_input) + + # if self.custom_args: + # payload = self._get_payload(query=query, custom_args=self.custom_args) + # else: + # payload = self._get_payload(query=query) + # + # method = self._get_method() + # endpoint = self._get_endpoint() + + # Get raw results + try: + # Run synchronous SDK call in thread pool + start_time = time.time() + raw_results = await asyncio.to_thread( + self.get_search_results, + query + ) + response_time_no_retries = (time.time() - start_time) * 1000 # Convert to ms + formatted_results = self.format_results(raw_results) + except Exception as e: + raw_results, response_time_no_retries, formatted_results = ( + "FAILED", + "FAILED", + "FAILED", + ) + breakpoint() + logging.exception(e) + + # Synthesize raw results + try: + if self.needs_synthesis: + generated_answer = await self.__synthesize_response( + query, formatted_results + ) + else: + generated_answer = formatted_results # Already synthesized by API + except Exception as e: + generated_answer = "FAILED" + logging.exception(e) + + # Evaluated synthesized results against ground truth + try: + if ground_truth: + evaluation_result_dict = await self.__evaluate_response( + query, ground_truth, generated_answer + ) + evaluation_result = evaluation_result_dict["score_name"] + else: + raise ValueError("Ground truth is missing") + except Exception as e: + evaluation_result = "FAILED" + logging.exception(e) + + # Format result + result = { + "query": query, + "response_time_ms": response_time_no_retries, + "evaluation_result": evaluation_result, + "generated_answer": generated_answer, + "ground_truth": ground_truth, + "raw_results": raw_results, + "formatted_results": formatted_results, + } + return result + + # async def close(self): + # """Cleanup resources""" + # if hasattr(self, "client"): + # await self.client.close() diff --git a/src/simpleqa/simpleqa_runner.py b/src/evals/simpleqa_runner.py similarity index 81% rename from src/simpleqa/simpleqa_runner.py rename to src/evals/simpleqa_runner.py index 15f06b9..4503905 100644 --- a/src/simpleqa/simpleqa_runner.py +++ b/src/evals/simpleqa_runner.py @@ -15,10 +15,10 @@ import pandas as pd from tqdm import tqdm -from simpleqa.sampler.exa_sampler import ExaSampler -from simpleqa.sampler.serp_api_google_sampler import SerpApiGoogleSampler -from simpleqa.sampler.tavily_sampler import TavilySampler -from simpleqa.sampler.you_sampler import YouSampler +from evals.samplers.applied_samplers.exa_sampler import ExaSampler +from evals.samplers.applied_samplers.serp_api_google_sampler import SerpApiGoogleSampler +from evals.samplers.applied_samplers.tavily_sampler import TavilySampler +from evals.samplers.applied_samplers.you_sampler import YouSampler logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" @@ -27,27 +27,31 @@ def get_sampler_filepath(sampler_name): - return Path(os.getcwd(), f"src/simpleqa/results/raw_results_{sampler_name}.csv") + return Path(os.getcwd(), f"src/evals/results/raw_results_{sampler_name}.csv") def get_samplers(args: argparse.Namespace): """Initialize requested samplers""" samplers = { - "you": YouSampler(sampler_name="you", num_results=args.num_results), - "tavily": TavilySampler(sampler_name="tavily", num_results=args.num_results), - "google": SerpApiGoogleSampler( - sampler_name="google", num_results=args.num_results - ), - "exa": ExaSampler( - sampler_name="exa_highlights", - num_results=args.num_results, - custom_args={"type_": None}, - ), - "exa_fast": ExaSampler( - sampler_name="exa_highlights_fast", + "you_unified_search": YouSampler( + sampler_name="you_unified_search", num_results=args.num_results, - custom_args={"type_": "fast"}, + api_key=os.getenv("YOU_API_KEY"), ), + # "tavily": TavilySampler(sampler_name="tavily", num_results=args.num_results), + # "google": SerpApiGoogleSampler( + # sampler_name="google", num_results=args.num_results + # ), + # "exa": ExaSampler( + # sampler_name="exa_highlights", + # num_results=args.num_results, + # custom_args={"type_": None}, + # ), + # "exa_fast": ExaSampler( + # sampler_name="exa_highlights_fast", + # num_results=args.num_results, + # custom_args={"type_": "fast"}, + # ), } sampler_list = [] @@ -63,7 +67,7 @@ def get_samplers(args: argparse.Namespace): def clean_results_folder(): - results_folder_path = Path(os.getcwd(), "src/simpleqa/results") + results_folder_path = Path(os.getcwd(), "src/evals/results") if os.path.isdir(results_folder_path): shutil.rmtree(results_folder_path) @@ -71,7 +75,7 @@ def clean_results_folder(): def get_remaining_problems(df, sampler_name): """In case of failure, only run problems from the dataset that have not been run yet""" sampler_results_filepath = get_sampler_filepath(sampler_name) - results_folder_path = Path(os.getcwd(), "src/simpleqa/results") + results_folder_path = Path(os.getcwd(), "src/evals/results") if os.path.isdir(results_folder_path) and os.path.isfile(sampler_results_filepath): sampler_results = pd.read_csv(sampler_results_filepath) return df[~df["problem"].isin(sampler_results["query"].tolist())] @@ -127,7 +131,7 @@ async def get_search_results_and_run_evals( ) df = remaining_problems - # Run SimpleQA problems in batches + # Run problems in batches with tqdm( total=len(df), desc=f"Running sampler: {sampler.sampler_name}", @@ -156,7 +160,7 @@ async def get_search_results_and_run_evals( # Write results of each batch so we can keep progress in case of a failure write_raw_sampler_results(batch_results, sampler.sampler_name) - await sampler.close() + # await sampler.close() def write_raw_sampler_results(sampler_results: list[str | Any], sampler_name: str): @@ -166,8 +170,8 @@ def write_raw_sampler_results(sampler_results: list[str | Any], sampler_name: st This takes the raw results list, not the full results dictionary in case an individual sampler fails. """ df_sampler_results = pd.DataFrame(sampler_results) - if not os.path.isdir("src/simpleqa/results"): - os.mkdir("src/simpleqa/results") + if not os.path.isdir(Path(os.getcwd(), "src/evals/results")): + os.mkdir(Path(os.getcwd(), "src/evals/results")) sampler_results_filepath = get_sampler_filepath(sampler_name) if os.path.isfile(sampler_results_filepath): @@ -187,7 +191,7 @@ def write_raw_sampler_results(sampler_results: list[str | Any], sampler_name: st def write_metrics(): """Calculate metrics from raw results such as average score, P50 latency""" - results_path = Path(os.getcwd(), "src/simpleqa/results") + results_path = Path(os.getcwd(), "src/evals/results") files = glob.glob(f"{results_path}/raw_results_*.csv") metric_rows = [] for sampler_results_file in files: @@ -202,6 +206,9 @@ def write_metrics(): df_sampler_results[df_sampler_results["evaluation_result"] == "is_correct"] ) count_answered = len(successful_df) + if count_answered == 0: + breakpoint() + raise ValueError("No rows found in raw results file") average_score = round((correct / count_answered) * 100, 2) metric_rows.append( @@ -213,12 +220,12 @@ def write_metrics(): } ) - write_path = Path(os.getcwd(), "src/simpleqa/results/simpleqa_results.csv") + write_path = Path(os.getcwd(), "src/evals/results/simpleqa_results.csv") pd.DataFrame(metric_rows).to_csv(write_path, index=False) async def main(): - available_samplers = ["you", "exa", "exa_fast", "google", "tavily"] + available_samplers = ["you_unified_search", "exa", "exa_fast", "google", "tavily"] parser = argparse.ArgumentParser(description="Run SimpleQA eval") parser.add_argument( "--samplers", @@ -253,7 +260,7 @@ async def main(): ) parser.add_argument( "--csv-path", - default="src/simpleqa/data/simple_qa_test_set.csv", + default="src/evals/data/simple_qa_test_set.csv", type=str, help="Used to define the filepath of the test set", ) diff --git a/src/simpleqa/sampler/base_sampler.py b/src/simpleqa/sampler/base_sampler.py deleted file mode 100644 index 096f4ad..0000000 --- a/src/simpleqa/sampler/base_sampler.py +++ /dev/null @@ -1,255 +0,0 @@ -from abc import ABC, abstractmethod -import asyncio -import logging -import sys -import time -import traceback -from typing import Any, Dict - -import httpx - -from simpleqa.processing.synthesize_answer import SynthesizeAnswer - - -class BaseSampler(ABC): - """Base class for all samplers""" - - def __init__( - self, - sampler_name: str, - api_key: str, - max_retries: int = 3, - timeout: float = 60.0, - num_results: int = 5, - custom_args: Dict[str, Any] | None = None, - ): - self.sampler_name = sampler_name - if api_key: - self.api_key = api_key - else: - raise ValueError(f'API Key for sampler "{sampler_name}" is not set') - self.max_retries = max_retries - self.timeout = timeout - self.num_results = num_results - self.custom_args = custom_args - - self.logger = logging.getLogger(self.__class__.__name__) - if not self.logger.handlers: - self._setup_logger() - - self.client = self._get_client() - - def _setup_logger(self): - """Set up logger, set logging level, and disable noisy loggers""" - handler = logging.StreamHandler() - formatter = logging.Formatter("%(name)s - %(levelname)s - %(message)s") - handler.setFormatter(formatter) - self.logger.addHandler(handler) - self.logger.setLevel(logging.INFO) - - # Disable noisy third-party logging - logging.getLogger("httpx").setLevel(logging.WARNING) - logging.getLogger("urllib3").setLevel(logging.WARNING) - - def _get_client(self) -> httpx.AsyncClient: - """Setup async HTTP client with provider-specific headers""" - base_url = self._get_base_url() - headers = self._get_headers() - return httpx.AsyncClient( - base_url=base_url, - headers=headers, - timeout=self.timeout, - ) - - @staticmethod - @abstractmethod - def _get_base_url(): - """Get provider specific base url""" - pass - - @abstractmethod - def _get_headers(self) -> Dict[str, str]: - """Get provider specific headers""" - pass - - @abstractmethod - def _get_payload(self, query: str, **kwargs) -> Dict[str, Any]: - """Get provider specific request payload""" - pass - - @staticmethod - @abstractmethod - def _get_endpoint() -> Dict[str, str]: - """Get provider specific API endpoint""" - pass - - @staticmethod - @abstractmethod - def _get_method() -> Dict[str, str]: - """Get provider specific HTTP method""" - pass - - @staticmethod - @abstractmethod - def __format_context__(results: Any) -> str: - """Format search results into context string""" - pass - - async def _retry_with_backoff_async(self, func, *args, **kwargs): - """Generic async retry logic with exponential backoff""" - trial = 0 - while True: - try: - return await func(*args, **kwargs) - except Exception as e: - _, _, traceback_ = sys.exc_info() - if trial >= self.max_retries: - logging.error(f"Failed after {self.max_retries} retries: {str(e)}") - raise - - trial += 1 - backoff_time = 2**trial - logging.warning( - f"Attempt {trial}/{self.max_retries} failed: {traceback.print_tb(traceback_)}. Retrying in {backoff_time}s..." - ) - await asyncio.sleep(backoff_time) - - @property - @abstractmethod - def needs_synthesis(self) -> bool: - """Whether this provider needs response synthesis""" - pass - - async def __synthesize_response(self, query: str, formatted_context: str) -> str: - """ - Private method for synthesizing responses from search results using OpenAI - """ - answer_synthesizer = SynthesizeAnswer(max_retries=3) - async with httpx.AsyncClient(timeout=30.0) as client: - result = await answer_synthesizer.process_single( - client, query, formatted_context - ) - return result.response_text if result else f"Synthesis failed for: {query}" - - async def _get_search_results(self, method, endpoint, payload): - """Get raw search results and response time from the API""" - start_time = time.time() - if method == "POST": - response = await self.client.post(endpoint, json=payload) - elif method == "GET": - response = await self.client.get(endpoint, params=payload) - else: - raise ValueError( - 'Unsupported method, please select between ["POST", "GET"]' - ) - - end_time = time.time() - data = response.json() - response_time_ms = round((end_time - start_time) * 1000) - - if response.status_code != 200: - raise httpx.HTTPStatusError( - f"Error {response.status_code}: {data.get('error', 'Unknown error')}", - request=response.request, - response=response, - ) - - return data, response_time_ms - - @staticmethod - def __extract_query_from_messages__(message_list: list[Dict[str, Any]]) -> str: - """Extract query from message list""" - if isinstance(message_list, list) and len(message_list) > 0: - last_message = message_list[-1] - if isinstance(last_message, dict) and "content" in last_message: - return last_message["content"] - return str(message_list) - - @staticmethod - async def __evaluate_response( - query: str, ground_truth: str, generated_answer: str - ) -> Dict[str, Any]: - """Evaluate the generated response against ground truth""" - from simpleqa.processing.evaluate_answer import AnswerGrader - - evaluator = AnswerGrader() - return await evaluator.evaluate_single(query, ground_truth, generated_answer) - - async def __call__( - self, query_input, ground_truth: str = "", overwrite: bool = False - ) -> Dict[str, Any]: - """Main execution pipeline""" - - if isinstance(query_input, list): - query = self.__extract_query_from_messages__(query_input) - else: - query = str(query_input) - - if self.custom_args: - payload = self._get_payload(query=query, custom_args=self.custom_args) - else: - payload = self._get_payload(query=query) - - method = self._get_method() - endpoint = self._get_endpoint() - - # Get raw results - try: - raw_results, response_time_no_retries = ( - await self._retry_with_backoff_async( - self._get_search_results, - method=method, - endpoint=endpoint, - payload=payload, - ) - ) - formatted_results = self.__format_context__(raw_results) - except Exception as e: - raw_results, response_time_no_retries, formatted_results = ( - "FAILED", - "FAILED", - "FAILED", - ) - logging.exception(e) - - # Synthesize raw results - try: - if self.needs_synthesis: - generated_answer = await self.__synthesize_response( - query, formatted_results - ) - else: - generated_answer = formatted_results # Already synthesized by API - except Exception as e: - generated_answer = "FAILED" - logging.exception(e) - - # Evaluated synthesized results against ground truth - try: - if ground_truth: - evaluation_result_dict = await self.__evaluate_response( - query, ground_truth, generated_answer - ) - evaluation_result = evaluation_result_dict["score_name"] - else: - raise ValueError("Ground truth is missing") - except Exception as e: - evaluation_result = "FAILED" - logging.exception(e) - - # Format result - result = { - "query": query, - "response_time_ms": response_time_no_retries, - "evaluation_result": evaluation_result, - "generated_answer": generated_answer, - "ground_truth": ground_truth, - "raw_results": raw_results, - "formatted_results": formatted_results, - } - return result - - async def close(self): - """Cleanup resources""" - if hasattr(self, "client"): - await self.client.aclose() diff --git a/src/simpleqa/sampler/you_sampler.py b/src/simpleqa/sampler/you_sampler.py deleted file mode 100644 index d7132e1..0000000 --- a/src/simpleqa/sampler/you_sampler.py +++ /dev/null @@ -1,68 +0,0 @@ -import os -from typing import Any, Dict - -from simpleqa.sampler.base_sampler import BaseSampler - - -class YouSampler(BaseSampler): - - @property - def needs_synthesis(self) -> bool: - return True # Search provider, needs answer synthesis - - def __init__( - self, - sampler_name: str, - max_retries: int = 3, - timeout: float = 60.0, - num_results: int = 5, - custom_args: Dict[str, Any] | None = None, - ): - super().__init__( - sampler_name, - os.getenv("YOU_API_KEY"), - max_retries, - timeout, - num_results, - custom_args, - ) - - @staticmethod - def _get_base_url(): - return "https://api.ydc-index.io" - - def _get_headers(self) -> Dict[str, str]: - return {"x-api-key": self.api_key} - - @staticmethod - def _get_endpoint() -> str: - return "v1/search/" - - @staticmethod - def _get_method() -> str: - return "GET" - - def _get_payload( - self, query: str, custom_args: Dict[str, Any] | None = None - ) -> Dict[str, Any]: - return { - "query": query, - "num_web_results": self.num_results, - } - - @staticmethod - def __format_context__(results: Any) -> str: - formatted_results = [] - if "results" in results: - for result in results['results']['web']: - if isinstance(result, dict): - title = result.get("title", "") - url = result.get("url", "") - description = result.get("description", "") - snippet = result.get("snippets", "") - if snippet and isinstance(snippet, list): - snippet = " ".join(snippet) - formatted_results.append( - f"[{title}]({url})\n snippet: {snippet}\n description: {description}" - ) - return "\n---\n".join(formatted_results) diff --git a/tests/test_simpleqa.py b/tests/test_simpleqa.py new file mode 100644 index 0000000..7ec3c1c --- /dev/null +++ b/tests/test_simpleqa.py @@ -0,0 +1,138 @@ +"""Tests for SimpleQA evaluation runner""" +import argparse +import asyncio +import os +from pathlib import Path +import shutil + +import dotenv +import pandas as pd +import pytest + +from evals.simpleqa_runner import ( + get_search_results_and_run_evals, + get_sampler_filepath, + write_metrics, +) + + +dotenv.load_dotenv() + + +@pytest.fixture +def test_results_cleanup(): + """Cleanup test results before and after test""" + results_folder_path = Path(os.getcwd(), "src/evals/results") + + # Clean before test + if os.path.isdir(results_folder_path): + shutil.rmtree(results_folder_path) + + yield + + # Clean after test + if os.path.isdir(results_folder_path): + shutil.rmtree(results_folder_path) + + +@pytest.mark.asyncio +async def test_simpleqa_runner_you_unified_search(test_results_cleanup): + """ + Test running simpleqa_runner with you_unified_search sampler + on the simple_qa_n100 dataset. + + This test verifies that: + 1. The runner can process queries using the you_unified_search sampler + 2. Results are written to the correct output file + 3. Results contain expected columns and data + 4. Metrics are calculated correctly + """ + # Skip test if YOU_API_KEY is not set + if not os.getenv("YOU_API_KEY"): + pytest.skip("YOU_API_KEY not set - skipping test") + + # Create test arguments + num_problems = 10 + args = argparse.Namespace( + samplers=["you_unified_search"], + csv_path="data/simple_qa.csv", + limit=num_problems, # Test with small subset for speed + batch_size=5, + max_concurrent_tasks=5, + num_results=5, + clean=True, + ) + + # Run the evaluation + await get_search_results_and_run_evals(args) + + # Verify results file was created + results_filepath = get_sampler_filepath("you_unified_search") + assert os.path.isfile(results_filepath), f"Results file not created at {results_filepath}" + + # Read and verify results + df_results = pd.read_csv(results_filepath) + + # Check expected columns exist + expected_columns = ["query", "response_time_ms", "evaluation_result"] + for col in expected_columns: + assert col in df_results.columns, f"Expected column '{col}' not found in results" + + # Verify we got results for the queries + assert len(df_results) == num_problems, f"Expected {num_problems} results, got {len(df_results)}" + + # Verify all queries have non-null values + assert df_results["query"].notna().all(), "Some queries are null" + + # Write and verify metrics + write_metrics() + metrics_path = Path(os.getcwd(), "src/evals/results/simpleqa_results.csv") + assert os.path.isfile(metrics_path), "Metrics file not created" + + df_metrics = pd.read_csv(metrics_path) + assert len(df_metrics) == 1, "Expected 1 sampler in metrics" + assert df_metrics.iloc[0]["provider"] == "you_unified_search" + assert "average_score" in df_metrics.columns + assert "p50_latency" in df_metrics.columns + assert "problem_count" in df_metrics.columns + + +@pytest.mark.asyncio +async def test_simpleqa_runner_resume_capability(test_results_cleanup): + """ + Test that the runner can resume from partial results. + + This test verifies that if a run is interrupted, it can continue + from where it left off without re-processing completed queries. + """ + # Skip test if YOU_API_KEY is not set + if not os.getenv("YOU_API_KEY"): + pytest.skip("YOU_API_KEY not set - skipping test") + + num_problems = 10 + # Create test arguments for first run (partial) + args = argparse.Namespace( + samplers=["you_unified_search"], + csv_path="data/simple_qa.csv", + limit=num_problems, + batch_size=5, + max_concurrent_tasks=5, + num_results=5, + clean=True, + ) + + # First run + await get_search_results_and_run_evals(args) + results_filepath = get_sampler_filepath("you_unified_search") + df_first = pd.read_csv(results_filepath) + first_run_count = len(df_first) + assert first_run_count == num_problems, f"Expected {num_problems} results from first run, got {first_run_count}" + + # Second run with more queries (should add new results) + args.limit = 5 + args.clean = False # Don't clean, resume from existing + await get_search_results_and_run_evals(args) + + df_second = pd.read_csv(results_filepath) + second_run_count = len(df_second) + assert second_run_count == num_problems + args.limit, f"Expected {num_problems} total results after second run, got {second_run_count}" From 5f9bd39ab5cf2ef7e9b3beafa03c45fb1bf5e848 Mon Sep 17 00:00:00 2001 From: eddyn-you Date: Thu, 29 Jan 2026 15:45:53 -0800 Subject: [PATCH 02/23] Migrate Exa and Tavily to SDK --- src/evals/configs/samplers.py | 31 +++++++ .../samplers/applied_samplers/exa_sampler.py | 85 +++++++++---------- .../serp_api_google_sampler.py | 2 +- .../applied_samplers/tavily_sampler.py | 78 ++++++++--------- .../samplers/applied_samplers/you_sampler.py | 30 ++++--- .../samplers/base_samplers/base_sampler.py | 1 - .../base_samplers/base_sdk_sampler.py | 3 +- src/evals/simpleqa_runner.py | 53 +++--------- tests/test_simpleqa.py | 84 +++++++++--------- 9 files changed, 174 insertions(+), 193 deletions(-) create mode 100644 src/evals/configs/samplers.py diff --git a/src/evals/configs/samplers.py b/src/evals/configs/samplers.py new file mode 100644 index 0000000..6ce1515 --- /dev/null +++ b/src/evals/configs/samplers.py @@ -0,0 +1,31 @@ +import os + +from evals.samplers.applied_samplers.exa_sampler import ExaSampler +from evals.samplers.applied_samplers.serp_api_google_sampler import SerpApiGoogleSampler +from evals.samplers.applied_samplers.tavily_sampler import TavilySampler +from evals.samplers.applied_samplers.you_sampler import YouSampler + +SAMPLERS = [ + YouSampler( + sampler_name="you_unified_search", + api_key=os.getenv("YOU_API_KEY"), + ), + ExaSampler( + sampler_name="exa_search_with_contents", + api_key=os.getenv("EXA_API_KEY"), + custom_args={"text": True}, + ), + TavilySampler( + sampler_name="tavily_basic", + api_key=os.getenv("TAVILY_API_KEY"), + custom_args={"search_depth": "basic"}, + ), + TavilySampler( + sampler_name="tavily_advanced", + api_key=os.getenv("TAVILY_API_KEY"), + custom_args={"search_depth": "advanced"}, + ), + # SerpApiGoogleSampler( + # sampler_name="google", num_results=args.num_results + # ), +] \ No newline at end of file diff --git a/src/evals/samplers/applied_samplers/exa_sampler.py b/src/evals/samplers/applied_samplers/exa_sampler.py index c2adede..d7f04db 100644 --- a/src/evals/samplers/applied_samplers/exa_sampler.py +++ b/src/evals/samplers/applied_samplers/exa_sampler.py @@ -1,68 +1,59 @@ -import os +"""Run evals using the Exa SDK""" from typing import Any, Dict -from evals.samplers.base_samplers.base_sampler import BaseSampler +from exa_py import Exa +from evals.samplers.base_samplers.base_sdk_sampler import BaseSDKSampler -class ExaSampler(BaseSampler): - - @property - def needs_synthesis(self) -> bool: - return True # Search provider, needs answer synthesis +class ExaSampler(BaseSDKSampler): def __init__( self, sampler_name: str, - max_retries: int = 3, + api_key: str = None, timeout: float = 60.0, + max_retries: int = 3, num_results: int = 5, + max_concurrency: int = 10, + needs_synthesis: bool = True, custom_args: Dict[str, Any] | None = None, ): super().__init__( - sampler_name, - os.getenv("EXA_API_KEY"), - max_retries, - timeout, - num_results, - custom_args, + sampler_name=sampler_name, + api_key=api_key, + max_retries=max_retries, + timeout=timeout, + num_results=num_results, + max_concurrency=max_concurrency, + needs_synthesis=needs_synthesis, + custom_args=custom_args, ) - @staticmethod - def _get_base_url(): - return "https://api.exa.ai" + def _initialize_client(self): + self.client = Exa(self.api_key) - def _get_headers(self) -> Dict[str, str]: - return {"x-api-key": self.api_key} + def _get_search_results_impl(self, query: str) -> Any: + if self.custom_args and self.custom_args["text"]: + return self.client.search( + query=query, + num_results=5, + contents={ + "text": True + } + ) - def _get_payload( - self, query: str, custom_args: Dict[str, Any] | None = None - ) -> Dict[str, Any]: - payload = { - "query": query, - "numResults": self.num_results, - "contents": {"highlights": True}, - } - if custom_args and "type_" in custom_args and custom_args["type_"] is not None: - payload["type"] = custom_args["type_"] + raise ValueError("Unknown configuration for Exa") - return payload - - @staticmethod - def _get_endpoint() -> str: - return "search/" + def format_results(self, results: Any) -> str: + formatted_results = [] + raw_results = getattr(results, "results", None) - @staticmethod - def _get_method() -> str: - return "POST" + for result in raw_results: + if isinstance(result, dict): + title = getattr(result, "title", "") + url = getattr(result, "url", "") + text = getattr(result, "text", "") + if text: + formatted_results.append(f"[{title}]({url})\ntext: \"{text}\"\n") - @staticmethod - def __format_context__(results: Any) -> str: - formatted_results = [] - if "results" in results: - for result in results["results"]: - if isinstance(result, dict): - title = result.get("title", "") - url = result.get("url", "") - highlights = result.get("highlights", "") - formatted_results.append(f"[{title}]({url})\n{highlights}\n") return "\n---\n".join(formatted_results) diff --git a/src/evals/samplers/applied_samplers/serp_api_google_sampler.py b/src/evals/samplers/applied_samplers/serp_api_google_sampler.py index b10c9cd..0c988f0 100644 --- a/src/evals/samplers/applied_samplers/serp_api_google_sampler.py +++ b/src/evals/samplers/applied_samplers/serp_api_google_sampler.py @@ -48,7 +48,7 @@ def _get_payload( return { "q": query, "engine": "google", - "num": self.num_results, + "num": 5, "api_key": self.api_key, } diff --git a/src/evals/samplers/applied_samplers/tavily_sampler.py b/src/evals/samplers/applied_samplers/tavily_sampler.py index bc3c762..009a517 100644 --- a/src/evals/samplers/applied_samplers/tavily_sampler.py +++ b/src/evals/samplers/applied_samplers/tavily_sampler.py @@ -1,62 +1,56 @@ -import os +"""Run evals using the Tavily SDK""" from typing import Any, Dict -from evals.samplers.base_samplers.base_sampler import BaseSampler +from tavily import TavilyClient +from evals.samplers.base_samplers.base_sdk_sampler import BaseSDKSampler -class TavilySampler(BaseSampler): - @property - def needs_synthesis(self) -> bool: - return True # Search provider, needs answer synthesis +class TavilySampler(BaseSDKSampler): def __init__( self, sampler_name: str, - max_retries: int = 3, + api_key: str = None, timeout: float = 60.0, + max_retries: int = 3, num_results: int = 5, + max_concurrency: int = 10, + needs_synthesis: bool = True, custom_args: Dict[str, Any] | None = None, ): super().__init__( - sampler_name, - os.getenv("TAVILY_API_KEY"), - max_retries, - timeout, - num_results, - custom_args, + sampler_name=sampler_name, + api_key=api_key, + max_retries=max_retries, + timeout=timeout, + num_results=num_results, + max_concurrency=max_concurrency, + needs_synthesis=needs_synthesis, + custom_args=custom_args, ) - @staticmethod - def _get_base_url(): - return "https://api.tavily.com/" - - @staticmethod - def _get_endpoint() -> str: - return "search" + def _initialize_client(self): + self.client = TavilyClient(self.api_key) - @staticmethod - def _get_method() -> str: - return "POST" + def _get_search_results_impl(self, query: str) -> Any: + if self.custom_args and self.custom_args["search_depth"]: + return self.client.search( + query=query, + max_results=5, + search_depth=self.custom_args["search_depth"], + ) + raise ValueError("Unknown configuration for Tavily") - def _get_headers(self) -> Dict[str, str]: - return {"Authorization": f"Bearer {self.api_key}"} + def format_results(self, results: Any) -> str: + formatted_results = [] + raw_results = results["results"] - def _get_payload( - self, query: str, custom_args: Dict[str, Any] | None = None - ) -> Dict[str, Any]: - return { - "query": query, - "max_results": self.num_results, - } + for result in raw_results: + if isinstance(result, dict): + title = getattr(result, "title", "") + url = getattr(result, "url", "") + content = getattr(result, "content", "") + if content: + formatted_results.append(f"[{title}]({url})\ncontent: {content}\n") - @staticmethod - def __format_context__(results: Any) -> str: - formatted_results = [] - if "results" in results: - for result in results["results"]: - if isinstance(result, dict): - title = result.get("title", "") - url = result.get("url", "") - content = result.get("content", "") - formatted_results.append(f"[{title}]({url})\n{content}\n") return "\n---\n".join(formatted_results) diff --git a/src/evals/samplers/applied_samplers/you_sampler.py b/src/evals/samplers/applied_samplers/you_sampler.py index 762f960..b0fd2f1 100644 --- a/src/evals/samplers/applied_samplers/you_sampler.py +++ b/src/evals/samplers/applied_samplers/you_sampler.py @@ -1,5 +1,4 @@ """Run evals using the you.com Search SDK https://docs.you.com/api-reference/search/v1-search""" -import os from typing import Any, Dict from youdotcom import You @@ -14,7 +13,6 @@ def __init__( api_key: str = None, timeout: float = 60.0, max_retries: int = 3, - num_results: int = 5, max_concurrency: int = 10, needs_synthesis: bool = True, custom_args: Dict[str, Any] | None = None, @@ -24,7 +22,6 @@ def __init__( api_key=api_key, max_retries=max_retries, timeout=timeout, - num_results=num_results, max_concurrency=max_concurrency, needs_synthesis=needs_synthesis, custom_args=custom_args, @@ -36,20 +33,25 @@ def _initialize_client(self): def _get_search_results_impl(self, query: str) -> Any: return self.client.search.unified( query=query, - count=self.num_results, + count=5, ) def format_results(self, results: Any) -> str: formatted_results = [] + raw_results = [] if results.results and results.results.web: - for result in results.results.web: - title = getattr(result, "title", "") - url = getattr(result, "url", "") - description = getattr(result, "description", "") - snippets = getattr(result, "snippets", "") - if snippets and isinstance(snippets, list): - snippets = " ".join(snippets) - formatted_results.append( - f"[{title}]({url})\n snippets: {snippets}\n description: {description}" - ) + raw_results.append(results.results.web) + if results.results and results.results.news: + raw_results.append(results.results.news) + + for result in raw_results: + title = getattr(result, "title", "") + url = getattr(result, "url", "") + description = getattr(result, "description", "") + snippets = getattr(result, "snippets", "") + if snippets and isinstance(snippets, list): + snippets = " ".join(snippets) + formatted_results.append( + f"[{title}]({url})\n snippets: {snippets}\n description: {description}" + ) return "\n---\n".join(formatted_results) diff --git a/src/evals/samplers/base_samplers/base_sampler.py b/src/evals/samplers/base_samplers/base_sampler.py index 8df0626..b46cea2 100644 --- a/src/evals/samplers/base_samplers/base_sampler.py +++ b/src/evals/samplers/base_samplers/base_sampler.py @@ -23,7 +23,6 @@ def __init__( self.sampler_name = sampler_name self.timeout = timeout self.max_retries = max_retries - self.num_results = num_results self.max_concurrency = max_concurrency self.needs_synthesis = needs_synthesis self.custom_args = custom_args diff --git a/src/evals/samplers/base_samplers/base_sdk_sampler.py b/src/evals/samplers/base_samplers/base_sdk_sampler.py index 93402cd..11386f1 100644 --- a/src/evals/samplers/base_samplers/base_sdk_sampler.py +++ b/src/evals/samplers/base_samplers/base_sdk_sampler.py @@ -38,7 +38,7 @@ def __init__( if self.api_key: self._initialize_client() else: - raise ValueError("API key not provided") + raise ValueError(f"API key not provided for sampler {sampler_name}. Ensure .env file is configured and contains necessary API keys") @abstractmethod def _initialize_client(self): @@ -144,6 +144,7 @@ async def __call__( "FAILED", "FAILED", ) + # TODO: Remove breakpoint() logging.exception(e) diff --git a/src/evals/simpleqa_runner.py b/src/evals/simpleqa_runner.py index 4503905..fa1278e 100644 --- a/src/evals/simpleqa_runner.py +++ b/src/evals/simpleqa_runner.py @@ -15,10 +15,7 @@ import pandas as pd from tqdm import tqdm -from evals.samplers.applied_samplers.exa_sampler import ExaSampler -from evals.samplers.applied_samplers.serp_api_google_sampler import SerpApiGoogleSampler -from evals.samplers.applied_samplers.tavily_sampler import TavilySampler -from evals.samplers.applied_samplers.you_sampler import YouSampler +from evals.configs import samplers logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" @@ -30,40 +27,14 @@ def get_sampler_filepath(sampler_name): return Path(os.getcwd(), f"src/evals/results/raw_results_{sampler_name}.csv") -def get_samplers(args: argparse.Namespace): +def get_sampler(sampler_name: str): """Initialize requested samplers""" - samplers = { - "you_unified_search": YouSampler( - sampler_name="you_unified_search", - num_results=args.num_results, - api_key=os.getenv("YOU_API_KEY"), - ), - # "tavily": TavilySampler(sampler_name="tavily", num_results=args.num_results), - # "google": SerpApiGoogleSampler( - # sampler_name="google", num_results=args.num_results - # ), - # "exa": ExaSampler( - # sampler_name="exa_highlights", - # num_results=args.num_results, - # custom_args={"type_": None}, - # ), - # "exa_fast": ExaSampler( - # sampler_name="exa_highlights_fast", - # num_results=args.num_results, - # custom_args={"type_": "fast"}, - # ), - } - - sampler_list = [] - for sampler in args.samplers: - if sampler in samplers: - sampler_list.append(samplers[sampler]) - else: - raise ValueError( - f"Could not find sampler {sampler}. Please select from {list(samplers.keys())}" - ) - - return sampler_list + sampler = next( + (sampler for sampler in samplers.SAMPLERS if sampler.sampler_name == sampler_name), None + ) + if sampler is None: + raise ValueError(f"Sampler '{sampler_name}' not found") + return sampler def clean_results_folder(): @@ -97,7 +68,7 @@ async def process_query_with_semaphore( async def get_search_results_and_run_evals( args: argparse.Namespace, -) -> Dict[str, list[str | Any]]: +): """ Run SimpleQA benchmark for each sampler. @@ -112,9 +83,9 @@ async def get_search_results_and_run_evals( if args.clean: clean_results_folder() - samplers = get_samplers(args) results = {} - for sampler in samplers: + for sampler_name in args.samplers: + sampler = get_sampler(sampler_name) # Only run on problems that are not already in results folder remaining_problems = get_remaining_problems(df, sampler.sampler_name) if len(remaining_problems) == 0: @@ -160,8 +131,6 @@ async def get_search_results_and_run_evals( # Write results of each batch so we can keep progress in case of a failure write_raw_sampler_results(batch_results, sampler.sampler_name) - # await sampler.close() - def write_raw_sampler_results(sampler_results: list[str | Any], sampler_name: str): """ diff --git a/tests/test_simpleqa.py b/tests/test_simpleqa.py index 7ec3c1c..de69a9e 100644 --- a/tests/test_simpleqa.py +++ b/tests/test_simpleqa.py @@ -36,25 +36,21 @@ def test_results_cleanup(): @pytest.mark.asyncio -async def test_simpleqa_runner_you_unified_search(test_results_cleanup): +async def test_simpleqa_runner(test_results_cleanup): """ - Test running simpleqa_runner with you_unified_search sampler - on the simple_qa_n100 dataset. + Test running simpleqa_runner for all samplers + on the simple_qa dataset. This test verifies that: - 1. The runner can process queries using the you_unified_search sampler + 1. The runner can process queries using all samplers 2. Results are written to the correct output file 3. Results contain expected columns and data 4. Metrics are calculated correctly """ - # Skip test if YOU_API_KEY is not set - if not os.getenv("YOU_API_KEY"): - pytest.skip("YOU_API_KEY not set - skipping test") - # Create test arguments num_problems = 10 args = argparse.Namespace( - samplers=["you_unified_search"], + samplers=["you_unified_search", "exa_search_with_contents", "tavily_basic", "tavily_advanced"], csv_path="data/simple_qa.csv", limit=num_problems, # Test with small subset for speed batch_size=5, @@ -65,36 +61,36 @@ async def test_simpleqa_runner_you_unified_search(test_results_cleanup): # Run the evaluation await get_search_results_and_run_evals(args) + for sampler in args.samplers: + # Verify results file was created + results_filepath = get_sampler_filepath(sampler) + assert os.path.isfile(results_filepath), f"Results file not created at {results_filepath}" - # Verify results file was created - results_filepath = get_sampler_filepath("you_unified_search") - assert os.path.isfile(results_filepath), f"Results file not created at {results_filepath}" - - # Read and verify results - df_results = pd.read_csv(results_filepath) + # Read and verify results + df_results = pd.read_csv(results_filepath) - # Check expected columns exist - expected_columns = ["query", "response_time_ms", "evaluation_result"] - for col in expected_columns: - assert col in df_results.columns, f"Expected column '{col}' not found in results" + # Check expected columns exist + expected_columns = ["query", "response_time_ms", "evaluation_result"] + for col in expected_columns: + assert col in df_results.columns, f"Expected column '{col}' not found in results" - # Verify we got results for the queries - assert len(df_results) == num_problems, f"Expected {num_problems} results, got {len(df_results)}" + # Verify we got results for the queries + assert len(df_results) == num_problems, f"Expected {num_problems} results, got {len(df_results)}" - # Verify all queries have non-null values - assert df_results["query"].notna().all(), "Some queries are null" + # Verify all queries have non-null values + assert df_results["query"].notna().all(), "Some queries are null" - # Write and verify metrics - write_metrics() - metrics_path = Path(os.getcwd(), "src/evals/results/simpleqa_results.csv") - assert os.path.isfile(metrics_path), "Metrics file not created" + # Write and verify metrics + write_metrics() + metrics_path = Path(os.getcwd(), "src/evals/results/simpleqa_results.csv") + assert os.path.isfile(metrics_path), "Metrics file not created" - df_metrics = pd.read_csv(metrics_path) - assert len(df_metrics) == 1, "Expected 1 sampler in metrics" - assert df_metrics.iloc[0]["provider"] == "you_unified_search" - assert "average_score" in df_metrics.columns - assert "p50_latency" in df_metrics.columns - assert "problem_count" in df_metrics.columns + df_metrics = pd.read_csv(metrics_path) + assert len(df_metrics) == len(args.samplers), f"Expected {len(args.samplers)} sampler in metrics" + assert df_metrics["provider"].drop_duplicates().tolist().sort() == args.samplers.sort() + assert "average_score" in df_metrics.columns + assert "p50_latency" in df_metrics.columns + assert "problem_count" in df_metrics.columns @pytest.mark.asyncio @@ -105,10 +101,6 @@ async def test_simpleqa_runner_resume_capability(test_results_cleanup): This test verifies that if a run is interrupted, it can continue from where it left off without re-processing completed queries. """ - # Skip test if YOU_API_KEY is not set - if not os.getenv("YOU_API_KEY"): - pytest.skip("YOU_API_KEY not set - skipping test") - num_problems = 10 # Create test arguments for first run (partial) args = argparse.Namespace( @@ -123,16 +115,18 @@ async def test_simpleqa_runner_resume_capability(test_results_cleanup): # First run await get_search_results_and_run_evals(args) - results_filepath = get_sampler_filepath("you_unified_search") - df_first = pd.read_csv(results_filepath) - first_run_count = len(df_first) - assert first_run_count == num_problems, f"Expected {num_problems} results from first run, got {first_run_count}" + for sampler in args.samplers: + results_filepath = get_sampler_filepath(sampler) + df_first = pd.read_csv(results_filepath) + first_run_count = len(df_first) + assert first_run_count == num_problems, f"Expected {num_problems} results from first run, got {first_run_count}" # Second run with more queries (should add new results) args.limit = 5 args.clean = False # Don't clean, resume from existing await get_search_results_and_run_evals(args) - - df_second = pd.read_csv(results_filepath) - second_run_count = len(df_second) - assert second_run_count == num_problems + args.limit, f"Expected {num_problems} total results after second run, got {second_run_count}" + for sampler in args.samplers: + results_filepath = get_sampler_filepath(sampler) + df_second = pd.read_csv(results_filepath) + second_run_count = len(df_second) + assert second_run_count == num_problems + args.limit, f"Expected {num_problems} total results after second run, got {second_run_count}" From 49241abe04a7bf6cd4d86997e890f81025574200 Mon Sep 17 00:00:00 2001 From: eddyn-you Date: Thu, 29 Jan 2026 16:15:18 -0800 Subject: [PATCH 03/23] Refactored SERP --- src/evals/configs/samplers.py | 7 +- .../samplers/applied_samplers/exa_sampler.py | 2 - .../serp_api_google_sampler.py | 32 ++-- .../applied_samplers/tavily_sampler.py | 2 - .../base_samplers/base_api_sampler.py | 165 ++++++++++++++++++ .../samplers/base_samplers/base_sampler.py | 79 ++++++++- .../base_samplers/base_sdk_sampler.py | 157 ++++++++--------- src/evals/simpleqa_runner.py | 1 + tests/test_simpleqa.py | 2 +- 9 files changed, 339 insertions(+), 108 deletions(-) create mode 100644 src/evals/samplers/base_samplers/base_api_sampler.py diff --git a/src/evals/configs/samplers.py b/src/evals/configs/samplers.py index 6ce1515..fc9e219 100644 --- a/src/evals/configs/samplers.py +++ b/src/evals/configs/samplers.py @@ -15,6 +15,10 @@ api_key=os.getenv("EXA_API_KEY"), custom_args={"text": True}, ), + SerpApiGoogleSampler( + sampler_name="serp_google", + api_key=os.getenv("SERP_API_KEY"), + ), TavilySampler( sampler_name="tavily_basic", api_key=os.getenv("TAVILY_API_KEY"), @@ -25,7 +29,4 @@ api_key=os.getenv("TAVILY_API_KEY"), custom_args={"search_depth": "advanced"}, ), - # SerpApiGoogleSampler( - # sampler_name="google", num_results=args.num_results - # ), ] \ No newline at end of file diff --git a/src/evals/samplers/applied_samplers/exa_sampler.py b/src/evals/samplers/applied_samplers/exa_sampler.py index d7f04db..fabd42e 100644 --- a/src/evals/samplers/applied_samplers/exa_sampler.py +++ b/src/evals/samplers/applied_samplers/exa_sampler.py @@ -13,7 +13,6 @@ def __init__( api_key: str = None, timeout: float = 60.0, max_retries: int = 3, - num_results: int = 5, max_concurrency: int = 10, needs_synthesis: bool = True, custom_args: Dict[str, Any] | None = None, @@ -23,7 +22,6 @@ def __init__( api_key=api_key, max_retries=max_retries, timeout=timeout, - num_results=num_results, max_concurrency=max_concurrency, needs_synthesis=needs_synthesis, custom_args=custom_args, diff --git a/src/evals/samplers/applied_samplers/serp_api_google_sampler.py b/src/evals/samplers/applied_samplers/serp_api_google_sampler.py index 0c988f0..511e1a5 100644 --- a/src/evals/samplers/applied_samplers/serp_api_google_sampler.py +++ b/src/evals/samplers/applied_samplers/serp_api_google_sampler.py @@ -1,30 +1,29 @@ import os from typing import Any, Dict -from evals.samplers.base_samplers.base_sampler import BaseSampler +from evals.samplers.base_samplers.base_api_sampler import BaseAPISampler -class SerpApiGoogleSampler(BaseSampler): - - @property - def needs_synthesis(self) -> bool: - return True # Search provider, needs answer synthesis +class SerpApiGoogleSampler(BaseAPISampler): def __init__( self, sampler_name: str, - max_retries: int = 3, + api_key: str = None, timeout: float = 60.0, - num_results: int = 5, + max_retries: int = 3, + max_concurrency: int = 10, + needs_synthesis: bool = True, custom_args: Dict[str, Any] | None = None, ): super().__init__( - sampler_name, - os.getenv("SERP_API_KEY"), - max_retries, - timeout, - num_results, - custom_args, + sampler_name=sampler_name, + api_key=api_key, + max_retries=max_retries, + timeout=timeout, + max_concurrency=max_concurrency, + needs_synthesis=needs_synthesis, + custom_args=custom_args, ) @staticmethod @@ -33,7 +32,7 @@ def _get_base_url(): @staticmethod def _get_endpoint() -> str: - return "search/" + return "/search" @staticmethod def _get_method() -> str: @@ -52,8 +51,7 @@ def _get_payload( "api_key": self.api_key, } - @staticmethod - def __format_context__(results: Any) -> str: + def format_results(self, results: Any) -> str: formatted_results = [] if "organic_results" in results: for result in results["organic_results"]: diff --git a/src/evals/samplers/applied_samplers/tavily_sampler.py b/src/evals/samplers/applied_samplers/tavily_sampler.py index 009a517..6f717cc 100644 --- a/src/evals/samplers/applied_samplers/tavily_sampler.py +++ b/src/evals/samplers/applied_samplers/tavily_sampler.py @@ -13,7 +13,6 @@ def __init__( api_key: str = None, timeout: float = 60.0, max_retries: int = 3, - num_results: int = 5, max_concurrency: int = 10, needs_synthesis: bool = True, custom_args: Dict[str, Any] | None = None, @@ -23,7 +22,6 @@ def __init__( api_key=api_key, max_retries=max_retries, timeout=timeout, - num_results=num_results, max_concurrency=max_concurrency, needs_synthesis=needs_synthesis, custom_args=custom_args, diff --git a/src/evals/samplers/base_samplers/base_api_sampler.py b/src/evals/samplers/base_samplers/base_api_sampler.py new file mode 100644 index 0000000..2463c0f --- /dev/null +++ b/src/evals/samplers/base_samplers/base_api_sampler.py @@ -0,0 +1,165 @@ +from abc import abstractmethod +from typing import Any, Dict + +import requests + +from evals.samplers.base_samplers.base_sampler import BaseSampler + + +class BaseAPISampler(BaseSampler): + """Base class for API-based samplers that make HTTP requests""" + + def __init__( + self, + sampler_name: str, + api_key: str = None, + timeout: float = 60.0, + max_retries: int = 3, + max_concurrency: int = 10, + needs_synthesis: bool = True, + custom_args: Dict[str, Any] | None = None, + ): + super().__init__( + sampler_name=sampler_name, + api_key=api_key, + max_retries=max_retries, + timeout=timeout, + max_concurrency=max_concurrency, + needs_synthesis=needs_synthesis, + custom_args=custom_args, + ) + + def _set_params(self): + """Set API parameters before making a request""" + self.base_url = self._get_base_url() + self.method = self._get_method() + self.headers = self._get_headers() + self.endpoint = self._get_endpoint() + + @staticmethod + @abstractmethod + def _get_base_url() -> str: + """Get provider specific base url""" + pass + + @abstractmethod + def _get_headers(self) -> Dict[str, str]: + """Get provider specific headers""" + pass + + @abstractmethod + def _get_payload(self, query: str) -> Dict[str, Any]: + """Get provider specific request payload""" + pass + + @staticmethod + @abstractmethod + def _get_endpoint() -> str: + """Get provider specific API endpoint""" + pass + + @staticmethod + @abstractmethod + def _get_method() -> str: + """Get provider specific HTTP method""" + pass + + def get_search_results(self, query: str) -> Any: + """Get raw search results from the API""" + try: + self._set_params() + payload = self._get_payload(query) + + if self.method == "POST": + response = requests.post( + self.base_url + self.endpoint, + json=payload, + headers=self.headers, + timeout=self.timeout, + ) + elif self.method == "GET": + response = requests.get( + self.base_url + self.endpoint, + params=payload, + headers=self.headers, + timeout=self.timeout, + ) + else: + raise ValueError( + 'Unsupported method, please select between ["POST", "GET"]' + ) + + response.raise_for_status() + data = response.json() + + return data + except Exception as e: + print(f"{self.sampler_name} failed with error {e}") + raise e + + # async def __call__( + # self, query_input, ground_truth: str = "", overwrite: bool = False + # ) -> Dict[str, Any]: + # """Main execution pipeline""" + # + # if isinstance(query_input, list): + # query = self.__extract_query_from_messages__(query_input) + # else: + # query = str(query_input) + # + # # Get raw results + # try: + # # Run synchronous SDK call in thread pool + # start_time = time.time() + # raw_results = await asyncio.to_thread( + # self.get_search_results, + # query + # ) + # response_time_no_retries = (time.time() - start_time) * 1000 # Convert to ms + # formatted_results = self.format_results(raw_results) + # except Exception as e: + # raw_results, response_time_no_retries, formatted_results = ( + # "FAILED", + # "FAILED", + # "FAILED", + # ) + # # TODO: Remove + # breakpoint() + # logging.exception(e) + # + # # Synthesize raw results + # try: + # if self.needs_synthesis: + # generated_answer = await self.__synthesize_response( + # query, formatted_results + # ) + # else: + # generated_answer = formatted_results # Already synthesized by API + # except Exception as e: + # generated_answer = "FAILED" + # logging.exception(e) + # + # # Evaluated synthesized results against ground truth + # try: + # if ground_truth: + # evaluation_result_dict = await self.__evaluate_response( + # query, ground_truth, generated_answer + # ) + # evaluation_result = evaluation_result_dict["score_name"] + # else: + # raise ValueError("Ground truth is missing") + # except Exception as e: + # evaluation_result = "FAILED" + # logging.exception(e) + # + # # Format result + # result = { + # "query": query, + # "response_time_ms": response_time_no_retries, + # "evaluation_result": evaluation_result, + # "generated_answer": generated_answer, + # "ground_truth": ground_truth, + # "raw_results": raw_results, + # "formatted_results": formatted_results, + # } + # return result \ No newline at end of file diff --git a/src/evals/samplers/base_samplers/base_sampler.py b/src/evals/samplers/base_samplers/base_sampler.py index b46cea2..6dab597 100644 --- a/src/evals/samplers/base_samplers/base_sampler.py +++ b/src/evals/samplers/base_samplers/base_sampler.py @@ -1,4 +1,7 @@ from abc import ABC, abstractmethod +import asyncio +import logging +import time from typing import Any, Dict import httpx @@ -15,7 +18,6 @@ def __init__( api_key: str = None, timeout: float = 60.0, max_retries: int = 3, - num_results: int = 5, max_concurrency: int = 10, needs_synthesis: bool = True, custom_args=None, @@ -92,3 +94,78 @@ async def __evaluate_response( evaluator = AnswerGrader() return await evaluator.evaluate_single(query, ground_truth, generated_answer) + + async def __call__( + self, query_input, ground_truth: str = "", overwrite: bool = False + ) -> Dict[str, Any]: + """Main execution pipeline""" + + if isinstance(query_input, list): + query = self.__extract_query_from_messages__(query_input) + else: + query = str(query_input) + + # if self.custom_args: + # payload = self._get_payload(query=query, custom_args=self.custom_args) + # else: + # payload = self._get_payload(query=query) + # + # method = self._get_method() + # endpoint = self._get_endpoint() + + # Get raw results + try: + # Run synchronous SDK call in thread pool + start_time = time.time() + raw_results = await asyncio.to_thread( + self.get_search_results, + query + ) + response_time_no_retries = (time.time() - start_time) * 1000 # Convert to ms + formatted_results = self.format_results(raw_results) + except Exception as e: + raw_results, response_time_no_retries, formatted_results = ( + "FAILED", + "FAILED", + "FAILED", + ) + # TODO: Remove + breakpoint() + logging.exception(e) + + # Synthesize raw results + try: + if self.needs_synthesis: + generated_answer = await self.__synthesize_response( + query, formatted_results + ) + else: + generated_answer = formatted_results # Already synthesized by API + except Exception as e: + generated_answer = "FAILED" + logging.exception(e) + + # Evaluated synthesized results against ground truth + try: + if ground_truth: + evaluation_result_dict = await self.__evaluate_response( + query, ground_truth, generated_answer + ) + evaluation_result = evaluation_result_dict["score_name"] + else: + raise ValueError("Ground truth is missing") + except Exception as e: + evaluation_result = "FAILED" + logging.exception(e) + + # Format result + result = { + "query": query, + "response_time_ms": response_time_no_retries, + "evaluation_result": evaluation_result, + "generated_answer": generated_answer, + "ground_truth": ground_truth, + "raw_results": raw_results, + "formatted_results": formatted_results, + } + return result \ No newline at end of file diff --git a/src/evals/samplers/base_samplers/base_sdk_sampler.py b/src/evals/samplers/base_samplers/base_sdk_sampler.py index 11386f1..6d42db9 100644 --- a/src/evals/samplers/base_samplers/base_sdk_sampler.py +++ b/src/evals/samplers/base_samplers/base_sdk_sampler.py @@ -19,7 +19,6 @@ def __init__( api_key: str = None, timeout: float = 60.0, max_retries: int = 3, - num_results: int = 5, max_concurrency: int = 10, needs_synthesis: bool = True, custom_args=None, @@ -29,7 +28,6 @@ def __init__( api_key=api_key, max_retries=max_retries, timeout=timeout, - num_results=num_results, max_concurrency=max_concurrency, needs_synthesis=needs_synthesis, custom_args=custom_args, @@ -109,83 +107,78 @@ async def _retry_with_backoff_async(self, func, *args, **kwargs): f"Attempt {trial}/{self.max_retries} failed: {traceback.print_tb(traceback_)}. Retrying in {backoff_time}s..." ) await asyncio.sleep(backoff_time) - - async def __call__( - self, query_input, ground_truth: str = "", overwrite: bool = False - ) -> Dict[str, Any]: - """Main execution pipeline""" - - if isinstance(query_input, list): - query = self.__extract_query_from_messages__(query_input) - else: - query = str(query_input) - - # if self.custom_args: - # payload = self._get_payload(query=query, custom_args=self.custom_args) - # else: - # payload = self._get_payload(query=query) - # - # method = self._get_method() - # endpoint = self._get_endpoint() - - # Get raw results - try: - # Run synchronous SDK call in thread pool - start_time = time.time() - raw_results = await asyncio.to_thread( - self.get_search_results, - query - ) - response_time_no_retries = (time.time() - start_time) * 1000 # Convert to ms - formatted_results = self.format_results(raw_results) - except Exception as e: - raw_results, response_time_no_retries, formatted_results = ( - "FAILED", - "FAILED", - "FAILED", - ) - # TODO: Remove - breakpoint() - logging.exception(e) - - # Synthesize raw results - try: - if self.needs_synthesis: - generated_answer = await self.__synthesize_response( - query, formatted_results - ) - else: - generated_answer = formatted_results # Already synthesized by API - except Exception as e: - generated_answer = "FAILED" - logging.exception(e) - - # Evaluated synthesized results against ground truth - try: - if ground_truth: - evaluation_result_dict = await self.__evaluate_response( - query, ground_truth, generated_answer - ) - evaluation_result = evaluation_result_dict["score_name"] - else: - raise ValueError("Ground truth is missing") - except Exception as e: - evaluation_result = "FAILED" - logging.exception(e) - - # Format result - result = { - "query": query, - "response_time_ms": response_time_no_retries, - "evaluation_result": evaluation_result, - "generated_answer": generated_answer, - "ground_truth": ground_truth, - "raw_results": raw_results, - "formatted_results": formatted_results, - } - return result - - # async def close(self): - # """Cleanup resources""" - # if hasattr(self, "client"): - # await self.client.close() + # + # async def __call__( + # self, query_input, ground_truth: str = "", overwrite: bool = False + # ) -> Dict[str, Any]: + # """Main execution pipeline""" + # + # if isinstance(query_input, list): + # query = self.__extract_query_from_messages__(query_input) + # else: + # query = str(query_input) + # + # # if self.custom_args: + # # payload = self._get_payload(query=query, custom_args=self.custom_args) + # # else: + # # payload = self._get_payload(query=query) + # # + # # method = self._get_method() + # # endpoint = self._get_endpoint() + # + # # Get raw results + # try: + # # Run synchronous SDK call in thread pool + # start_time = time.time() + # raw_results = await asyncio.to_thread( + # self.get_search_results, + # query + # ) + # response_time_no_retries = (time.time() - start_time) * 1000 # Convert to ms + # formatted_results = self.format_results(raw_results) + # except Exception as e: + # raw_results, response_time_no_retries, formatted_results = ( + # "FAILED", + # "FAILED", + # "FAILED", + # ) + # # TODO: Remove + # breakpoint() + # logging.exception(e) + # + # # Synthesize raw results + # try: + # if self.needs_synthesis: + # generated_answer = await self.__synthesize_response( + # query, formatted_results + # ) + # else: + # generated_answer = formatted_results # Already synthesized by API + # except Exception as e: + # generated_answer = "FAILED" + # logging.exception(e) + # + # # Evaluated synthesized results against ground truth + # try: + # if ground_truth: + # evaluation_result_dict = await self.__evaluate_response( + # query, ground_truth, generated_answer + # ) + # evaluation_result = evaluation_result_dict["score_name"] + # else: + # raise ValueError("Ground truth is missing") + # except Exception as e: + # evaluation_result = "FAILED" + # logging.exception(e) + # + # # Format result + # result = { + # "query": query, + # "response_time_ms": response_time_no_retries, + # "evaluation_result": evaluation_result, + # "generated_answer": generated_answer, + # "ground_truth": ground_truth, + # "raw_results": raw_results, + # "formatted_results": formatted_results, + # } + # return result diff --git a/src/evals/simpleqa_runner.py b/src/evals/simpleqa_runner.py index fa1278e..c38c7d0 100644 --- a/src/evals/simpleqa_runner.py +++ b/src/evals/simpleqa_runner.py @@ -170,6 +170,7 @@ def write_metrics(): df_sampler_results["response_time_ms"] != "FAILED" ] + # TODO: Mean or median? p50_latency = pd.to_numeric(successful_df["response_time_ms"]).median() correct = len( df_sampler_results[df_sampler_results["evaluation_result"] == "is_correct"] diff --git a/tests/test_simpleqa.py b/tests/test_simpleqa.py index de69a9e..6d54c29 100644 --- a/tests/test_simpleqa.py +++ b/tests/test_simpleqa.py @@ -50,7 +50,7 @@ async def test_simpleqa_runner(test_results_cleanup): # Create test arguments num_problems = 10 args = argparse.Namespace( - samplers=["you_unified_search", "exa_search_with_contents", "tavily_basic", "tavily_advanced"], + samplers=["you_unified_search", "exa_search_with_contents", "tavily_basic", "tavily_advanced", "serp_google"], csv_path="data/simple_qa.csv", limit=num_problems, # Test with small subset for speed batch_size=5, From 0989bcffc5b2d9dc3655791c7c46179c7218b73f Mon Sep 17 00:00:00 2001 From: eddyn-you Date: Thu, 29 Jan 2026 16:15:55 -0800 Subject: [PATCH 04/23] Consolidate call function --- .../base_samplers/base_api_sampler.py | 67 ----------------- .../samplers/base_samplers/base_sampler.py | 8 -- .../base_samplers/base_sdk_sampler.py | 75 ------------------- 3 files changed, 150 deletions(-) diff --git a/src/evals/samplers/base_samplers/base_api_sampler.py b/src/evals/samplers/base_samplers/base_api_sampler.py index 2463c0f..31244f3 100644 --- a/src/evals/samplers/base_samplers/base_api_sampler.py +++ b/src/evals/samplers/base_samplers/base_api_sampler.py @@ -96,70 +96,3 @@ def get_search_results(self, query: str) -> Any: except Exception as e: print(f"{self.sampler_name} failed with error {e}") raise e - - # async def __call__( - # self, query_input, ground_truth: str = "", overwrite: bool = False - # ) -> Dict[str, Any]: - # """Main execution pipeline""" - # - # if isinstance(query_input, list): - # query = self.__extract_query_from_messages__(query_input) - # else: - # query = str(query_input) - # - # # Get raw results - # try: - # # Run synchronous SDK call in thread pool - # start_time = time.time() - # raw_results = await asyncio.to_thread( - # self.get_search_results, - # query - # ) - # response_time_no_retries = (time.time() - start_time) * 1000 # Convert to ms - # formatted_results = self.format_results(raw_results) - # except Exception as e: - # raw_results, response_time_no_retries, formatted_results = ( - # "FAILED", - # "FAILED", - # "FAILED", - # ) - # # TODO: Remove - # breakpoint() - # logging.exception(e) - # - # # Synthesize raw results - # try: - # if self.needs_synthesis: - # generated_answer = await self.__synthesize_response( - # query, formatted_results - # ) - # else: - # generated_answer = formatted_results # Already synthesized by API - # except Exception as e: - # generated_answer = "FAILED" - # logging.exception(e) - # - # # Evaluated synthesized results against ground truth - # try: - # if ground_truth: - # evaluation_result_dict = await self.__evaluate_response( - # query, ground_truth, generated_answer - # ) - # evaluation_result = evaluation_result_dict["score_name"] - # else: - # raise ValueError("Ground truth is missing") - # except Exception as e: - # evaluation_result = "FAILED" - # logging.exception(e) - # - # # Format result - # result = { - # "query": query, - # "response_time_ms": response_time_no_retries, - # "evaluation_result": evaluation_result, - # "generated_answer": generated_answer, - # "ground_truth": ground_truth, - # "raw_results": raw_results, - # "formatted_results": formatted_results, - # } - # return result \ No newline at end of file diff --git a/src/evals/samplers/base_samplers/base_sampler.py b/src/evals/samplers/base_samplers/base_sampler.py index 6dab597..dfa3fa3 100644 --- a/src/evals/samplers/base_samplers/base_sampler.py +++ b/src/evals/samplers/base_samplers/base_sampler.py @@ -105,14 +105,6 @@ async def __call__( else: query = str(query_input) - # if self.custom_args: - # payload = self._get_payload(query=query, custom_args=self.custom_args) - # else: - # payload = self._get_payload(query=query) - # - # method = self._get_method() - # endpoint = self._get_endpoint() - # Get raw results try: # Run synchronous SDK call in thread pool diff --git a/src/evals/samplers/base_samplers/base_sdk_sampler.py b/src/evals/samplers/base_samplers/base_sdk_sampler.py index 6d42db9..e065fff 100644 --- a/src/evals/samplers/base_samplers/base_sdk_sampler.py +++ b/src/evals/samplers/base_samplers/base_sdk_sampler.py @@ -107,78 +107,3 @@ async def _retry_with_backoff_async(self, func, *args, **kwargs): f"Attempt {trial}/{self.max_retries} failed: {traceback.print_tb(traceback_)}. Retrying in {backoff_time}s..." ) await asyncio.sleep(backoff_time) - # - # async def __call__( - # self, query_input, ground_truth: str = "", overwrite: bool = False - # ) -> Dict[str, Any]: - # """Main execution pipeline""" - # - # if isinstance(query_input, list): - # query = self.__extract_query_from_messages__(query_input) - # else: - # query = str(query_input) - # - # # if self.custom_args: - # # payload = self._get_payload(query=query, custom_args=self.custom_args) - # # else: - # # payload = self._get_payload(query=query) - # # - # # method = self._get_method() - # # endpoint = self._get_endpoint() - # - # # Get raw results - # try: - # # Run synchronous SDK call in thread pool - # start_time = time.time() - # raw_results = await asyncio.to_thread( - # self.get_search_results, - # query - # ) - # response_time_no_retries = (time.time() - start_time) * 1000 # Convert to ms - # formatted_results = self.format_results(raw_results) - # except Exception as e: - # raw_results, response_time_no_retries, formatted_results = ( - # "FAILED", - # "FAILED", - # "FAILED", - # ) - # # TODO: Remove - # breakpoint() - # logging.exception(e) - # - # # Synthesize raw results - # try: - # if self.needs_synthesis: - # generated_answer = await self.__synthesize_response( - # query, formatted_results - # ) - # else: - # generated_answer = formatted_results # Already synthesized by API - # except Exception as e: - # generated_answer = "FAILED" - # logging.exception(e) - # - # # Evaluated synthesized results against ground truth - # try: - # if ground_truth: - # evaluation_result_dict = await self.__evaluate_response( - # query, ground_truth, generated_answer - # ) - # evaluation_result = evaluation_result_dict["score_name"] - # else: - # raise ValueError("Ground truth is missing") - # except Exception as e: - # evaluation_result = "FAILED" - # logging.exception(e) - # - # # Format result - # result = { - # "query": query, - # "response_time_ms": response_time_no_retries, - # "evaluation_result": evaluation_result, - # "generated_answer": generated_answer, - # "ground_truth": ground_truth, - # "raw_results": raw_results, - # "formatted_results": formatted_results, - # } - # return result From 923f789e33fde4a65d4ea1d996a1e5a88d2cab39 Mon Sep 17 00:00:00 2001 From: eddyn-you Date: Thu, 29 Jan 2026 16:56:18 -0800 Subject: [PATCH 05/23] bug fixes --- README.md | 6 ++--- src/evals/__init__.py | 3 +++ src/evals/configs/samplers.py | 2 +- .../{simpleqa_runner.py => eval_runner.py} | 22 +++++++++++++------ .../samplers/applied_samplers/exa_sampler.py | 11 +++++----- .../applied_samplers/tavily_sampler.py | 6 ++--- .../samplers/applied_samplers/you_sampler.py | 4 ++-- .../samplers/base_samplers/base_sampler.py | 2 -- tests/test_simpleqa.py | 2 +- 9 files changed, 33 insertions(+), 25 deletions(-) create mode 100644 src/evals/__init__.py rename src/evals/{simpleqa_runner.py => eval_runner.py} (92%) diff --git a/README.md b/README.md index ae11e07..2d300e6 100644 --- a/README.md +++ b/README.md @@ -29,17 +29,17 @@ To run a SimpleQA evaluation, simply run the `simpleqa_runner.py` file with your View available arguments and samplers ```bash - python src/simpleqa/simpleqa_runner.py --help + python src/evals/eval_runner.py --help ``` Run the SimpleQA evaluation on the entire problem set for all available samplers with default settings ```bash - python src/simpleqa/simpleqa_runner.py + python src/evals/eval_runner.py ``` Run the SimpleQA evaluation on just You.com for 5 random problems ```bash - python src/simpleqa/simpleqa_runner.py --samplers you --limit 5 + python src/evals/eval_runner.py --samplers you_unified_search --limit 5 ``` ## Interpreting Results diff --git a/src/evals/__init__.py b/src/evals/__init__.py new file mode 100644 index 0000000..76acad3 --- /dev/null +++ b/src/evals/__init__.py @@ -0,0 +1,3 @@ +import dotenv + +dotenv.load_dotenv() diff --git a/src/evals/configs/samplers.py b/src/evals/configs/samplers.py index fc9e219..e154610 100644 --- a/src/evals/configs/samplers.py +++ b/src/evals/configs/samplers.py @@ -29,4 +29,4 @@ api_key=os.getenv("TAVILY_API_KEY"), custom_args={"search_depth": "advanced"}, ), -] \ No newline at end of file +] diff --git a/src/evals/simpleqa_runner.py b/src/evals/eval_runner.py similarity index 92% rename from src/evals/simpleqa_runner.py rename to src/evals/eval_runner.py index c38c7d0..1da32be 100644 --- a/src/evals/simpleqa_runner.py +++ b/src/evals/eval_runner.py @@ -10,16 +10,20 @@ import os from pathlib import Path import shutil -from typing import Any, Dict +from typing import Any import pandas as pd from tqdm import tqdm from evals.configs import samplers + logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" ) +# Mute noisy HTTP client logs +logging.getLogger("httpx").setLevel(logging.WARNING) +logging.getLogger("httpcore").setLevel(logging.WARNING) logger = logging.getLogger(__name__) @@ -170,8 +174,8 @@ def write_metrics(): df_sampler_results["response_time_ms"] != "FAILED" ] - # TODO: Mean or median? p50_latency = pd.to_numeric(successful_df["response_time_ms"]).median() + avg_latency = pd.to_numeric(successful_df["response_time_ms"]).mean() correct = len( df_sampler_results[df_sampler_results["evaluation_result"] == "is_correct"] ) @@ -179,19 +183,23 @@ def write_metrics(): if count_answered == 0: breakpoint() raise ValueError("No rows found in raw results file") - average_score = round((correct / count_answered) * 100, 2) + accuracy_score = round((correct / count_answered) * 100, 2) metric_rows.append( { "provider": sampler_name, - "average_score": average_score, - "p50_latency": p50_latency, + "accuracy_score": accuracy_score, + "p50_latency": round(float(p50_latency), 2), + "avg_latency": round(float(avg_latency), 2), "problem_count": count_answered, } ) write_path = Path(os.getcwd(), "src/evals/results/simpleqa_results.csv") - pd.DataFrame(metric_rows).to_csv(write_path, index=False) + metric_df = pd.DataFrame(metric_rows) + metric_df.to_csv(write_path, index=False) + print(f"Results were written to {write_path}") + print(metric_df) async def main(): @@ -230,7 +238,7 @@ async def main(): ) parser.add_argument( "--csv-path", - default="src/evals/data/simple_qa_test_set.csv", + default="data/simple_qa.csv", type=str, help="Used to define the filepath of the test set", ) diff --git a/src/evals/samplers/applied_samplers/exa_sampler.py b/src/evals/samplers/applied_samplers/exa_sampler.py index fabd42e..a939bcd 100644 --- a/src/evals/samplers/applied_samplers/exa_sampler.py +++ b/src/evals/samplers/applied_samplers/exa_sampler.py @@ -47,11 +47,10 @@ def format_results(self, results: Any) -> str: raw_results = getattr(results, "results", None) for result in raw_results: - if isinstance(result, dict): - title = getattr(result, "title", "") - url = getattr(result, "url", "") - text = getattr(result, "text", "") - if text: - formatted_results.append(f"[{title}]({url})\ntext: \"{text}\"\n") + title = getattr(result, "title", "") + url = getattr(result, "url", "") + text = getattr(result, "text", "") + if text: + formatted_results.append(f"[{title}]({url})\ntext: \"{text}\"\n") return "\n---\n".join(formatted_results) diff --git a/src/evals/samplers/applied_samplers/tavily_sampler.py b/src/evals/samplers/applied_samplers/tavily_sampler.py index 6f717cc..c9f74bb 100644 --- a/src/evals/samplers/applied_samplers/tavily_sampler.py +++ b/src/evals/samplers/applied_samplers/tavily_sampler.py @@ -45,9 +45,9 @@ def format_results(self, results: Any) -> str: for result in raw_results: if isinstance(result, dict): - title = getattr(result, "title", "") - url = getattr(result, "url", "") - content = getattr(result, "content", "") + title = result.get("title", "") + url = result.get("url", "") + content = result.get("content", "") if content: formatted_results.append(f"[{title}]({url})\ncontent: {content}\n") diff --git a/src/evals/samplers/applied_samplers/you_sampler.py b/src/evals/samplers/applied_samplers/you_sampler.py index b0fd2f1..cabf942 100644 --- a/src/evals/samplers/applied_samplers/you_sampler.py +++ b/src/evals/samplers/applied_samplers/you_sampler.py @@ -40,9 +40,9 @@ def format_results(self, results: Any) -> str: formatted_results = [] raw_results = [] if results.results and results.results.web: - raw_results.append(results.results.web) + raw_results.extend(results.results.web) if results.results and results.results.news: - raw_results.append(results.results.news) + raw_results.extend(results.results.news) for result in raw_results: title = getattr(result, "title", "") diff --git a/src/evals/samplers/base_samplers/base_sampler.py b/src/evals/samplers/base_samplers/base_sampler.py index dfa3fa3..aa42f09 100644 --- a/src/evals/samplers/base_samplers/base_sampler.py +++ b/src/evals/samplers/base_samplers/base_sampler.py @@ -121,8 +121,6 @@ async def __call__( "FAILED", "FAILED", ) - # TODO: Remove - breakpoint() logging.exception(e) # Synthesize raw results diff --git a/tests/test_simpleqa.py b/tests/test_simpleqa.py index 6d54c29..a1df2cf 100644 --- a/tests/test_simpleqa.py +++ b/tests/test_simpleqa.py @@ -9,7 +9,7 @@ import pandas as pd import pytest -from evals.simpleqa_runner import ( +from evals.eval_runner import ( get_search_results_and_run_evals, get_sampler_filepath, write_metrics, From b23e4b78e7f4a94916497eec89d83bebe1ac958a Mon Sep 17 00:00:00 2001 From: eddyn-you Date: Thu, 29 Jan 2026 16:56:50 -0800 Subject: [PATCH 06/23] Ran ruff --- src/evals/eval_runner.py | 58 ++++++------------- src/evals/processing/evaluate_answer.py | 12 +--- src/evals/processing/synthesize_answer.py | 4 +- .../samplers/applied_samplers/exa_sampler.py | 11 +--- .../serp_api_google_sampler.py | 5 +- .../applied_samplers/tavily_sampler.py | 1 + .../samplers/applied_samplers/you_sampler.py | 5 +- .../base_samplers/base_api_sampler.py | 4 +- .../samplers/base_samplers/base_sampler.py | 28 +++------ .../base_samplers/base_sdk_sampler.py | 4 +- tests/test_simpleqa.py | 5 +- 11 files changed, 43 insertions(+), 94 deletions(-) diff --git a/src/evals/eval_runner.py b/src/evals/eval_runner.py index 1da32be..9cf44c0 100644 --- a/src/evals/eval_runner.py +++ b/src/evals/eval_runner.py @@ -18,9 +18,7 @@ from evals.configs import samplers -logging.basicConfig( - level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" -) +logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") # Mute noisy HTTP client logs logging.getLogger("httpx").setLevel(logging.WARNING) logging.getLogger("httpcore").setLevel(logging.WARNING) @@ -33,9 +31,7 @@ def get_sampler_filepath(sampler_name): def get_sampler(sampler_name: str): """Initialize requested samplers""" - sampler = next( - (sampler for sampler in samplers.SAMPLERS if sampler.sampler_name == sampler_name), None - ) + sampler = next((sampler for sampler in samplers.SAMPLERS if sampler.sampler_name == sampler_name), None) if sampler is None: raise ValueError(f"Sampler '{sampler_name}' not found") return sampler @@ -57,16 +53,12 @@ def get_remaining_problems(df, sampler_name): return df -async def process_query_with_semaphore( - semaphore, sampler, target_query, target_ground_truth -): +async def process_query_with_semaphore(semaphore, sampler, target_query, target_ground_truth): async with semaphore: try: return await sampler(target_query, ground_truth=target_ground_truth) except Exception as e: - logging.error( - f"Failed to run {sampler.sampler_name} for query: {target_query}" - ) + logging.error(f"Failed to run {sampler.sampler_name} for query: {target_query}") return e @@ -93,17 +85,11 @@ async def get_search_results_and_run_evals( # Only run on problems that are not already in results folder remaining_problems = get_remaining_problems(df, sampler.sampler_name) if len(remaining_problems) == 0: - logging.info( - f"No problems remaining for sampler {sampler.sampler_name}, moving on..." - ) - results[sampler.sampler_name] = pd.read_csv( - get_sampler_filepath(sampler.sampler_name) - ) + logging.info(f"No problems remaining for sampler {sampler.sampler_name}, moving on...") + results[sampler.sampler_name] = pd.read_csv(get_sampler_filepath(sampler.sampler_name)) continue - logging.info( - f"Running sampler {sampler.sampler_name} on {len(remaining_problems)} problems" - ) + logging.info(f"Running sampler {sampler.sampler_name} on {len(remaining_problems)} problems") df = remaining_problems # Run problems in batches @@ -121,11 +107,7 @@ async def get_search_results_and_run_evals( for _, row in batch_df.iterrows(): query = row["problem"] ground_truth = row["answer"] - task = asyncio.create_task( - process_query_with_semaphore( - semaphore, sampler, query, ground_truth - ) - ) + task = asyncio.create_task(process_query_with_semaphore(semaphore, sampler, query, ground_truth)) tasks.append(task) batch_results = await asyncio.gather(*tasks, return_exceptions=True) @@ -170,30 +152,24 @@ def write_metrics(): for sampler_results_file in files: sampler_name = sampler_results_file.split("raw_results_")[-1].split(".")[0] df_sampler_results = pd.read_csv(sampler_results_file) - successful_df = df_sampler_results[ - df_sampler_results["response_time_ms"] != "FAILED" - ] + successful_df = df_sampler_results[df_sampler_results["response_time_ms"] != "FAILED"] p50_latency = pd.to_numeric(successful_df["response_time_ms"]).median() avg_latency = pd.to_numeric(successful_df["response_time_ms"]).mean() - correct = len( - df_sampler_results[df_sampler_results["evaluation_result"] == "is_correct"] - ) + correct = len(df_sampler_results[df_sampler_results["evaluation_result"] == "is_correct"]) count_answered = len(successful_df) if count_answered == 0: breakpoint() raise ValueError("No rows found in raw results file") accuracy_score = round((correct / count_answered) * 100, 2) - metric_rows.append( - { - "provider": sampler_name, - "accuracy_score": accuracy_score, - "p50_latency": round(float(p50_latency), 2), - "avg_latency": round(float(avg_latency), 2), - "problem_count": count_answered, - } - ) + metric_rows.append({ + "provider": sampler_name, + "accuracy_score": accuracy_score, + "p50_latency": round(float(p50_latency), 2), + "avg_latency": round(float(avg_latency), 2), + "problem_count": count_answered, + }) write_path = Path(os.getcwd(), "src/evals/results/simpleqa_results.csv") metric_df = pd.DataFrame(metric_rows) diff --git a/src/evals/processing/evaluate_answer.py b/src/evals/processing/evaluate_answer.py index 8b75bac..fb5cd07 100644 --- a/src/evals/processing/evaluate_answer.py +++ b/src/evals/processing/evaluate_answer.py @@ -133,9 +133,7 @@ async def call_openai_async(self, client: httpx.AsyncClient, prompt: str) -> str raise ValueError("OpenAI API returned empty response") return content else: - raise Exception( - f"API error {response.status_code}: {response.text}" - ) + raise Exception(f"API error {response.status_code}: {response.text}") except Exception as e: if trial >= self.max_retries: @@ -148,9 +146,7 @@ async def call_openai_async(self, client: httpx.AsyncClient, prompt: str) -> str raise ValueError("Failed to call OpenAI API") - async def evaluate_single( - self, question: str, target: str, predicted_answer: str - ) -> Dict[str, Any]: + async def evaluate_single(self, question: str, target: str, predicted_answer: str) -> Dict[str, Any]: """Evaluate a single response asynchronously""" grader_prompt = ANSWER_GRADER_TEMPLATE.format( question=question, @@ -166,9 +162,7 @@ async def evaluate_single( grade_letter = match.group(0) if match else "C" # Convert to readable format - score_name = {"A": "is_correct", "B": "is_incorrect", "C": "is_not_attempted"}[ - grade_letter - ] + score_name = {"A": "is_correct", "B": "is_incorrect", "C": "is_not_attempted"}[grade_letter] is_correct = grade_letter == "A" is_incorrect = grade_letter == "B" diff --git a/src/evals/processing/synthesize_answer.py b/src/evals/processing/synthesize_answer.py index 6ca43fb..3ddc29c 100644 --- a/src/evals/processing/synthesize_answer.py +++ b/src/evals/processing/synthesize_answer.py @@ -35,9 +35,7 @@ def __init__(self, max_retries: int = 3): "Content-Type": "application/json", } - async def process_single( - self, session: httpx.AsyncClient, query: str, snippets: str - ) -> SynthesizeAnswerResponse: + async def process_single(self, session: httpx.AsyncClient, query: str, snippets: str) -> SynthesizeAnswerResponse: """Synthesize a single response asynchronously""" for trial in range(self.max_retries + 1): try: diff --git a/src/evals/samplers/applied_samplers/exa_sampler.py b/src/evals/samplers/applied_samplers/exa_sampler.py index a939bcd..f74aac5 100644 --- a/src/evals/samplers/applied_samplers/exa_sampler.py +++ b/src/evals/samplers/applied_samplers/exa_sampler.py @@ -1,4 +1,5 @@ """Run evals using the Exa SDK""" + from typing import Any, Dict from exa_py import Exa @@ -32,13 +33,7 @@ def _initialize_client(self): def _get_search_results_impl(self, query: str) -> Any: if self.custom_args and self.custom_args["text"]: - return self.client.search( - query=query, - num_results=5, - contents={ - "text": True - } - ) + return self.client.search(query=query, num_results=5, contents={"text": True}) raise ValueError("Unknown configuration for Exa") @@ -51,6 +46,6 @@ def format_results(self, results: Any) -> str: url = getattr(result, "url", "") text = getattr(result, "text", "") if text: - formatted_results.append(f"[{title}]({url})\ntext: \"{text}\"\n") + formatted_results.append(f'[{title}]({url})\ntext: "{text}"\n') return "\n---\n".join(formatted_results) diff --git a/src/evals/samplers/applied_samplers/serp_api_google_sampler.py b/src/evals/samplers/applied_samplers/serp_api_google_sampler.py index 511e1a5..1864869 100644 --- a/src/evals/samplers/applied_samplers/serp_api_google_sampler.py +++ b/src/evals/samplers/applied_samplers/serp_api_google_sampler.py @@ -5,7 +5,6 @@ class SerpApiGoogleSampler(BaseAPISampler): - def __init__( self, sampler_name: str, @@ -41,9 +40,7 @@ def _get_method() -> str: def _get_headers(self) -> Dict[str, str]: return {} - def _get_payload( - self, query: str, custom_args: Dict[str, Any] | None = None - ) -> Dict[str, Any]: + def _get_payload(self, query: str, custom_args: Dict[str, Any] | None = None) -> Dict[str, Any]: return { "q": query, "engine": "google", diff --git a/src/evals/samplers/applied_samplers/tavily_sampler.py b/src/evals/samplers/applied_samplers/tavily_sampler.py index c9f74bb..b7a212d 100644 --- a/src/evals/samplers/applied_samplers/tavily_sampler.py +++ b/src/evals/samplers/applied_samplers/tavily_sampler.py @@ -1,4 +1,5 @@ """Run evals using the Tavily SDK""" + from typing import Any, Dict from tavily import TavilyClient diff --git a/src/evals/samplers/applied_samplers/you_sampler.py b/src/evals/samplers/applied_samplers/you_sampler.py index cabf942..c4573c7 100644 --- a/src/evals/samplers/applied_samplers/you_sampler.py +++ b/src/evals/samplers/applied_samplers/you_sampler.py @@ -1,4 +1,5 @@ """Run evals using the you.com Search SDK https://docs.you.com/api-reference/search/v1-search""" + from typing import Any, Dict from youdotcom import You @@ -51,7 +52,5 @@ def format_results(self, results: Any) -> str: snippets = getattr(result, "snippets", "") if snippets and isinstance(snippets, list): snippets = " ".join(snippets) - formatted_results.append( - f"[{title}]({url})\n snippets: {snippets}\n description: {description}" - ) + formatted_results.append(f"[{title}]({url})\n snippets: {snippets}\n description: {description}") return "\n---\n".join(formatted_results) diff --git a/src/evals/samplers/base_samplers/base_api_sampler.py b/src/evals/samplers/base_samplers/base_api_sampler.py index 31244f3..5708929 100644 --- a/src/evals/samplers/base_samplers/base_api_sampler.py +++ b/src/evals/samplers/base_samplers/base_api_sampler.py @@ -85,9 +85,7 @@ def get_search_results(self, query: str) -> Any: timeout=self.timeout, ) else: - raise ValueError( - 'Unsupported method, please select between ["POST", "GET"]' - ) + raise ValueError('Unsupported method, please select between ["POST", "GET"]') response.raise_for_status() data = response.json() diff --git a/src/evals/samplers/base_samplers/base_sampler.py b/src/evals/samplers/base_samplers/base_sampler.py index aa42f09..e440137 100644 --- a/src/evals/samplers/base_samplers/base_sampler.py +++ b/src/evals/samplers/base_samplers/base_sampler.py @@ -73,31 +73,24 @@ def __extract_query_from_messages__(message_list: list[dict]) -> str: return last_message["content"] return str(message_list) - async def __synthesize_response(self, query: str, formatted_context: str) -> str: """ Private method for synthesizing responses from search results using OpenAI """ answer_synthesizer = SynthesizeAnswer(max_retries=3) async with httpx.AsyncClient(timeout=30.0) as client: - result = await answer_synthesizer.process_single( - client, query, formatted_context - ) + result = await answer_synthesizer.process_single(client, query, formatted_context) return result.response_text if result else f"Synthesis failed for: {query}" @staticmethod - async def __evaluate_response( - query: str, ground_truth: str, generated_answer: str - ) -> Dict[str, Any]: + async def __evaluate_response(query: str, ground_truth: str, generated_answer: str) -> Dict[str, Any]: """Evaluate the generated response against ground truth""" from evals.processing.evaluate_answer import AnswerGrader evaluator = AnswerGrader() return await evaluator.evaluate_single(query, ground_truth, generated_answer) - async def __call__( - self, query_input, ground_truth: str = "", overwrite: bool = False - ) -> Dict[str, Any]: + async def __call__(self, query_input, ground_truth: str = "", overwrite: bool = False) -> Dict[str, Any]: """Main execution pipeline""" if isinstance(query_input, list): @@ -109,10 +102,7 @@ async def __call__( try: # Run synchronous SDK call in thread pool start_time = time.time() - raw_results = await asyncio.to_thread( - self.get_search_results, - query - ) + raw_results = await asyncio.to_thread(self.get_search_results, query) response_time_no_retries = (time.time() - start_time) * 1000 # Convert to ms formatted_results = self.format_results(raw_results) except Exception as e: @@ -126,9 +116,7 @@ async def __call__( # Synthesize raw results try: if self.needs_synthesis: - generated_answer = await self.__synthesize_response( - query, formatted_results - ) + generated_answer = await self.__synthesize_response(query, formatted_results) else: generated_answer = formatted_results # Already synthesized by API except Exception as e: @@ -138,9 +126,7 @@ async def __call__( # Evaluated synthesized results against ground truth try: if ground_truth: - evaluation_result_dict = await self.__evaluate_response( - query, ground_truth, generated_answer - ) + evaluation_result_dict = await self.__evaluate_response(query, ground_truth, generated_answer) evaluation_result = evaluation_result_dict["score_name"] else: raise ValueError("Ground truth is missing") @@ -158,4 +144,4 @@ async def __call__( "raw_results": raw_results, "formatted_results": formatted_results, } - return result \ No newline at end of file + return result diff --git a/src/evals/samplers/base_samplers/base_sdk_sampler.py b/src/evals/samplers/base_samplers/base_sdk_sampler.py index e065fff..a116f84 100644 --- a/src/evals/samplers/base_samplers/base_sdk_sampler.py +++ b/src/evals/samplers/base_samplers/base_sdk_sampler.py @@ -36,7 +36,9 @@ def __init__( if self.api_key: self._initialize_client() else: - raise ValueError(f"API key not provided for sampler {sampler_name}. Ensure .env file is configured and contains necessary API keys") + raise ValueError( + f"API key not provided for sampler {sampler_name}. Ensure .env file is configured and contains necessary API keys" + ) @abstractmethod def _initialize_client(self): diff --git a/tests/test_simpleqa.py b/tests/test_simpleqa.py index a1df2cf..13c830b 100644 --- a/tests/test_simpleqa.py +++ b/tests/test_simpleqa.py @@ -1,4 +1,5 @@ """Tests for SimpleQA evaluation runner""" + import argparse import asyncio import os @@ -129,4 +130,6 @@ async def test_simpleqa_runner_resume_capability(test_results_cleanup): results_filepath = get_sampler_filepath(sampler) df_second = pd.read_csv(results_filepath) second_run_count = len(df_second) - assert second_run_count == num_problems + args.limit, f"Expected {num_problems} total results after second run, got {second_run_count}" + assert second_run_count == num_problems + args.limit, ( + f"Expected {num_problems} total results after second run, got {second_run_count}" + ) From d1d311616226ee98e1c14a56da5d640daef79200 Mon Sep 17 00:00:00 2001 From: eddyn-you Date: Thu, 29 Jan 2026 16:58:30 -0800 Subject: [PATCH 07/23] Add note on latency --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 2d300e6..7b9be14 100644 --- a/README.md +++ b/README.md @@ -45,4 +45,7 @@ Run the SimpleQA evaluation on just You.com for 5 random problems ## Interpreting Results Results files will be placed in `simpleqa/results` after a successful run of SimpleQA. Files following the pattern `raw_results_{sampler}.csv` are the raw results for each individual sampler. The file `simpleqa_results.csv` contains -aggregated results with various metrics useful for analysis. \ No newline at end of file +aggregated results with various metrics useful for analysis. + +Please note that latency numbers include the total time it takes to run the API request on your machine, so your network +speeds will impact reported numbers and may fluctuate between runs. \ No newline at end of file From 4d4d40762be9805923b8de9504de93801f940def Mon Sep 17 00:00:00 2001 From: eddyn-you Date: Thu, 29 Jan 2026 17:03:50 -0800 Subject: [PATCH 08/23] Separate out metrics code --- src/evals/eval_results_analyzer.py | 98 ++++++++++++++++++++++++++++++ src/evals/eval_runner.py | 40 +----------- tests/test_simpleqa.py | 2 +- 3 files changed, 102 insertions(+), 38 deletions(-) create mode 100644 src/evals/eval_results_analyzer.py diff --git a/src/evals/eval_results_analyzer.py b/src/evals/eval_results_analyzer.py new file mode 100644 index 0000000..7235f80 --- /dev/null +++ b/src/evals/eval_results_analyzer.py @@ -0,0 +1,98 @@ +""" +Analyze and calculate metrics from evaluation results. + +This module provides functionality to process raw evaluation results, +calculate performance metrics, and generate summary reports. +""" + +import glob +import os +from pathlib import Path +from typing import List, Dict, Any + +import pandas as pd + + +def write_metrics(): + """Calculate metrics from raw results such as accuracy score, P50 latency, and average latency""" + results_path = Path(os.getcwd(), "src/evals/results") + files = glob.glob(f"{results_path}/raw_results_*.csv") + metric_rows = [] + + for sampler_results_file in files: + sampler_name = sampler_results_file.split("raw_results_")[-1].split(".")[0] + df_sampler_results = pd.read_csv(sampler_results_file) + successful_df = df_sampler_results[df_sampler_results["response_time_ms"] != "FAILED"] + + p50_latency = pd.to_numeric(successful_df["response_time_ms"]).median() + avg_latency = pd.to_numeric(successful_df["response_time_ms"]).mean() + correct = len(df_sampler_results[df_sampler_results["evaluation_result"] == "is_correct"]) + count_answered = len(successful_df) + + if count_answered == 0: + raise ValueError(f"No successful results found for sampler {sampler_name}") + + accuracy_score = round((correct / count_answered) * 100, 2) + + metric_rows.append({ + "provider": sampler_name, + "accuracy_score": accuracy_score, + "p50_latency": round(float(p50_latency), 2), + "avg_latency": round(float(avg_latency), 2), + "problem_count": count_answered, + }) + + write_path = Path(os.getcwd(), "src/evals/results/simpleqa_results.csv") + metric_df = pd.DataFrame(metric_rows) + metric_df.to_csv(write_path, index=False) + print(f"Results were written to {write_path}") + print(metric_df) + + +def calculate_sampler_metrics(sampler_results_file: str) -> Dict[str, Any]: + """ + Calculate metrics for a single sampler's results. + + Args: + sampler_results_file: Path to the raw results CSV file + + Returns: + Dictionary containing calculated metrics + """ + sampler_name = sampler_results_file.split("raw_results_")[-1].split(".")[0] + df_sampler_results = pd.read_csv(sampler_results_file) + successful_df = df_sampler_results[df_sampler_results["response_time_ms"] != "FAILED"] + + p50_latency = pd.to_numeric(successful_df["response_time_ms"]).median() + avg_latency = pd.to_numeric(successful_df["response_time_ms"]).mean() + correct = len(df_sampler_results[df_sampler_results["evaluation_result"] == "is_correct"]) + count_answered = len(successful_df) + + if count_answered == 0: + raise ValueError(f"No successful results found for sampler {sampler_name}") + + accuracy_score = round((correct / count_answered) * 100, 2) + + return { + "provider": sampler_name, + "accuracy_score": accuracy_score, + "p50_latency": round(float(p50_latency), 2), + "avg_latency": round(float(avg_latency), 2), + "problem_count": count_answered, + } + + +def get_results_files(results_dir: Path = None) -> List[str]: + """ + Get all raw results files from the results directory. + + Args: + results_dir: Optional path to results directory. Defaults to src/evals/results + + Returns: + List of file paths to raw results files + """ + if results_dir is None: + results_dir = Path(os.getcwd(), "src/evals/results") + + return glob.glob(f"{results_dir}/raw_results_*.csv") diff --git a/src/evals/eval_runner.py b/src/evals/eval_runner.py index 9cf44c0..3a9d096 100644 --- a/src/evals/eval_runner.py +++ b/src/evals/eval_runner.py @@ -5,7 +5,6 @@ import argparse import asyncio -import glob import logging import os from pathlib import Path @@ -16,6 +15,7 @@ from tqdm import tqdm from evals.configs import samplers +from evals.eval_results_analyzer import write_metrics logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") @@ -62,7 +62,7 @@ async def process_query_with_semaphore(semaphore, sampler, target_query, target_ return e -async def get_search_results_and_run_evals( +async def run_evals( args: argparse.Namespace, ): """ @@ -144,40 +144,6 @@ def write_raw_sampler_results(sampler_results: list[str | Any], sampler_name: st ) -def write_metrics(): - """Calculate metrics from raw results such as average score, P50 latency""" - results_path = Path(os.getcwd(), "src/evals/results") - files = glob.glob(f"{results_path}/raw_results_*.csv") - metric_rows = [] - for sampler_results_file in files: - sampler_name = sampler_results_file.split("raw_results_")[-1].split(".")[0] - df_sampler_results = pd.read_csv(sampler_results_file) - successful_df = df_sampler_results[df_sampler_results["response_time_ms"] != "FAILED"] - - p50_latency = pd.to_numeric(successful_df["response_time_ms"]).median() - avg_latency = pd.to_numeric(successful_df["response_time_ms"]).mean() - correct = len(df_sampler_results[df_sampler_results["evaluation_result"] == "is_correct"]) - count_answered = len(successful_df) - if count_answered == 0: - breakpoint() - raise ValueError("No rows found in raw results file") - accuracy_score = round((correct / count_answered) * 100, 2) - - metric_rows.append({ - "provider": sampler_name, - "accuracy_score": accuracy_score, - "p50_latency": round(float(p50_latency), 2), - "avg_latency": round(float(avg_latency), 2), - "problem_count": count_answered, - }) - - write_path = Path(os.getcwd(), "src/evals/results/simpleqa_results.csv") - metric_df = pd.DataFrame(metric_rows) - metric_df.to_csv(write_path, index=False) - print(f"Results were written to {write_path}") - print(metric_df) - - async def main(): available_samplers = ["you_unified_search", "exa", "exa_fast", "google", "tavily"] parser = argparse.ArgumentParser(description="Run SimpleQA eval") @@ -227,7 +193,7 @@ async def main(): args = parser.parse_args() - await get_search_results_and_run_evals(args) + await run_evals(args) write_metrics() diff --git a/tests/test_simpleqa.py b/tests/test_simpleqa.py index 13c830b..2d79277 100644 --- a/tests/test_simpleqa.py +++ b/tests/test_simpleqa.py @@ -13,8 +13,8 @@ from evals.eval_runner import ( get_search_results_and_run_evals, get_sampler_filepath, - write_metrics, ) +from evals.eval_results_analyzer import write_metrics dotenv.load_dotenv() From 93c7560527e9fb061aa9208d965c715d4594a3c4 Mon Sep 17 00:00:00 2001 From: eddyn-you Date: Thu, 29 Jan 2026 17:12:24 -0800 Subject: [PATCH 09/23] Test results write to their own folder --- .gitignore | 1 + src/evals/eval_results_analyzer.py | 28 ++++++++++----- src/evals/eval_runner.py | 55 +++++++++++++++++++----------- tests/test_simpleqa.py | 29 ++++++++++------ 4 files changed, 75 insertions(+), 38 deletions(-) diff --git a/.gitignore b/.gitignore index ae7b630..16f1d2a 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ venv* # Files src/evals/results/* +tests/results/* # Environment Variables .env diff --git a/src/evals/eval_results_analyzer.py b/src/evals/eval_results_analyzer.py index 7235f80..dfd4d8a 100644 --- a/src/evals/eval_results_analyzer.py +++ b/src/evals/eval_results_analyzer.py @@ -8,15 +8,27 @@ import glob import os from pathlib import Path -from typing import List, Dict, Any +from typing import List, Dict, Any, Optional import pandas as pd -def write_metrics(): - """Calculate metrics from raw results such as accuracy score, P50 latency, and average latency""" - results_path = Path(os.getcwd(), "src/evals/results") - files = glob.glob(f"{results_path}/raw_results_*.csv") +def get_default_results_dir() -> Path: + """Get the default results directory path.""" + return Path(os.getcwd(), "src/evals/results") + + +def write_metrics(results_dir: Optional[Path] = None): + """ + Calculate metrics from raw results such as accuracy score, P50 latency, and average latency. + + Args: + results_dir: Optional path to results directory. Defaults to src/evals/results + """ + if results_dir is None: + results_dir = get_default_results_dir() + + files = glob.glob(f"{results_dir}/raw_results_*.csv") metric_rows = [] for sampler_results_file in files: @@ -42,7 +54,7 @@ def write_metrics(): "problem_count": count_answered, }) - write_path = Path(os.getcwd(), "src/evals/results/simpleqa_results.csv") + write_path = results_dir / "simpleqa_results.csv" metric_df = pd.DataFrame(metric_rows) metric_df.to_csv(write_path, index=False) print(f"Results were written to {write_path}") @@ -82,7 +94,7 @@ def calculate_sampler_metrics(sampler_results_file: str) -> Dict[str, Any]: } -def get_results_files(results_dir: Path = None) -> List[str]: +def get_results_files(results_dir: Optional[Path] = None) -> List[str]: """ Get all raw results files from the results directory. @@ -93,6 +105,6 @@ def get_results_files(results_dir: Path = None) -> List[str]: List of file paths to raw results files """ if results_dir is None: - results_dir = Path(os.getcwd(), "src/evals/results") + results_dir = get_default_results_dir() return glob.glob(f"{results_dir}/raw_results_*.csv") diff --git a/src/evals/eval_runner.py b/src/evals/eval_runner.py index 3a9d096..4befa26 100644 --- a/src/evals/eval_runner.py +++ b/src/evals/eval_runner.py @@ -15,7 +15,7 @@ from tqdm import tqdm from evals.configs import samplers -from evals.eval_results_analyzer import write_metrics +from evals.eval_results_analyzer import write_metrics, get_default_results_dir logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") @@ -25,8 +25,11 @@ logger = logging.getLogger(__name__) -def get_sampler_filepath(sampler_name): - return Path(os.getcwd(), f"src/evals/results/raw_results_{sampler_name}.csv") +def get_sampler_filepath(sampler_name: str, results_dir: Path = None) -> Path: + """Get the filepath for a sampler's results file.""" + if results_dir is None: + results_dir = get_default_results_dir() + return results_dir / f"raw_results_{sampler_name}.csv" def get_sampler(sampler_name: str): @@ -37,17 +40,20 @@ def get_sampler(sampler_name: str): return sampler -def clean_results_folder(): - results_folder_path = Path(os.getcwd(), "src/evals/results") - if os.path.isdir(results_folder_path): - shutil.rmtree(results_folder_path) +def clean_results_folder(results_dir: Path = None): + """Clean the results folder.""" + if results_dir is None: + results_dir = get_default_results_dir() + if os.path.isdir(results_dir): + shutil.rmtree(results_dir) -def get_remaining_problems(df, sampler_name): +def get_remaining_problems(df, sampler_name: str, results_dir: Path = None): """In case of failure, only run problems from the dataset that have not been run yet""" - sampler_results_filepath = get_sampler_filepath(sampler_name) - results_folder_path = Path(os.getcwd(), "src/evals/results") - if os.path.isdir(results_folder_path) and os.path.isfile(sampler_results_filepath): + if results_dir is None: + results_dir = get_default_results_dir() + sampler_results_filepath = get_sampler_filepath(sampler_name, results_dir) + if os.path.isdir(results_dir) and os.path.isfile(sampler_results_filepath): sampler_results = pd.read_csv(sampler_results_filepath) return df[~df["problem"].isin(sampler_results["query"].tolist())] return df @@ -64,6 +70,7 @@ async def process_query_with_semaphore(semaphore, sampler, target_query, target_ async def run_evals( args: argparse.Namespace, + results_dir: Path = None, ): """ Run SimpleQA benchmark for each sampler. @@ -72,21 +79,28 @@ async def run_evals( a progress bar to track progress throughout the run. After each sampler is completed, write the results to the results folder in the format "raw_results_.csv". Once all samplers are completed, calculate metrics based on the retrieved results and create a csv called "simpleqa_results.csv". + + Args: + args: Command line arguments + results_dir: Directory to write results to. Defaults to src/evals/results """ + if results_dir is None: + results_dir = get_default_results_dir() + df = pd.read_csv(args.csv_path) if args.limit: df = df.sample(n=args.limit) if args.clean: - clean_results_folder() + clean_results_folder(results_dir) results = {} for sampler_name in args.samplers: sampler = get_sampler(sampler_name) # Only run on problems that are not already in results folder - remaining_problems = get_remaining_problems(df, sampler.sampler_name) + remaining_problems = get_remaining_problems(df, sampler.sampler_name, results_dir) if len(remaining_problems) == 0: logging.info(f"No problems remaining for sampler {sampler.sampler_name}, moving on...") - results[sampler.sampler_name] = pd.read_csv(get_sampler_filepath(sampler.sampler_name)) + results[sampler.sampler_name] = pd.read_csv(get_sampler_filepath(sampler.sampler_name, results_dir)) continue logging.info(f"Running sampler {sampler.sampler_name} on {len(remaining_problems)} problems") @@ -115,20 +129,23 @@ async def run_evals( await asyncio.gather(*[t for t in tasks if not t.done()]) # Write results of each batch so we can keep progress in case of a failure - write_raw_sampler_results(batch_results, sampler.sampler_name) + write_raw_sampler_results(batch_results, sampler.sampler_name, results_dir) -def write_raw_sampler_results(sampler_results: list[str | Any], sampler_name: str): +def write_raw_sampler_results(sampler_results: list[str | Any], sampler_name: str, results_dir: Path = None): """ Write raw results to a csv file. This takes the raw results list, not the full results dictionary in case an individual sampler fails. """ + if results_dir is None: + results_dir = get_default_results_dir() + df_sampler_results = pd.DataFrame(sampler_results) - if not os.path.isdir(Path(os.getcwd(), "src/evals/results")): - os.mkdir(Path(os.getcwd(), "src/evals/results")) + if not os.path.isdir(results_dir): + os.makedirs(results_dir, exist_ok=True) - sampler_results_filepath = get_sampler_filepath(sampler_name) + sampler_results_filepath = get_sampler_filepath(sampler_name, results_dir) if os.path.isfile(sampler_results_filepath): # If file already exists, append df_sampler_results.to_csv( diff --git a/tests/test_simpleqa.py b/tests/test_simpleqa.py index 2d79277..73cb09c 100644 --- a/tests/test_simpleqa.py +++ b/tests/test_simpleqa.py @@ -11,7 +11,7 @@ import pytest from evals.eval_runner import ( - get_search_results_and_run_evals, + run_evals, get_sampler_filepath, ) from evals.eval_results_analyzer import write_metrics @@ -20,10 +20,15 @@ dotenv.load_dotenv() +def get_test_results_dir() -> Path: + """Get the test results directory path.""" + return Path(os.getcwd(), "tests/results") + + @pytest.fixture def test_results_cleanup(): """Cleanup test results before and after test""" - results_folder_path = Path(os.getcwd(), "src/evals/results") + results_folder_path = get_test_results_dir() # Clean before test if os.path.isdir(results_folder_path): @@ -50,6 +55,7 @@ async def test_simpleqa_runner(test_results_cleanup): """ # Create test arguments num_problems = 10 + results_dir = get_test_results_dir() args = argparse.Namespace( samplers=["you_unified_search", "exa_search_with_contents", "tavily_basic", "tavily_advanced", "serp_google"], csv_path="data/simple_qa.csv", @@ -61,10 +67,10 @@ async def test_simpleqa_runner(test_results_cleanup): ) # Run the evaluation - await get_search_results_and_run_evals(args) + await run_evals(args, results_dir=results_dir) for sampler in args.samplers: # Verify results file was created - results_filepath = get_sampler_filepath(sampler) + results_filepath = get_sampler_filepath(sampler, results_dir) assert os.path.isfile(results_filepath), f"Results file not created at {results_filepath}" # Read and verify results @@ -82,14 +88,14 @@ async def test_simpleqa_runner(test_results_cleanup): assert df_results["query"].notna().all(), "Some queries are null" # Write and verify metrics - write_metrics() - metrics_path = Path(os.getcwd(), "src/evals/results/simpleqa_results.csv") + write_metrics(results_dir) + metrics_path = results_dir / "simpleqa_results.csv" assert os.path.isfile(metrics_path), "Metrics file not created" df_metrics = pd.read_csv(metrics_path) assert len(df_metrics) == len(args.samplers), f"Expected {len(args.samplers)} sampler in metrics" assert df_metrics["provider"].drop_duplicates().tolist().sort() == args.samplers.sort() - assert "average_score" in df_metrics.columns + assert "accuracy_score" in df_metrics.columns assert "p50_latency" in df_metrics.columns assert "problem_count" in df_metrics.columns @@ -103,6 +109,7 @@ async def test_simpleqa_runner_resume_capability(test_results_cleanup): from where it left off without re-processing completed queries. """ num_problems = 10 + results_dir = get_test_results_dir() # Create test arguments for first run (partial) args = argparse.Namespace( samplers=["you_unified_search"], @@ -115,9 +122,9 @@ async def test_simpleqa_runner_resume_capability(test_results_cleanup): ) # First run - await get_search_results_and_run_evals(args) + await run_evals(args, results_dir=results_dir) for sampler in args.samplers: - results_filepath = get_sampler_filepath(sampler) + results_filepath = get_sampler_filepath(sampler, results_dir) df_first = pd.read_csv(results_filepath) first_run_count = len(df_first) assert first_run_count == num_problems, f"Expected {num_problems} results from first run, got {first_run_count}" @@ -125,9 +132,9 @@ async def test_simpleqa_runner_resume_capability(test_results_cleanup): # Second run with more queries (should add new results) args.limit = 5 args.clean = False # Don't clean, resume from existing - await get_search_results_and_run_evals(args) + await run_evals(args, results_dir=results_dir) for sampler in args.samplers: - results_filepath = get_sampler_filepath(sampler) + results_filepath = get_sampler_filepath(sampler, results_dir) df_second = pd.read_csv(results_filepath) second_run_count = len(df_second) assert second_run_count == num_problems + args.limit, ( From 4e1084d541a90dd8fe42f4e537a33b2a7bb4bde9 Mon Sep 17 00:00:00 2001 From: eddyn-you Date: Tue, 10 Feb 2026 15:33:43 -0800 Subject: [PATCH 10/23] Various upgrades --- ...imple_qa.csv => simpleqa_full_dataset.csv} | 0 src/evals/configs/samplers.py | 17 +- src/evals/constants.py | 13 +- src/evals/eval_results_analyzer.py | 81 +++------- src/evals/eval_runner.py | 150 +++++++++--------- src/evals/processing/evaluate_answer.py | 5 +- src/evals/processing/synthesize_answer.py | 53 ++++--- src/evals/processing/synthesizer_utils.py | 76 +++++++++ .../samplers/applied_samplers/exa_sampler.py | 10 +- .../serp_api_google_sampler.py | 62 -------- .../applied_samplers/tavily_sampler.py | 6 +- .../applied_samplers/you_livecrawl_sampler.py | 89 +++++++++++ .../{you_sampler.py => you_search_sampler.py} | 8 +- .../samplers/base_samplers/base_sampler.py | 42 ++--- .../base_samplers/base_sdk_sampler.py | 3 +- 15 files changed, 356 insertions(+), 259 deletions(-) rename data/{simple_qa.csv => simpleqa_full_dataset.csv} (100%) create mode 100644 src/evals/processing/synthesizer_utils.py delete mode 100644 src/evals/samplers/applied_samplers/serp_api_google_sampler.py create mode 100644 src/evals/samplers/applied_samplers/you_livecrawl_sampler.py rename src/evals/samplers/applied_samplers/{you_sampler.py => you_search_sampler.py} (91%) diff --git a/data/simple_qa.csv b/data/simpleqa_full_dataset.csv similarity index 100% rename from data/simple_qa.csv rename to data/simpleqa_full_dataset.csv diff --git a/src/evals/configs/samplers.py b/src/evals/configs/samplers.py index e154610..6dba108 100644 --- a/src/evals/configs/samplers.py +++ b/src/evals/configs/samplers.py @@ -1,13 +1,18 @@ import os from evals.samplers.applied_samplers.exa_sampler import ExaSampler -from evals.samplers.applied_samplers.serp_api_google_sampler import SerpApiGoogleSampler from evals.samplers.applied_samplers.tavily_sampler import TavilySampler -from evals.samplers.applied_samplers.you_sampler import YouSampler +from evals.samplers.applied_samplers.you_livecrawl_sampler import YouLivecrawlSampler +from evals.samplers.applied_samplers.you_search_sampler import YouSearchSampler + SAMPLERS = [ - YouSampler( - sampler_name="you_unified_search", + YouLivecrawlSampler( + sampler_name="you_search_livecrawl", + api_key=os.getenv("YOU_API_KEY"), + ), + YouSearchSampler( + sampler_name="you_search", api_key=os.getenv("YOU_API_KEY"), ), ExaSampler( @@ -15,10 +20,6 @@ api_key=os.getenv("EXA_API_KEY"), custom_args={"text": True}, ), - SerpApiGoogleSampler( - sampler_name="serp_google", - api_key=os.getenv("SERP_API_KEY"), - ), TavilySampler( sampler_name="tavily_basic", api_key=os.getenv("TAVILY_API_KEY"), diff --git a/src/evals/constants.py b/src/evals/constants.py index 45273d7..1d413eb 100644 --- a/src/evals/constants.py +++ b/src/evals/constants.py @@ -1,3 +1,12 @@ # We used a weaker model for synthesis and a stronger model for grading to ensure fairness. -SYNTHESIS_MODEL = "gpt-4o-mini" -GRADER_MODEL = "gpt-4.1" +SYNTHESIS_MODEL = "gpt-5-nano" +GRADER_MODEL = "gpt-5-mini" + +# Maximum tokens available for search results (leaving room for prompt and response) +MAX_SEARCH_RESULT_TOKENS = 127750 + +SYNTHESIS_PROMPT = """ + You are an AI assistant that answers questions using search results. + Read the provided search snippets carefully and answer based only on information found in the snippets. + Keep your response clear and concise. +""" diff --git a/src/evals/eval_results_analyzer.py b/src/evals/eval_results_analyzer.py index dfd4d8a..e97f7fb 100644 --- a/src/evals/eval_results_analyzer.py +++ b/src/evals/eval_results_analyzer.py @@ -18,6 +18,22 @@ def get_default_results_dir() -> Path: return Path(os.getcwd(), "src/evals/results") +def get_results_files(results_dir: Optional[Path] = None) -> List[str]: + """ + Get all raw results files from the results directory. + + Args: + results_dir: Optional path to results directory. Defaults to src/evals/results + + Returns: + List of file paths to raw results files + """ + if results_dir is None: + results_dir = get_default_results_dir() + + return glob.glob(f"{results_dir}/dataset_*.csv") + + def write_metrics(results_dir: Optional[Path] = None): """ Calculate metrics from raw results such as accuracy score, P50 latency, and average latency. @@ -28,16 +44,17 @@ def write_metrics(results_dir: Optional[Path] = None): if results_dir is None: results_dir = get_default_results_dir() - files = glob.glob(f"{results_dir}/raw_results_*.csv") + files = get_results_files(results_dir) metric_rows = [] for sampler_results_file in files: + dataset_name = sampler_results_file.split("dataset_")[1].split("_raw_results")[0] sampler_name = sampler_results_file.split("raw_results_")[-1].split(".")[0] df_sampler_results = pd.read_csv(sampler_results_file) - successful_df = df_sampler_results[df_sampler_results["response_time_ms"] != "FAILED"] + successful_df = df_sampler_results[df_sampler_results["query"] != "FAILED"] - p50_latency = pd.to_numeric(successful_df["response_time_ms"]).median() - avg_latency = pd.to_numeric(successful_df["response_time_ms"]).mean() + avg_internal_latency = pd.to_numeric(successful_df["internal_response_time_ms"]).mean() + avg_end_to_end_latency = pd.to_numeric(successful_df["end_to_end_time_ms"]).mean() correct = len(df_sampler_results[df_sampler_results["evaluation_result"] == "is_correct"]) count_answered = len(successful_df) @@ -48,63 +65,15 @@ def write_metrics(results_dir: Optional[Path] = None): metric_rows.append({ "provider": sampler_name, + "dataset": dataset_name, "accuracy_score": accuracy_score, - "p50_latency": round(float(p50_latency), 2), - "avg_latency": round(float(avg_latency), 2), + "avg_internal_latency": round(float(avg_internal_latency), 2), + "avg_end_to_end_latency": round(float(avg_end_to_end_latency), 2), "problem_count": count_answered, }) - write_path = results_dir / "simpleqa_results.csv" + write_path = results_dir / "analyzed_results.csv" metric_df = pd.DataFrame(metric_rows) metric_df.to_csv(write_path, index=False) print(f"Results were written to {write_path}") print(metric_df) - - -def calculate_sampler_metrics(sampler_results_file: str) -> Dict[str, Any]: - """ - Calculate metrics for a single sampler's results. - - Args: - sampler_results_file: Path to the raw results CSV file - - Returns: - Dictionary containing calculated metrics - """ - sampler_name = sampler_results_file.split("raw_results_")[-1].split(".")[0] - df_sampler_results = pd.read_csv(sampler_results_file) - successful_df = df_sampler_results[df_sampler_results["response_time_ms"] != "FAILED"] - - p50_latency = pd.to_numeric(successful_df["response_time_ms"]).median() - avg_latency = pd.to_numeric(successful_df["response_time_ms"]).mean() - correct = len(df_sampler_results[df_sampler_results["evaluation_result"] == "is_correct"]) - count_answered = len(successful_df) - - if count_answered == 0: - raise ValueError(f"No successful results found for sampler {sampler_name}") - - accuracy_score = round((correct / count_answered) * 100, 2) - - return { - "provider": sampler_name, - "accuracy_score": accuracy_score, - "p50_latency": round(float(p50_latency), 2), - "avg_latency": round(float(avg_latency), 2), - "problem_count": count_answered, - } - - -def get_results_files(results_dir: Optional[Path] = None) -> List[str]: - """ - Get all raw results files from the results directory. - - Args: - results_dir: Optional path to results directory. Defaults to src/evals/results - - Returns: - List of file paths to raw results files - """ - if results_dir is None: - results_dir = get_default_results_dir() - - return glob.glob(f"{results_dir}/raw_results_*.csv") diff --git a/src/evals/eval_runner.py b/src/evals/eval_runner.py index 4befa26..cf1590f 100644 --- a/src/evals/eval_runner.py +++ b/src/evals/eval_runner.py @@ -1,6 +1,6 @@ """ -The main file for running the SimpleQA eval. Use this file to run the SimpleQA eval against your selected samplers. -Available samplers can be found in get_samplers() or in the `sampler/` folder +The main file for running evals. Use this file to run the eval against your selected samplers and datasets. +Available samplers can be found using --help or in configs/samplers """ import argparse @@ -19,24 +19,26 @@ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") -# Mute noisy HTTP client logs +# Mute noisy client logs logging.getLogger("httpx").setLevel(logging.WARNING) logging.getLogger("httpcore").setLevel(logging.WARNING) +logging.getLogger("google_genai.models").setLevel(logging.ERROR) logger = logging.getLogger(__name__) -def get_sampler_filepath(sampler_name: str, results_dir: Path = None) -> Path: +def get_sampler_filepath(sampler_name: str, dataset_name: str, results_dir: Path = None) -> Path: """Get the filepath for a sampler's results file.""" if results_dir is None: results_dir = get_default_results_dir() - return results_dir / f"raw_results_{sampler_name}.csv" + + return results_dir / f"dataset_{dataset_name}_raw_results_{sampler_name}.csv" def get_sampler(sampler_name: str): """Initialize requested samplers""" sampler = next((sampler for sampler in samplers.SAMPLERS if sampler.sampler_name == sampler_name), None) if sampler is None: - raise ValueError(f"Sampler '{sampler_name}' not found") + raise ValueError(f"Sampler '{sampler_name}' not found. Available samplers: {[sampler.sampler_name for sampler in samplers.SAMPLERS]}") return sampler @@ -48,11 +50,11 @@ def clean_results_folder(results_dir: Path = None): shutil.rmtree(results_dir) -def get_remaining_problems(df, sampler_name: str, results_dir: Path = None): +def get_remaining_problems(df, sampler_name: str, dataset_name: str, results_dir: Path = None): """In case of failure, only run problems from the dataset that have not been run yet""" if results_dir is None: results_dir = get_default_results_dir() - sampler_results_filepath = get_sampler_filepath(sampler_name, results_dir) + sampler_results_filepath = get_sampler_filepath(sampler_name, dataset_name, results_dir) if os.path.isdir(results_dir) and os.path.isfile(sampler_results_filepath): sampler_results = pd.read_csv(sampler_results_filepath) return df[~df["problem"].isin(sampler_results["query"].tolist())] @@ -68,17 +70,23 @@ async def process_query_with_semaphore(semaphore, sampler, target_query, target_ return e +def get_dataset(dataset_name): + if dataset_name == "simpleqa": + return pd.read_csv("data/simpleqa_full_dataset.csv") + # TODO: Add frames, route to deep search + else: + raise ValueError(f"Dataset '{dataset_name}' not recognized, run python src/evals/eval_runner.py --help for available datasets") + + async def run_evals( args: argparse.Namespace, results_dir: Path = None, ): """ - Run SimpleQA benchmark for each sampler. + Run benchmark for each sampler. - Run the selected number of SimpleQA queries against each requested sampler. Creates tasks in batches, and provides - a progress bar to track progress throughout the run. After each sampler is completed, write the results to the - results folder in the format "raw_results_.csv". Once all samplers are completed, calculate metrics based - on the retrieved results and create a csv called "simpleqa_results.csv". + Run the selected number of queries against each requested sampler. Creates tasks in batches, and provides + a progress bar to track progress throughout the run. Args: args: Command line arguments @@ -87,52 +95,55 @@ async def run_evals( if results_dir is None: results_dir = get_default_results_dir() - df = pd.read_csv(args.csv_path) - if args.limit: - df = df.sample(n=args.limit) if args.clean: clean_results_folder(results_dir) results = {} - for sampler_name in args.samplers: - sampler = get_sampler(sampler_name) - # Only run on problems that are not already in results folder - remaining_problems = get_remaining_problems(df, sampler.sampler_name, results_dir) - if len(remaining_problems) == 0: - logging.info(f"No problems remaining for sampler {sampler.sampler_name}, moving on...") - results[sampler.sampler_name] = pd.read_csv(get_sampler_filepath(sampler.sampler_name, results_dir)) - continue - - logging.info(f"Running sampler {sampler.sampler_name} on {len(remaining_problems)} problems") - df = remaining_problems - - # Run problems in batches - with tqdm( - total=len(df), - desc=f"Running sampler: {sampler.sampler_name}", - unit="queries", - ) as pbar: - semaphore = asyncio.Semaphore(args.max_concurrent_tasks) - - for i in range(0, len(df), args.batch_size): - batch_df = df[i : i + args.batch_size] - - tasks = [] - for _, row in batch_df.iterrows(): - query = row["problem"] - ground_truth = row["answer"] - task = asyncio.create_task(process_query_with_semaphore(semaphore, sampler, query, ground_truth)) - tasks.append(task) - - batch_results = await asyncio.gather(*tasks, return_exceptions=True) - pbar.update(len(batch_df)) - - await asyncio.gather(*[t for t in tasks if not t.done()]) - # Write results of each batch so we can keep progress in case of a failure - write_raw_sampler_results(batch_results, sampler.sampler_name, results_dir) - - -def write_raw_sampler_results(sampler_results: list[str | Any], sampler_name: str, results_dir: Path = None): + for dataset_name in args.datasets: + df = get_dataset(dataset_name) + if args.limit: + df = df.sample(n=args.limit) + for sampler_name in args.samplers: + sampler = get_sampler(sampler_name) + # Only run on problems that are not already in results folder + remaining_problems = get_remaining_problems( + df=df, sampler_name=sampler.sampler_name, dataset_name=dataset_name, results_dir=results_dir + ) + if len(remaining_problems) == 0: + logging.info(f"No problems remaining for sampler {sampler.sampler_name}, moving on...") + results[sampler.sampler_name] = pd.read_csv(get_sampler_filepath(sampler.sampler_name, dataset_name, results_dir)) + continue + + logging.info(f"Running sampler {sampler.sampler_name} on dataset {dataset_name} on {len(remaining_problems)} problems") + df = remaining_problems + + # Run problems in batches + with tqdm( + total=len(df), + desc=f"Running sampler: {sampler.sampler_name} for dataset {dataset_name}", + unit="queries", + ) as pbar: + semaphore = asyncio.Semaphore(args.max_concurrent_tasks) + + for i in range(0, len(df), args.batch_size): + batch_df = df[i : i + args.batch_size] + + tasks = [] + for _, row in batch_df.iterrows(): + query = row["problem"] + ground_truth = row["answer"] + task = asyncio.create_task(process_query_with_semaphore(semaphore, sampler, query, ground_truth)) + tasks.append(task) + + batch_results = await asyncio.gather(*tasks, return_exceptions=True) + pbar.update(len(batch_df)) + + await asyncio.gather(*[t for t in tasks if not t.done()]) + # Write results of each batch so we can keep progress in case of a failure + write_raw_sampler_results(batch_results, sampler.sampler_name, dataset_name, results_dir) + + +def write_raw_sampler_results(sampler_results: list[str | Any], sampler_name: str, dataset_name: str, results_dir: Path = None): """ Write raw results to a csv file. @@ -145,7 +156,7 @@ def write_raw_sampler_results(sampler_results: list[str | Any], sampler_name: st if not os.path.isdir(results_dir): os.makedirs(results_dir, exist_ok=True) - sampler_results_filepath = get_sampler_filepath(sampler_name, results_dir) + sampler_results_filepath = get_sampler_filepath(sampler_name, dataset_name, results_dir) if os.path.isfile(sampler_results_filepath): # If file already exists, append df_sampler_results.to_csv( @@ -162,8 +173,9 @@ def write_raw_sampler_results(sampler_results: list[str | Any], sampler_name: st async def main(): - available_samplers = ["you_unified_search", "exa", "exa_fast", "google", "tavily"] - parser = argparse.ArgumentParser(description="Run SimpleQA eval") + available_samplers = ["you_search_livecrawl", "you_search", "exa_search_with_contents", "google_vertex", "tavily_basic", "tavily_advanced"] + available_datasets = ["simpleqa", "xfreshqa", "finsearch"] + parser = argparse.ArgumentParser(description="Run an eval") parser.add_argument( "--samplers", default=available_samplers, @@ -171,6 +183,13 @@ async def main(): nargs="+", help=f"List of samplers to run. Choose from {available_samplers}", ) + parser.add_argument( + "--datasets", + type=str, + nargs="+", + required=True, + help=f"The dataset(s) to eval against (can specify multiple). Select from {available_datasets}", + ) parser.add_argument( "--limit", default=None, @@ -179,7 +198,7 @@ async def main(): ) parser.add_argument( "--batch-size", - default=250, + default=50, type=int, help="Used to define the batch size used in multiprocessing. Also determines how many problems will be run before appending to corresponding results file", ) @@ -189,18 +208,6 @@ async def main(): type=int, help="Used to define the max count of concurrent tasks to be used in multiprocessing", ) - parser.add_argument( - "--num-results", - default=5, - type=int, - help="Used to define the number of results returned by each provider", - ) - parser.add_argument( - "--csv-path", - default="data/simple_qa.csv", - type=str, - help="Used to define the filepath of the test set", - ) parser.add_argument( "--clean", default=False, @@ -211,6 +218,7 @@ async def main(): args = parser.parse_args() await run_evals(args) + write_metrics() diff --git a/src/evals/processing/evaluate_answer.py b/src/evals/processing/evaluate_answer.py index fb5cd07..59d1f3b 100644 --- a/src/evals/processing/evaluate_answer.py +++ b/src/evals/processing/evaluate_answer.py @@ -116,8 +116,7 @@ async def call_openai_async(self, client: httpx.AsyncClient, prompt: str) -> str payload = { "model": self.model, "messages": [{"role": "user", "content": prompt}], - "temperature": 0.0, - "max_tokens": 1024, + "max_completion_tokens": 1024, } response = await client.post( @@ -154,7 +153,7 @@ async def evaluate_single(self, question: str, target: str, predicted_answer: st predicted_answer=predicted_answer, ) - async with httpx.AsyncClient(timeout=30.0) as client: + async with httpx.AsyncClient(timeout=60.0) as client: grading_response = await self.call_openai_async(client, grader_prompt) # Parse the grade diff --git a/src/evals/processing/synthesize_answer.py b/src/evals/processing/synthesize_answer.py index 3ddc29c..c33d4c6 100644 --- a/src/evals/processing/synthesize_answer.py +++ b/src/evals/processing/synthesize_answer.py @@ -3,19 +3,17 @@ search results into a single answer to be compared against the ground truth. Using the same prompt and model for all samplers ensures an equal playing field and an apples to apples comparison across all samplers. -To view or edit the model used for synthesis, see evals.simpleqa.constants +To view or edit the model used for synthesis, see evals.constants """ -import asyncio from dataclasses import dataclass import logging import os +import requests +import time +import traceback from typing import List, Dict, Any -import httpx - -from evals import constants - @dataclass class SynthesizeAnswerResponse: @@ -25,9 +23,11 @@ class SynthesizeAnswerResponse: class SynthesizeAnswer: - def __init__(self, max_retries: int = 3): + def __init__(self, synthesis_prompt: str, synthesis_model: str, max_retries: int = 3): self.logger = logging.getLogger(self.__class__.__name__) + self.synthesis_prompt = synthesis_prompt + self.synthesis_model = synthesis_model self.max_retries = max_retries self.api_key = os.getenv("OPENAI_API_KEY") self.headers = { @@ -35,53 +35,58 @@ def __init__(self, max_retries: int = 3): "Content-Type": "application/json", } - async def process_single(self, session: httpx.AsyncClient, query: str, snippets: str) -> SynthesizeAnswerResponse: - """Synthesize a single response asynchronously""" + def process_single(self, query: str, results: str) -> SynthesizeAnswerResponse: + """Synthesize a single response""" for trial in range(self.max_retries + 1): try: - synthesis_prompt = """ - You are an AI assistant that answers questions using search results. - Read the provided search snippets carefully and answer based only on information found in the snippets. - Keep your response clear and concise. - """ - payload = { - "model": constants.SYNTHESIS_MODEL, + "model": self.synthesis_model, "messages": [ - {"role": "system", "content": synthesis_prompt}, + {"role": "system", "content": self.synthesis_prompt}, { "role": "user", - "content": f"Query: {query}\n\nSearch results: {snippets}", + "content": f"Query: {query}\n\nSearch results: {results}", }, ], } - response = await session.post( + import time + + start_time = time.time() + response = requests.post( "https://api.openai.com/v1/chat/completions", headers=self.headers, json=payload, ) + if response.status_code == 200: result = response.json() return SynthesizeAnswerResponse( response_text=result["choices"][0]["message"]["content"], - actual_queried_message_list=[snippets], + actual_queried_message_list=[results], response_metadata={ - "model": constants.SYNTHESIS_MODEL, + "model": self.synthesis_model, "trial": trial, }, ) + if response.status_code == 402: + print("Rate limit hit") + # TODO: Find a clever way to cut this eval short, but not stop a long chain of evals + quit() else: error_text = response.text + print(f"ERROR: Failed synthesis after {self.max_retries} retries") + traceback.print_exc() raise Exception(f"API error {response.status_code}: {error_text}") except Exception as e: if trial >= self.max_retries: - self.logger.error(f"Failed after {self.max_retries} retries: {e}") + print(f"ERROR: Failed synthesis after {self.max_retries} retries") + traceback.print_exc() raise backoff = 2**trial - self.logger.warning(f"Retry {trial + 1} in {backoff}s: {e}") - await asyncio.sleep(backoff) + print(f"WARNING: Retry {trial + 1} in {backoff}s: {e}") + time.sleep(backoff) raise ValueError("Could not synthesize answer") diff --git a/src/evals/processing/synthesizer_utils.py b/src/evals/processing/synthesizer_utils.py new file mode 100644 index 0000000..dba0e38 --- /dev/null +++ b/src/evals/processing/synthesizer_utils.py @@ -0,0 +1,76 @@ +import math +from typing import List + +import tiktoken + +from evals.processing.synthesize_answer import SynthesizeAnswer +from evals.constants import ( + SYNTHESIS_PROMPT, + MAX_SEARCH_RESULT_TOKENS, + SYNTHESIS_MODEL, +) + + +def trim_results_to_model_limit( + formatted_results: list[str], + synthesis_model: str, +) -> list[str]: + """ + Trim search results to fit within the synthesis model's token limit. + + Args: + formatted_results: List of strings, each representing a search result + synthesis_model: The model name used for synthesis (e.g., "gpt-4o-mini") + + Returns: + List of trimmed result strings that fit within token limits + """ + + # Initialize tokenizer + enc = tiktoken.encoding_for_model(synthesis_model) + + # Sort results by length (token count) - shortest first + results_with_tokens = [(result, enc.encode(result)) for result in formatted_results] + results_with_tokens.sort(key=lambda x: len(x[1])) + + # Track remaining tokens available + remaining_search_result_tokens = MAX_SEARCH_RESULT_TOKENS + + # Trim each result to fit within token limit + trimmed_results = [] + for i, (result, tokens) in enumerate(results_with_tokens): + # Calculate max tokens per result based on remaining tokens and remaining results + remaining_results = len(results_with_tokens) - i + max_tokens_per_result = math.floor(remaining_search_result_tokens / remaining_results) + + # If within limit, keep as is; otherwise truncate to max_tokens_per_result + if len(tokens) <= max_tokens_per_result: + trimmed_results.append(result) + remaining_search_result_tokens -= len(tokens) + else: + # Truncate token list and decode back to text + truncated_tokens = tokens[:max_tokens_per_result] + trimmed_result = enc.decode(truncated_tokens) + trimmed_results.append(trimmed_result) + remaining_search_result_tokens -= len(truncated_tokens) + + return trimmed_results + + +def synthesize_response( + query: str, + formatted_results: list[str], + synthesis_model: str = SYNTHESIS_MODEL, +) -> str: + """ + Private method for synthesizing responses from search results using OpenAI + """ + # Trim results to fit within model token limits + trimmed_results = trim_results_to_model_limit(formatted_results, synthesis_model) + + # Concatenate results with separator + concatenated_results = "\n---\n".join(trimmed_results) + + answer_synthesizer = SynthesizeAnswer(SYNTHESIS_PROMPT, max_retries=3, synthesis_model=synthesis_model) + result = answer_synthesizer.process_single(query, concatenated_results) + return result.response_text if result else f"Synthesis failed for: {query}" diff --git a/src/evals/samplers/applied_samplers/exa_sampler.py b/src/evals/samplers/applied_samplers/exa_sampler.py index f74aac5..2c37799 100644 --- a/src/evals/samplers/applied_samplers/exa_sampler.py +++ b/src/evals/samplers/applied_samplers/exa_sampler.py @@ -1,6 +1,6 @@ """Run evals using the Exa SDK""" -from typing import Any, Dict +from typing import Any, Dict, List from exa_py import Exa @@ -33,14 +33,14 @@ def _initialize_client(self): def _get_search_results_impl(self, query: str) -> Any: if self.custom_args and self.custom_args["text"]: - return self.client.search(query=query, num_results=5, contents={"text": True}) + return self.client.search(query=query, num_results=10, contents={"text": True}) raise ValueError("Unknown configuration for Exa") - def format_results(self, results: Any) -> str: + def format_results(self, results: Any) -> list[str]: formatted_results = [] - raw_results = getattr(results, "results", None) + raw_results = getattr(results, "results", None) for result in raw_results: title = getattr(result, "title", "") url = getattr(result, "url", "") @@ -48,4 +48,4 @@ def format_results(self, results: Any) -> str: if text: formatted_results.append(f'[{title}]({url})\ntext: "{text}"\n') - return "\n---\n".join(formatted_results) + return formatted_results diff --git a/src/evals/samplers/applied_samplers/serp_api_google_sampler.py b/src/evals/samplers/applied_samplers/serp_api_google_sampler.py deleted file mode 100644 index 1864869..0000000 --- a/src/evals/samplers/applied_samplers/serp_api_google_sampler.py +++ /dev/null @@ -1,62 +0,0 @@ -import os -from typing import Any, Dict - -from evals.samplers.base_samplers.base_api_sampler import BaseAPISampler - - -class SerpApiGoogleSampler(BaseAPISampler): - def __init__( - self, - sampler_name: str, - api_key: str = None, - timeout: float = 60.0, - max_retries: int = 3, - max_concurrency: int = 10, - needs_synthesis: bool = True, - custom_args: Dict[str, Any] | None = None, - ): - super().__init__( - sampler_name=sampler_name, - api_key=api_key, - max_retries=max_retries, - timeout=timeout, - max_concurrency=max_concurrency, - needs_synthesis=needs_synthesis, - custom_args=custom_args, - ) - - @staticmethod - def _get_base_url(): - return "https://serpapi.com" - - @staticmethod - def _get_endpoint() -> str: - return "/search" - - @staticmethod - def _get_method() -> str: - return "GET" - - def _get_headers(self) -> Dict[str, str]: - return {} - - def _get_payload(self, query: str, custom_args: Dict[str, Any] | None = None) -> Dict[str, Any]: - return { - "q": query, - "engine": "google", - "num": 5, - "api_key": self.api_key, - } - - def format_results(self, results: Any) -> str: - formatted_results = [] - if "organic_results" in results: - for result in results["organic_results"]: - if isinstance(result, dict): - title = result.get("title", "") - link = result.get("link", "") - snippet = result.get("snippet", "") - if snippet and isinstance(snippet, list): - snippet = " ".join(snippet) - formatted_results.append(f"[{title}]({link})\n snippet: {snippet}") - return "\n---\n".join(formatted_results) diff --git a/src/evals/samplers/applied_samplers/tavily_sampler.py b/src/evals/samplers/applied_samplers/tavily_sampler.py index b7a212d..2fcf644 100644 --- a/src/evals/samplers/applied_samplers/tavily_sampler.py +++ b/src/evals/samplers/applied_samplers/tavily_sampler.py @@ -35,12 +35,12 @@ def _get_search_results_impl(self, query: str) -> Any: if self.custom_args and self.custom_args["search_depth"]: return self.client.search( query=query, - max_results=5, + max_results=10, search_depth=self.custom_args["search_depth"], ) raise ValueError("Unknown configuration for Tavily") - def format_results(self, results: Any) -> str: + def format_results(self, results: Any) -> list[str]: formatted_results = [] raw_results = results["results"] @@ -52,4 +52,4 @@ def format_results(self, results: Any) -> str: if content: formatted_results.append(f"[{title}]({url})\ncontent: {content}\n") - return "\n---\n".join(formatted_results) + return formatted_results diff --git a/src/evals/samplers/applied_samplers/you_livecrawl_sampler.py b/src/evals/samplers/applied_samplers/you_livecrawl_sampler.py new file mode 100644 index 0000000..46f5d68 --- /dev/null +++ b/src/evals/samplers/applied_samplers/you_livecrawl_sampler.py @@ -0,0 +1,89 @@ +import random +from typing import Any, Dict, List + +from evals.samplers.base_samplers.base_api_sampler import ( + BaseAPISampler, +) + + +class YouLivecrawlSampler(BaseAPISampler): + def __init__( + self, + sampler_name: str, + api_key: str = None, + timeout: float = 60.0, + max_retries: int = 3, + max_concurrency: int = 10, + needs_synthesis: bool = True, + custom_args: Dict[str, Any] | None = None, + ): + if api_key is None: + raise ValueError( + f"API key not provided for sampler {sampler_name}. Ensure .env file is configured and contains necessary API keys" + ) + + super().__init__( + sampler_name=sampler_name, + api_key=api_key, + max_retries=max_retries, + timeout=timeout, + max_concurrency=max_concurrency, + needs_synthesis=needs_synthesis, + custom_args=custom_args, + ) + + @staticmethod + def _get_base_url(): + return "https://ydc-index.io" + + def _get_headers(self) -> Dict[str, str]: + return {"x-api-key": self.api_key} + + @staticmethod + def _get_method() -> str: + return "GET" + + @staticmethod + def _get_endpoint() -> str: + return "/v1/search/" + + def _get_payload(self, query: str) -> Dict[str, Any]: + return { + "query": query, + "count": 10, + "livecrawl": "all", + "livecrawl_formats": "markdown", + # These parameters are in beta, and are designed to maximize performance + "num_bytes": 500000 + random.randint(1, 100), + "crawl_timeout": 1, + } + + def format_results(self, results: Any) -> list[str]: + formatted_results = [] + if "results" in results: + if "web" not in results["results"]: + return [""] + + if "news" in results["results"]: + all_results = results["results"]["news"] + results["results"]["web"] + else: + all_results = results["results"]["web"] + + for result in all_results: + title = result.get("title", "") + url = result.get("url", "") + contents = result.get("contents", "") + + if "markdown" in contents: + contents = contents["markdown"] + formatted_result = f"[{title}]({url})\n{contents}" + formatted_results.append(formatted_result) + else: + description = result.get("description", "") + snippet = result.get("snippets", "") + if snippet and isinstance(snippet, list): + snippet = " ".join(snippet) + formatted_result = f"[{title}]({url})\n snippet: {snippet}\n description: {description}" + formatted_results.append(formatted_result) + + return formatted_results diff --git a/src/evals/samplers/applied_samplers/you_sampler.py b/src/evals/samplers/applied_samplers/you_search_sampler.py similarity index 91% rename from src/evals/samplers/applied_samplers/you_sampler.py rename to src/evals/samplers/applied_samplers/you_search_sampler.py index c4573c7..9795bc0 100644 --- a/src/evals/samplers/applied_samplers/you_sampler.py +++ b/src/evals/samplers/applied_samplers/you_search_sampler.py @@ -7,7 +7,7 @@ from evals.samplers.base_samplers.base_sdk_sampler import BaseSDKSampler -class YouSampler(BaseSDKSampler): +class YouSearchSampler(BaseSDKSampler): def __init__( self, sampler_name: str, @@ -34,10 +34,10 @@ def _initialize_client(self): def _get_search_results_impl(self, query: str) -> Any: return self.client.search.unified( query=query, - count=5, + count=10, ) - def format_results(self, results: Any) -> str: + def format_results(self, results: Any) -> list[str]: formatted_results = [] raw_results = [] if results.results and results.results.web: @@ -53,4 +53,4 @@ def format_results(self, results: Any) -> str: if snippets and isinstance(snippets, list): snippets = " ".join(snippets) formatted_results.append(f"[{title}]({url})\n snippets: {snippets}\n description: {description}") - return "\n---\n".join(formatted_results) + return formatted_results diff --git a/src/evals/samplers/base_samplers/base_sampler.py b/src/evals/samplers/base_samplers/base_sampler.py index e440137..7745991 100644 --- a/src/evals/samplers/base_samplers/base_sampler.py +++ b/src/evals/samplers/base_samplers/base_sampler.py @@ -4,9 +4,7 @@ import time from typing import Any, Dict -import httpx - -from evals.processing.synthesize_answer import SynthesizeAnswer +from evals.processing import synthesizer_utils class BaseSampler(ABC): @@ -50,7 +48,7 @@ def get_search_results(self, query: str) -> Any: pass @abstractmethod - def format_results(self, results: Any) -> str: + def format_results(self, results: Any) -> list[str]: """ Format search results. @@ -73,15 +71,6 @@ def __extract_query_from_messages__(message_list: list[dict]) -> str: return last_message["content"] return str(message_list) - async def __synthesize_response(self, query: str, formatted_context: str) -> str: - """ - Private method for synthesizing responses from search results using OpenAI - """ - answer_synthesizer = SynthesizeAnswer(max_retries=3) - async with httpx.AsyncClient(timeout=30.0) as client: - result = await answer_synthesizer.process_single(client, query, formatted_context) - return result.response_text if result else f"Synthesis failed for: {query}" - @staticmethod async def __evaluate_response(query: str, ground_truth: str, generated_answer: str) -> Dict[str, Any]: """Evaluate the generated response against ground truth""" @@ -92,21 +81,32 @@ async def __evaluate_response(query: str, ground_truth: str, generated_answer: s async def __call__(self, query_input, ground_truth: str = "", overwrite: bool = False) -> Dict[str, Any]: """Main execution pipeline""" - + internal_response_time_ms = None + end_to_end_time_ms = None if isinstance(query_input, list): query = self.__extract_query_from_messages__(query_input) else: query = str(query_input) + end_to_end_start_time = time.time() # Get raw results try: # Run synchronous SDK call in thread pool - start_time = time.time() raw_results = await asyncio.to_thread(self.get_search_results, query) - response_time_no_retries = (time.time() - start_time) * 1000 # Convert to ms + if self.sampler_name == 'you_search_livecrawl': + internal_response_time_ms = round(raw_results["metadata"]["latency"] * 1000, 2) # Convert to ms + elif self.sampler_name == 'you_search': + internal_response_time_ms = round(raw_results.metadata.latency * 1000, 2) # Convert to ms + elif 'tavily' in self.sampler_name: + internal_response_time_ms = round(raw_results['response_time'] * 1000, 2) # Convert to ms + elif 'exa' in self.sampler_name: + # Exa does not return internal run time, best we can do is API call time + internal_response_time_ms = round((time.time() - end_to_end_start_time) * 1000, 2) # Convert to ms + formatted_results = self.format_results(raw_results) except Exception as e: - raw_results, response_time_no_retries, formatted_results = ( + raw_results, internal_response_time_ms, end_to_end_time_ms, formatted_results = ( + "FAILED", "FAILED", "FAILED", "FAILED", @@ -116,9 +116,12 @@ async def __call__(self, query_input, ground_truth: str = "", overwrite: bool = # Synthesize raw results try: if self.needs_synthesis: - generated_answer = await self.__synthesize_response(query, formatted_results) + generated_answer = synthesizer_utils.synthesize_response(query, formatted_results) else: generated_answer = formatted_results # Already synthesized by API + + end_to_end_end_time = time.time() + end_to_end_time_ms = round((end_to_end_end_time - end_to_end_start_time) * 1000, 2) except Exception as e: generated_answer = "FAILED" logging.exception(e) @@ -137,7 +140,8 @@ async def __call__(self, query_input, ground_truth: str = "", overwrite: bool = # Format result result = { "query": query, - "response_time_ms": response_time_no_retries, + "internal_response_time_ms": internal_response_time_ms, + "end_to_end_time_ms": end_to_end_time_ms, "evaluation_result": evaluation_result, "generated_answer": generated_answer, "ground_truth": ground_truth, diff --git a/src/evals/samplers/base_samplers/base_sdk_sampler.py b/src/evals/samplers/base_samplers/base_sdk_sampler.py index a116f84..bba35a6 100644 --- a/src/evals/samplers/base_samplers/base_sdk_sampler.py +++ b/src/evals/samplers/base_samplers/base_sdk_sampler.py @@ -2,10 +2,9 @@ import asyncio import logging import sys -import time import traceback from concurrent.futures import ThreadPoolExecutor -from typing import Any, Dict +from typing import Any from evals.samplers.base_samplers.base_sampler import BaseSampler From 7163bf497b83e846580a5f746a671c6c943be2ab Mon Sep 17 00:00:00 2001 From: eddyn-you Date: Tue, 10 Feb 2026 15:49:49 -0800 Subject: [PATCH 11/23] Add frames --- data/frames_full_dataset.csv | 825 ++++++++++++++++++ src/evals/constants.py | 126 ++- src/evals/eval_runner.py | 19 +- src/evals/processing/evaluate_answer.py | 117 +-- .../samplers/base_samplers/base_sampler.py | 14 +- 5 files changed, 1005 insertions(+), 96 deletions(-) create mode 100644 data/frames_full_dataset.csv diff --git a/data/frames_full_dataset.csv b/data/frames_full_dataset.csv new file mode 100644 index 0000000..27d6028 --- /dev/null +++ b/data/frames_full_dataset.csv @@ -0,0 +1,825 @@ +,problem,answer,wikipedia_link_1,wikipedia_link_2,wikipedia_link_3,wikipedia_link_4,wikipedia_link_5,wikipedia_link_6,wikipedia_link_7,wikipedia_link_8,wikipedia_link_9,wikipedia_link_10,wikipedia_link_11+,reasoning_types,wiki_links +0,"If my future wife has the same first name as the 15th first lady of the United States' mother and her surname is the same as the second assassinated president's mother's maiden name, what is my future wife's name?",Jane Ballou,https://en.wikipedia.org/wiki/President_of_the_United_States,https://en.wikipedia.org/wiki/James_Buchanan,https://en.wikipedia.org/wiki/Harriet_Lane,https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States_who_died_in_office,https://en.wikipedia.org/wiki/James_A._Garfield,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/President_of_the_United_States', 'https://en.wikipedia.org/wiki/James_Buchanan', 'https://en.wikipedia.org/wiki/Harriet_Lane', 'https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States_who_died_in_office', 'https://en.wikipedia.org/wiki/James_A._Garfield']" +1,"Imagine there is a building called Bronte tower whose height in feet is the same number as the dewey decimal classification for the Charlotte Bronte book that was published in 1847. Where would this building rank among tallest buildings in New York City, as of August 2024?",37th,https://en.wikipedia.org/wiki/Charlotte_Bront%C3%AB,https://en.wikipedia.org/wiki/Jane_Eyre,https://en.wikipedia.org/wiki/List_of_tallest_buildings_in_New_York_City,,,,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Charlotte_Bront%C3%AB', 'https://en.wikipedia.org/wiki/Jane_Eyre', 'https://en.wikipedia.org/wiki/List_of_tallest_buildings_in_New_York_City']" +2,How many years earlier would Punxsutawney Phil have to be canonically alive to have made a Groundhog Day prediction in the same state as the US capitol?,87,https://en.wikipedia.org/wiki/Punxsutawney_Phil,https://en.wikipedia.org/wiki/United_States_Capitol,,,,,,,,,,Numerical reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Punxsutawney_Phil', 'https://en.wikipedia.org/wiki/United_States_Capitol']" +3,"As of August 1, 2024, which country were holders of the FIFA World Cup the last time the UEFA Champions League was won by a club from London?",France,https://en.wikipedia.org/wiki/FIFA_World_Cup,https://en.wikipedia.org/wiki/London,https://en.wikipedia.org/wiki/UEFA_Champions_League,,,,,,,,,Tabular reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/FIFA_World_Cup', 'https://en.wikipedia.org/wiki/London', 'https://en.wikipedia.org/wiki/UEFA_Champions_League']" +4,What is the name of the vocalist from the first band to make it in the top 200 under the record label that produced the third studio album for Dismal Euphony?,Jens Kidman,https://en.wikipedia.org/wiki/Dismal_Euphony,https://en.wikipedia.org/wiki/All_Little_Devils,https://en.wikipedia.org/wiki/Nuclear_Blast,https://en.wikipedia.org/wiki/Meshuggah,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Dismal_Euphony', 'https://en.wikipedia.org/wiki/All_Little_Devils', 'https://en.wikipedia.org/wiki/Nuclear_Blast', 'https://en.wikipedia.org/wiki/Meshuggah']" +5,"According to the 2000 United States census, what was the 2000 population of the birth city of the only 21st-century mayor of Austin, Texas who also served as mayor in the 1990s? Round your answer to the nearest thousand.",506000,https://en.wikipedia.org/wiki/Mayor_of_Austin,https://en.wikipedia.org/wiki/Kirk_Watson,https://en.wikipedia.org/wiki/Oklahoma_City,,,,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Mayor_of_Austin', 'https://en.wikipedia.org/wiki/Kirk_Watson', 'https://en.wikipedia.org/wiki/Oklahoma_City']" +6,I have an element in mind and would like you to identify the person it was named after. Here's a clue: The element's atomic number is 9 higher than that of an element discovered by the scientist who discovered Zirconium in the same year.,Mendelevium is named after Dmitri Mendeleev.,https://en.wikipedia.org/wiki/Zirconium,https://en.wikipedia.org/wiki/Martin_Heinrich_Klaproth,https://en.wikipedia.org/wiki/Uranium,https://en.wikipedia.org/wiki/Mendelevium,https://en.wikipedia.org/wiki/Periodic_table,,,,,,,Numerical reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Zirconium', 'https://en.wikipedia.org/wiki/Martin_Heinrich_Klaproth', 'https://en.wikipedia.org/wiki/Uranium', 'https://en.wikipedia.org/wiki/Mendelevium', 'https://en.wikipedia.org/wiki/Periodic_table']" +7,"As of Aug 3, 2024, the artist who released the album ""Father of Asahd"" went to the same high school as an Olympic diver. How many Olympic teams did this diver participate on?",2,https://en.wikipedia.org/wiki/Father_of_Asahd,https://en.wikipedia.org/wiki/DJ_Khaled,https://en.wikipedia.org/wiki/Dr._Phillips_High_School,https://en.wikipedia.org/wiki/Mark_Ruiz,,,,,,,,Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Father_of_Asahd', 'https://en.wikipedia.org/wiki/DJ_Khaled', 'https://en.wikipedia.org/wiki/Dr._Phillips_High_School', 'https://en.wikipedia.org/wiki/Mark_Ruiz']" +8,A general motors vehicle is named after the largest ward in the country of Monaco. How many people had walked on the moon as of the first model year of the vehicle? Note: the model year is not the same as the year the model was first produced.,4,https://en.wikipedia.org/wiki/Monaco,https://en.wikipedia.org/wiki/List_of_Chevrolet_vehicles,https://en.wikipedia.org/wiki/Chevrolet_Monte_Carlo,https://en.wikipedia.org/wiki/Moon,https://en.wikipedia.org/wiki/Apollo_program,,,,,,https://en.wikipedia.org/wiki/List_of_Apollo_missions,Tabular reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Monaco', 'https://en.wikipedia.org/wiki/List_of_Chevrolet_vehicles', 'https://en.wikipedia.org/wiki/Chevrolet_Monte_Carlo', 'https://en.wikipedia.org/wiki/Moon', 'https://en.wikipedia.org/wiki/Apollo_program', 'https://en.wikipedia.org/wiki/List_of_Apollo_missions']" +9,"The Pope born Pietro Barbo ended a long-running war two years after his papacy began, which famous conflict, immortalized in tapestry took place 400 years earlier?",The Battle of Hastings.,https://en.wikipedia.org/wiki/Pope_Paul_II,https://en.wikipedia.org/wiki/Thirteen_Years%27_War_(1454%E2%80%931466),https://en.wikipedia.org/wiki/Bayeux_Tapestry,https://en.wikipedia.org/wiki/Battle_of_Hastings,,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/Pope_Paul_II', 'https://en.wikipedia.org/wiki/Thirteen_Years%27_War_(1454%E2%80%931466)', 'https://en.wikipedia.org/wiki/Bayeux_Tapestry', 'https://en.wikipedia.org/wiki/Battle_of_Hastings']" +10,"An Australian artist, born the same year as artist Janet Cumbrae Stewart and fellow member of the Melbourne Society of Women Painters and Sculptors, had her painting featured on the cover of Women's World Magazine in 1923. What is the name of the painting?",Reve d'Or,https://en.wikipedia.org/wiki/Janet_Cumbrae_Stewart,https://en.wikipedia.org/wiki/Melbourne_Society_of_Women_Painters_and_Sculptors,https://en.wikipedia.org/wiki/Dora_Wilson,,,,,,,,,Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Janet_Cumbrae_Stewart', 'https://en.wikipedia.org/wiki/Melbourne_Society_of_Women_Painters_and_Sculptors', 'https://en.wikipedia.org/wiki/Dora_Wilson']" +11,"As of July 1, 2024, what is the parent company of the current record label of the singer of Edge of Seventeen?",Warner Music Group,https://en.wikipedia.org/wiki/Edge_of_Seventeen,https://en.wikipedia.org/wiki/Stevie_Nicks,https://en.wikipedia.org/wiki/Reprise_Records,https://en.wikipedia.org/wiki/Atlantic_Records,https://en.wikipedia.org/wiki/Modern_Records_(1980),https://en.wikipedia.org/wiki/Warner_Music_Group,https://en.wikipedia.org/wiki/Warner_Records,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Edge_of_Seventeen', 'https://en.wikipedia.org/wiki/Stevie_Nicks', 'https://en.wikipedia.org/wiki/Reprise_Records', 'https://en.wikipedia.org/wiki/Atlantic_Records', 'https://en.wikipedia.org/wiki/Modern_Records_(1980)', 'https://en.wikipedia.org/wiki/Warner_Music_Group', 'https://en.wikipedia.org/wiki/Warner_Records']" +12,The Basibasy mine is located in Madagascar. This mine is abundant in a specific chemical element that was discovered for the first time in 1791. The person who discovered this element was born on what is now known as a major US holiday - what holiday is this?,Christmas,https://en.wikipedia.org/wiki/Basibasy_mine,https://en.wikipedia.org/wiki/Titanium,https://en.wikipedia.org/wiki/William_Gregor,https://en.wikipedia.org/wiki/Christmas,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Basibasy_mine', 'https://en.wikipedia.org/wiki/Titanium', 'https://en.wikipedia.org/wiki/William_Gregor', 'https://en.wikipedia.org/wiki/Christmas']" +13,"One of Barbara Kingsolver's best known novels is about an American missionary family which moves to Africa. At the time, the country they move to was a Belgian colony. Which year did it become independent?",1960,https://en.wikipedia.org/wiki/Barbara_Kingsolver,https://en.wikipedia.org/wiki/The_Poisonwood_Bible,https://en.wikipedia.org/wiki/Belgian_Congo,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Barbara_Kingsolver', 'https://en.wikipedia.org/wiki/The_Poisonwood_Bible', 'https://en.wikipedia.org/wiki/Belgian_Congo']" +14,Which football player got 15 or more assists in La Liga during the 2010-2011 season and also played for Arsenal at one point in his career?,This was Mesut Ozil.,https://en.wikipedia.org/wiki/2010–11_La_Liga,https://en.wikipedia.org/wiki/Mesut_Özil,https://en.wikipedia.org/wiki/Dani_Alves,https://en.wikipedia.org/wiki/Lionel_Messi,,,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/2010–11_La_Liga', 'https://en.wikipedia.org/wiki/Mesut_Özil', 'https://en.wikipedia.org/wiki/Dani_Alves', 'https://en.wikipedia.org/wiki/Lionel_Messi']" +15,In Slovakia there is a well known Film Festival called the Bratistlava International Film Festival. What city/ town was the film editor for the Grand Prix winner of 2003 born in?,Roudnice nad Labem,https://en.wikipedia.org/wiki/Bratislava_International_Film_Festival,https://en.wikipedia.org/wiki/Boredom_in_Brno,https://en.wikipedia.org/wiki/Ji%C5%99%C3%AD_Bro%C5%BEek,https://en.wikipedia.org/wiki/Roudnice_nad_Labem,,,,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Bratislava_International_Film_Festival', 'https://en.wikipedia.org/wiki/Boredom_in_Brno', 'https://en.wikipedia.org/wiki/Ji%C5%99%C3%AD_Bro%C5%BEek', 'https://en.wikipedia.org/wiki/Roudnice_nad_Labem']" +16,"On March 7th, 2012, the director James Cameron explored a very deep underseas trench. As of August 3, 2024, how many times would the tallest building in San Francisco fit end to end from the bottom of the New Britain Trench to the surface of the ocean? The answer should be a rounded-off whole number.",28,https://en.wikipedia.org/wiki/James_Cameron,https://en.wikipedia.org/wiki/Solomon_Sea#Deepest_point,https://en.wikipedia.org/wiki/List_of_tallest_buildings_in_San_Francisco,,,,,,,,,Numerical reasoning | Tabular reasoning | Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/James_Cameron', 'https://en.wikipedia.org/wiki/Solomon_Sea#Deepest_point', 'https://en.wikipedia.org/wiki/List_of_tallest_buildings_in_San_Francisco']" +17,"In August of 2024, what is the first name of the mayor of the U.S. state capital city who attended the same university as at least one U.S. president and whose city is home to an outgoing or former full member of the Big 12 Conference",Leirion,https://en.wikipedia.org/wiki/Big_12_Conference,"https://en.wikipedia.org/wiki/Lincoln,_Nebraska",https://en.wikipedia.org/wiki/Leirion_Gaylor_Baird,https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States_by_education,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Big_12_Conference', 'https://en.wikipedia.org/wiki/Lincoln,_Nebraska', 'https://en.wikipedia.org/wiki/Leirion_Gaylor_Baird', 'https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States_by_education']" +18,"How many years after the founding of the 50th most populous US city, based on 2023 estimate population data, did Frank Fox receive UK Patent (1344259)?","98 Years (Arlington, TX & Rubik's Cube)",https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population,"https://en.wikipedia.org/wiki/Arlington,_Texas",https://en.wikipedia.org/wiki/Rubik%27s_Cube,,,,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population', 'https://en.wikipedia.org/wiki/Arlington,_Texas', 'https://en.wikipedia.org/wiki/Rubik%27s_Cube']" +19,"As of August 4, 2024, in what state was the first secretary of the latest United States federal executive department born?",Pennsylvania,https://en.wikipedia.org/wiki/United_States_federal_executive_departments#Former_departments,https://en.wikipedia.org/wiki/United_States_Secretary_of_Homeland_Security,https://en.wikipedia.org/wiki/Tom_Ridge,,,,,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/United_States_federal_executive_departments#Former_departments', 'https://en.wikipedia.org/wiki/United_States_Secretary_of_Homeland_Security', 'https://en.wikipedia.org/wiki/Tom_Ridge']" +20,"As of August 1 2024, what is the most recently described genus of Colosteidae?","Deltaherpeton, first described in 2010",https://en.wikipedia.org/wiki/Colosteidae,https://en.wikipedia.org/wiki/Deltaherpeton,https://en.wikipedia.org/wiki/Greererpeton,https://en.wikipedia.org/wiki/Pholidogaster,,,,,,,,Tabular reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/Colosteidae', 'https://en.wikipedia.org/wiki/Deltaherpeton', 'https://en.wikipedia.org/wiki/Greererpeton', 'https://en.wikipedia.org/wiki/Pholidogaster']" +21,Małgorzata Rożniecka is a model who won the title of Miss International. What is the difference in title years from when she won and the pageant winner who was murdered by her stalker?,10 years,https://en.wikipedia.org/wiki/Ma%C5%82gorzata_Ro%C5%BCniecka,https://en.wikipedia.org/wiki/Miss_International,https://en.wikipedia.org/wiki/Agnieszka_Kotlarska,,,,,,,,,Numerical reasoning | Tabular reasoning,"['https://en.wikipedia.org/wiki/Ma%C5%82gorzata_Ro%C5%BCniecka', 'https://en.wikipedia.org/wiki/Miss_International', 'https://en.wikipedia.org/wiki/Agnieszka_Kotlarska']" +22,"According to the 1990 United States census, what was the total population of the cities in Oklahoma that had at least 100,000 residents according to the 2020 United States census?",950135,https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population,https://en.wikipedia.org/wiki/Oklahoma_City,"https://en.wikipedia.org/wiki/Tulsa,_Oklahoma","https://en.wikipedia.org/wiki/Broken_Arrow,_Oklahoma","https://en.wikipedia.org/wiki/Norman,_Oklahoma",,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population', 'https://en.wikipedia.org/wiki/Oklahoma_City', 'https://en.wikipedia.org/wiki/Tulsa,_Oklahoma', 'https://en.wikipedia.org/wiki/Broken_Arrow,_Oklahoma', 'https://en.wikipedia.org/wiki/Norman,_Oklahoma']" +23,"What was the political party of the person who advocated for the type of government used in Chikhali, Latur district to become the foundation of India's political system?",Indian National Congress,"https://en.wikipedia.org/wiki/Chikhali,_Latur_district",https://en.wikipedia.org/wiki/Panchayati_raj,https://en.wikipedia.org/wiki/Mahatma_Gandhi,,,,,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Chikhali,_Latur_district', 'https://en.wikipedia.org/wiki/Panchayati_raj', 'https://en.wikipedia.org/wiki/Mahatma_Gandhi']" +24,"Giorgio Rognoni was an Italian professional footballer who played as a midfielder. 10 years after his death who was the midfielder who played in Milan that was born in Besana in Brianza,?",Demetrio Albertini,https://en.wikipedia.org/wiki/Giorgio_Rognoni,https://en.wikipedia.org/wiki/1996%E2%80%9397_AC_Milan_season,https://en.wikipedia.org/wiki/Demetrio_Albertini,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/Giorgio_Rognoni', 'https://en.wikipedia.org/wiki/1996%E2%80%9397_AC_Milan_season', 'https://en.wikipedia.org/wiki/Demetrio_Albertini']" +25,What was the age difference between Mike Tyson and Tyson Fury on the respective days on which they lost their first ever fights? Represent the figure in years only.,12 years.,https://en.wikipedia.org/wiki/Tyson_Fury,https://en.wikipedia.org/wiki/Mike_Tyson,https://en.wikipedia.org/wiki/Mike_Tyson_vs._Buster_Douglas,https://en.wikipedia.org/wiki/Tyson_Fury_vs_Oleksandr_Usyk,,,,,,,,Numerical reasoning | Post processing,"['https://en.wikipedia.org/wiki/Tyson_Fury', 'https://en.wikipedia.org/wiki/Mike_Tyson', 'https://en.wikipedia.org/wiki/Mike_Tyson_vs._Buster_Douglas', 'https://en.wikipedia.org/wiki/Tyson_Fury_vs_Oleksandr_Usyk']" +26,"Using the Pruett rule, out of all of the blue moons that occurred between the beginning of World War I and the end of World War II, how many of them occurred on the 31st of the month?",9,https://en.wikipedia.org/wiki/Blue_moon#Blue_moon_dates,https://en.wikipedia.org/wiki/World_War_I#,https://en.wikipedia.org/wiki/World_War_II,,,,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Blue_moon#Blue_moon_dates', 'https://en.wikipedia.org/wiki/World_War_I#', 'https://en.wikipedia.org/wiki/World_War_II']" +27,What number would Tommy Lawton have worn playing for Chelsea FC?,9,https://en.wikipedia.org/wiki/Tommy_Lawton,https://en.wikipedia.org/wiki/Squad_number_(association_football),,,,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/Tommy_Lawton', 'https://en.wikipedia.org/wiki/Squad_number_(association_football)']" +28,"If you subtract the year that William McCrary ""Billy"" Ray II was born from the year Obama was first sworn in as President to the United States and multiply it by the number of administrative regions in France as of January 1, 2024, what number would you get?",828,https://en.wikipedia.org/wiki/William_M._Ray_II,https://en.wikipedia.org/wiki/Barack_Obama,https://en.wikipedia.org/wiki/Regions_of_France,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/William_M._Ray_II', 'https://en.wikipedia.org/wiki/Barack_Obama', 'https://en.wikipedia.org/wiki/Regions_of_France']" +29,"If Princess Diana had been born three years earlier, who would have been Prime Minister when she was ten? ",Harold Wilson,"https://en.wikipedia.org/wiki/Diana,_Princess_of_Wales",https://en.wikipedia.org/wiki/List_of_prime_ministers_of_the_United_Kingdom,,,,,,,,,,Numerical reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Diana,_Princess_of_Wales', 'https://en.wikipedia.org/wiki/List_of_prime_ministers_of_the_United_Kingdom']" +30,"As of August 1, 2024, what is the population of the writer of the ""Culdcept Saga""'s birthplace? Write the answer to the nearest million, in characters.",Two million.,https://en.wikipedia.org/wiki/Culdcept_Saga,https://en.wikipedia.org/wiki/Tow_Ubukata,https://en.wikipedia.org/wiki/Gifu_Prefecture,,,,,,,,,Numerical reasoning | Multiple constraints | Post processing,"['https://en.wikipedia.org/wiki/Culdcept_Saga', 'https://en.wikipedia.org/wiki/Tow_Ubukata', 'https://en.wikipedia.org/wiki/Gifu_Prefecture']" +31,What is the middle name of the U.S. president who died on the same day of the year as Virginia Woolf?,David,https://en.wikipedia.org/wiki/Virginia_Woolf,https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States_by_date_of_death,https://en.wikipedia.org/wiki/Dwight_D._Eisenhower,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Virginia_Woolf', 'https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States_by_date_of_death', 'https://en.wikipedia.org/wiki/Dwight_D._Eisenhower']" +32,"As of 2010, if you added the number of times Brazil had won the World Cup to the amount of times the Chicago Bulls had won the NBA Championship and multiplied this number by the amount of times the Dallas Cowboys had won the Super Bowl, what number are you left with?",55,https://en.wikipedia.org/wiki/FIFA_World_Cup,https://en.wikipedia.org/wiki/Chicago_Bulls,https://en.wikipedia.org/wiki/Dallas_Cowboys,,,,,,,,,Numerical reasoning | Tabular reasoning,"['https://en.wikipedia.org/wiki/FIFA_World_Cup', 'https://en.wikipedia.org/wiki/Chicago_Bulls', 'https://en.wikipedia.org/wiki/Dallas_Cowboys']" +33,How old would the founder of the publishing company of the magazine that serialized the manga series Raw Hero have been the year the magazine ended publication?,145,https://en.m.wikipedia.org/wiki/Raw_Hero,https://en.m.wikipedia.org/wiki/Evening_(magazine),https://en.m.wikipedia.org/wiki/Kodansha,https://en.m.wikipedia.org/wiki/Seiji_Noma,,,,,,,,Numerical reasoning | Multiple constraints | Temporal reasoning,"['https://en.m.wikipedia.org/wiki/Raw_Hero', 'https://en.m.wikipedia.org/wiki/Evening_(magazine)', 'https://en.m.wikipedia.org/wiki/Kodansha', 'https://en.m.wikipedia.org/wiki/Seiji_Noma']" +34,The oldest extant football team in Italy plays in a stadium. The stadium is named after a person. Who was the emperor of China when that person was 5 years old?,Guangxu,https://en.wikipedia.org/wiki/Oldest_football_clubs,https://en.wikipedia.org/wiki/Genoa_CFC,https://en.wikipedia.org/wiki/Stadio_Luigi_Ferraris,https://en.wikipedia.org/wiki/Luigi_Ferraris_(footballer),https://en.wikipedia.org/wiki/List_of_Chinese_monarchs,,,,,,,Numerical reasoning | Tabular reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/Oldest_football_clubs', 'https://en.wikipedia.org/wiki/Genoa_CFC', 'https://en.wikipedia.org/wiki/Stadio_Luigi_Ferraris', 'https://en.wikipedia.org/wiki/Luigi_Ferraris_(footballer)', 'https://en.wikipedia.org/wiki/List_of_Chinese_monarchs']" +35,"Of the four main characters on Seinfeld, which actor is the oldest?",Michael Richards,https://en.wikipedia.org/wiki/Seinfeld,https://en.wikipedia.org/wiki/Jerry_Seinfeld,https://en.wikipedia.org/wiki/Jason_Alexander,https://en.wikipedia.org/wiki/Julia_Louis-Dreyfus,https://en.wikipedia.org/wiki/Michael_Richards,,,,,,,Numerical reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Seinfeld', 'https://en.wikipedia.org/wiki/Jerry_Seinfeld', 'https://en.wikipedia.org/wiki/Jason_Alexander', 'https://en.wikipedia.org/wiki/Julia_Louis-Dreyfus', 'https://en.wikipedia.org/wiki/Michael_Richards']" +36,"How old was Harvard University, when the person whom the longest river in British Columbia is named after, was born? The river in question only flows within the confines of British Columbia and does not enter any other province or territory. ",140 years old.,https://en.wikipedia.org/wiki/Harvard_University,https://en.wikipedia.org/wiki/List_of_longest_rivers_of_Canada,https://en.wikipedia.org/wiki/Fraser_River,https://en.wikipedia.org/wiki/Simon_Fraser_(explorer),,,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Harvard_University', 'https://en.wikipedia.org/wiki/List_of_longest_rivers_of_Canada', 'https://en.wikipedia.org/wiki/Fraser_River', 'https://en.wikipedia.org/wiki/Simon_Fraser_(explorer)']" +37,"On the same day that the The Mercedes-Benz W222 arrived at dealerships, a star of the sit-com Modern Family was wed. Who did the star marry?",Justin Mikita,https://en.wikipedia.org/wiki/Mercedes-Benz_S-Class_(W222),https://en.wikipedia.org/wiki/Modern_Family,https://en.wikipedia.org/wiki/Jesse_Tyler_Ferguson,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Mercedes-Benz_S-Class_(W222)', 'https://en.wikipedia.org/wiki/Modern_Family', 'https://en.wikipedia.org/wiki/Jesse_Tyler_Ferguson']" +38,Which species from the genus mulona are both found in the same country?,Mulona barnesi and mulona schausi,https://en.wikipedia.org/wiki/Mulona,https://en.wikipedia.org/wiki/Mulona_barnesi,https://en.wikipedia.org/wiki/Mulona_grisea,https://en.wikipedia.org/wiki/Mulona_lapidaria,https://en.wikipedia.org/wiki/Mulona_manni,https://en.wikipedia.org/wiki/Mulona_piperita,https://en.wikipedia.org/wiki/Mulona_schausi,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Mulona', 'https://en.wikipedia.org/wiki/Mulona_barnesi', 'https://en.wikipedia.org/wiki/Mulona_grisea', 'https://en.wikipedia.org/wiki/Mulona_lapidaria', 'https://en.wikipedia.org/wiki/Mulona_manni', 'https://en.wikipedia.org/wiki/Mulona_piperita', 'https://en.wikipedia.org/wiki/Mulona_schausi']" +39,"As of July 1, 2024, if I wanted to give my daughter the middle name of the American woman who is the most decorated female in the history of American gymnastics as her first name and the full first name of the American woman who holds the world record in the 800-meter freestyle as her middle name, what would I name my daughter?",Arianne Kathleen,https://en.wikipedia.org/wiki/Simone_Biles,https://en.wikipedia.org/wiki/Katie_Ledecky,,,,,,,,,,Post processing,"['https://en.wikipedia.org/wiki/Simone_Biles', 'https://en.wikipedia.org/wiki/Katie_Ledecky']" +40,I am thinking of a Ancient Roman City. The city was destroyed by volcanic eruption. The eruption occurred in the year 79 AD. The volcano was a stratovolcano. Where was the session held where it was decided that the city would be named a UNESCO world heritage site?,Naples,https://en.wikipedia.org/wiki/Stratovolcano,https://en.wikipedia.org/wiki/Eruption_of_Mount_Vesuvius_in_79_AD,https://en.wikipedia.org/wiki/Pompeii,https://en.wikipedia.org/wiki/World_Heritage_Committee,,,,,,,,Tabular reasoning,"['https://en.wikipedia.org/wiki/Stratovolcano', 'https://en.wikipedia.org/wiki/Eruption_of_Mount_Vesuvius_in_79_AD', 'https://en.wikipedia.org/wiki/Pompeii', 'https://en.wikipedia.org/wiki/World_Heritage_Committee']" +41,What Formula One car was driven in 1994 by the nephew of a racing driver from Italy who drove a Ferrari 312T and shares a last name with a common cocktail drink?,Minardi M194,https://en.wikipedia.org/wiki/Minardi_M194,https://en.wikipedia.org/wiki/Pierluigi_Martini,https://en.wikipedia.org/wiki/Giancarlo_Martini,https://en.wikipedia.org/wiki/List_of_cocktails,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Minardi_M194', 'https://en.wikipedia.org/wiki/Pierluigi_Martini', 'https://en.wikipedia.org/wiki/Giancarlo_Martini', 'https://en.wikipedia.org/wiki/List_of_cocktails']" +42,"As of August 1, 2024, who is the president of the team that inspired the original name of the Washington Commanders?",Derek Schiller,https://en.wikipedia.org/wiki/Washington_Commanders,https://en.wikipedia.org/wiki/Boston_Braves,https://en.wikipedia.org/wiki/Atlanta_Braves,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Washington_Commanders', 'https://en.wikipedia.org/wiki/Boston_Braves', 'https://en.wikipedia.org/wiki/Atlanta_Braves']" +43,"As of 2023, how many more employees does the company alphabetically first by ticker symbol in the S&P500 have than the company alphabetically 2nd to last by ticker symbol in the S&P500?","8,350",https://en.wikipedia.org/wiki/List_of_S%26P_500_companies,https://en.wikipedia.org/wiki/Agilent_Technologies,https://en.wikipedia.org/wiki/Zebra_Technologies,,,,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/List_of_S%26P_500_companies', 'https://en.wikipedia.org/wiki/Agilent_Technologies', 'https://en.wikipedia.org/wiki/Zebra_Technologies']" +44,"I am moving to the G40 postcode area - what train stations are nearby, as of 2024?",Bridgeton Railway Station and Dalmarnock Railway Station.,https://en.wikipedia.org/wiki/List_of_postcode_areas_in_the_United_Kingdom,https://en.wikipedia.org/wiki/G_postcode_area,"https://en.wikipedia.org/wiki/Bridgeton,_Glasgow",https://en.wikipedia.org/wiki/Bridgeton_railway_station,"https://en.wikipedia.org/wiki/Calton,_Glasgow ",https://en.wikipedia.org/wiki/Dalmarnock,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/List_of_postcode_areas_in_the_United_Kingdom', 'https://en.wikipedia.org/wiki/G_postcode_area', 'https://en.wikipedia.org/wiki/Bridgeton,_Glasgow', 'https://en.wikipedia.org/wiki/Bridgeton_railway_station', 'https://en.wikipedia.org/wiki/Calton,_Glasgow ', 'https://en.wikipedia.org/wiki/Dalmarnock']" +45,How old was Stephen Baldwin when Hailey and Justin got married?,52,https://en.wikipedia.org/wiki/Hailey_Bieber,https://en.wikipedia.org/wiki/Stephen_Baldwin,,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/Hailey_Bieber', 'https://en.wikipedia.org/wiki/Stephen_Baldwin']" +46,"As of August 1, 2024, what is the largest city of the 9th largest country by land area in Europe?",The largest city of the 9th largest country in Europe is Warsaw.,https://en.wikipedia.org/wiki/List_of_European_countries_by_area,https://en.wikipedia.org/wiki/Poland,,,,,,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/List_of_European_countries_by_area', 'https://en.wikipedia.org/wiki/Poland']" +47,What was the running time of the first cartoon in the series that inspired the name of the Looney Tunes franchise?,5 minutes and 31 seconds,https://en.wikipedia.org/wiki/Looney_Tunes,https://en.wikipedia.org/wiki/Silly_Symphony,,,,,,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Looney_Tunes', 'https://en.wikipedia.org/wiki/Silly_Symphony']" +48,"The state, whose motto was adopted March 26, 1928, has 0.94% of the population in 2024 speaking a language that is native to which country?",Philippines,https://en.wikipedia.org/wiki/List_of_U.S._state_and_territory_mottos,https://en.wikipedia.org/wiki/New_Jersey,https://en.wikipedia.org/wiki/Tagalog_language,,,,,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/List_of_U.S._state_and_territory_mottos', 'https://en.wikipedia.org/wiki/New_Jersey', 'https://en.wikipedia.org/wiki/Tagalog_language']" +49,"As of 2024, at the time of his birth, what was the middle name of the U.S. president who won Alaska, graduated from Yale University, and had a son named Michael?",Lynch,https://en.wikipedia.org/wiki/United_States_presidential_elections_in_Alaska,https://en.wikipedia.org/wiki/Gerald_Ford,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/United_States_presidential_elections_in_Alaska', 'https://en.wikipedia.org/wiki/Gerald_Ford']" +50,"The first white man to visit the indigenous people who were forced on a reserve during the Klondike Gold Rush, worked for a company who officially stopped selling what in 2023?",Animal Fur Products,https://en.wikipedia.org/wiki/Klondike_Gold_Rush,https://en.wikipedia.org/wiki/H%C3%A4n,https://en.wikipedia.org/wiki/Hudson%27s_Bay_Company,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Klondike_Gold_Rush', 'https://en.wikipedia.org/wiki/H%C3%A4n', 'https://en.wikipedia.org/wiki/Hudson%27s_Bay_Company']" +51,"What state is the home of the losing team of the World Series three years before ""Old Shufflefoot"" managed his team to victory? ",Illinois,https://en.wikipedia.org/wiki/Lou_Boudreau,https://en.wikipedia.org/wiki/List_of_World_Series_champions,https://en.wikipedia.org/wiki/Chicago,,,,,,,,,Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Lou_Boudreau', 'https://en.wikipedia.org/wiki/List_of_World_Series_champions', 'https://en.wikipedia.org/wiki/Chicago']" +52,Was the person who served as president of the Scottish National Party from 1987 to 2005 alive when the party was founded?,Yes,https://en.wikipedia.org/wiki/Scottish_National_Party,https://en.wikipedia.org/wiki/Winnie_Ewing,,,,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/Scottish_National_Party', 'https://en.wikipedia.org/wiki/Winnie_Ewing']" +53,"In series six of Downton Abbey, Lord Merton is diagnosed with a terminal condition. A deficiency in which vitamin causes this condition?",Vitamin B12,https://en.wikipedia.org/wiki/Downton_Abbey,https://en.wikipedia.org/wiki/Pernicious_anemia,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Downton_Abbey', 'https://en.wikipedia.org/wiki/Pernicious_anemia']" +54,How many letters long is the title of the first movie composed by the composer of the first American Godzilla movie?,17,https://en.wikipedia.org/wiki/Godzilla_(franchise),https://en.wikipedia.org/wiki/Godzilla_(1998_film),https://en.wikipedia.org/wiki/David_Arnold,,,,,,,,,Tabular reasoning | Multiple constraints | Post processing,"['https://en.wikipedia.org/wiki/Godzilla_(franchise)', 'https://en.wikipedia.org/wiki/Godzilla_(1998_film)', 'https://en.wikipedia.org/wiki/David_Arnold']" +55,"The author of the book ""A Good Woman""'s author was married to a man in 2008, who resigned from the board of HP due to the actions taken by the board's chair. What types of cancer did the chair survive?",Breast and skin,https://en.wikipedia.org/wiki/A_Good_Woman_(novel),https://en.wikipedia.org/wiki/Danielle_Steel,https://en.wikipedia.org/wiki/Thomas_Perkins_(businessman),https://en.wikipedia.org/wiki/Patricia_C._Dunn,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/A_Good_Woman_(novel)', 'https://en.wikipedia.org/wiki/Danielle_Steel', 'https://en.wikipedia.org/wiki/Thomas_Perkins_(businessman)', 'https://en.wikipedia.org/wiki/Patricia_C._Dunn']" +56,"Benjamin Waterhouse Hawkins was commissioned to sculpt a series of life-size dinosaurs between 1852-54. In this location, one species only had its head and back built with the rest of its body submerged underwater because they didn't know what it looked like then. Where is this display located in 2024, which dinosaur species was this, and what did the rest of its body look like after all?","Crystal Palace Park. The dinosaur is Mosasaurus, it had a streamlined body, an elongated tail ending with a downturn supporting a two-lobed fin, and two pairs of flippers. ",https://en.wikipedia.org/wiki/Benjamin_Waterhouse_Hawkins,https://en.wikipedia.org/wiki/Crystal_Palace_Dinosaurs,https://en.wikipedia.org/wiki/Mosasaurus,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Benjamin_Waterhouse_Hawkins', 'https://en.wikipedia.org/wiki/Crystal_Palace_Dinosaurs', 'https://en.wikipedia.org/wiki/Mosasaurus']" +57,The Assistant to the Regional Manager on The Office TV show (US version) has a farm. You can obtain what food colorant from the crop he grows?,Betanin,https://en.wikipedia.org/wiki/The_Office,https://en.wikipedia.org/wiki/Dwight_Schrute,https://en.wikipedia.org/wiki/Beetroot,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/The_Office', 'https://en.wikipedia.org/wiki/Dwight_Schrute', 'https://en.wikipedia.org/wiki/Beetroot']" +58,How many Pokemon World Championships occurred in the contiguous United States during the presidency of Barack Obama?,"- Five Pokemon World Championships took place in the contiguous United States during Barack Obama's presidency - The championships tournaments were 2009 (San Diego, California), 2011 (San Diego, California), 2014 (Washington, D.C.), 2015 (Boston, Massachusetts) and 2016 (San Francisco, California)",https://en.wikipedia.org/wiki/Pok%C3%A9mon_World_Championships,https://en.wikipedia.org/wiki/Barack_Obama,https://en.wikipedia.org/wiki/Contiguous_United_States,,,,,,,,,Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Pok%C3%A9mon_World_Championships', 'https://en.wikipedia.org/wiki/Barack_Obama', 'https://en.wikipedia.org/wiki/Contiguous_United_States']" +59,"Put these historical events in chronological order, starting with the earliest: The Beatles play Ed Sullivan, the fall of the Berlin Wall, The Great Depression, Atlanta Summer Games, World War I. ","World War I, The Great Depression, The Beatles play Ed Sullivan, the fall of the Berlin Wall, Atlanta Summer Games. ",https://en.wikipedia.org/wiki/World_War_I,https://en.wikipedia.org/wiki/1996_Summer_Olympics,"https://en.wikipedia.org/wiki/Fall_of_the_Berlin_Wall#:~:text=The%20fall%20of%20the%20Berlin,restrictions%20were%20overwhelmed%20and%20discarded.",https://en.wikipedia.org/wiki/The_Beatles,https://en.wikipedia.org/wiki/Great_Depression,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/World_War_I', 'https://en.wikipedia.org/wiki/1996_Summer_Olympics', 'https://en.wikipedia.org/wiki/Fall_of_the_Berlin_Wall#:~:text=The%20fall%20of%20the%20Berlin,restrictions%20were%20overwhelmed%20and%20discarded.', 'https://en.wikipedia.org/wiki/The_Beatles', 'https://en.wikipedia.org/wiki/Great_Depression']" +60,This individual won a Best Director Award at the 33rd Japan Academy Prize ceremony and is known for having directed a film that briefly surpassed the Godfather as the highest-grossing film in Japan for a short time. Which film was longer - The Godfather or his film - and by how many minutes?,The Godfather (1972) was longer than Submersion in Japan (1973) by 32 minutes.,https://en.wikipedia.org/wiki/Japan_Academy_Film_Prize,https://en.wikipedia.org/wiki/Daisaku_Kimura,https://en.wikipedia.org/wiki/Submersion_of_Japan,https://en.wikipedia.org/wiki/The_Godfather,,,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Japan_Academy_Film_Prize', 'https://en.wikipedia.org/wiki/Daisaku_Kimura', 'https://en.wikipedia.org/wiki/Submersion_of_Japan', 'https://en.wikipedia.org/wiki/The_Godfather']" +61,The manga 'Sailor Moon' was authored by a woman. What manga did her husband win the Shogakukan Manga Award for authoring?,YuYu Hakusho,https://en.wikipedia.org/wiki/Sailor_Moon,https://en.wikipedia.org/wiki/Naoko_Takeuchi,https://en.wikipedia.org/wiki/Yoshihiro_Togashi,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Sailor_Moon', 'https://en.wikipedia.org/wiki/Naoko_Takeuchi', 'https://en.wikipedia.org/wiki/Yoshihiro_Togashi']" +62,Where did the daughter of the winner of the first US presidential election to occur after the official end of WWII attend university?,George Washington University,https://en.wikipedia.org/wiki/World_War_II,https://en.wikipedia.org/wiki/United_States_presidential_election,https://en.wikipedia.org/wiki/Harry_S._Truman,https://en.wikipedia.org/wiki/Margaret_Truman,,,,,,,,Tabular reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/World_War_II', 'https://en.wikipedia.org/wiki/United_States_presidential_election', 'https://en.wikipedia.org/wiki/Harry_S._Truman', 'https://en.wikipedia.org/wiki/Margaret_Truman']" +63,"In the style of wrestling performed by former Greek wrestler Mikhail Theodoropoulos, who won the inaugural olympic medal?",Carl Schuhmann.,https://en.wikipedia.org/wiki/Mikhail_Theodoropoulos,https://en.wikipedia.org/wiki/Wrestling_at_the_1960_Summer_Olympics_%E2%80%93_Men%27s_Greco-Roman_bantamweight,https://en.wikipedia.org/wiki/Greco-Roman_wrestling,https://en.wikipedia.org/wiki/List_of_World_and_Olympic_Champions_in_Greco-Roman_wrestling,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Mikhail_Theodoropoulos', 'https://en.wikipedia.org/wiki/Wrestling_at_the_1960_Summer_Olympics_%E2%80%93_Men%27s_Greco-Roman_bantamweight', 'https://en.wikipedia.org/wiki/Greco-Roman_wrestling', 'https://en.wikipedia.org/wiki/List_of_World_and_Olympic_Champions_in_Greco-Roman_wrestling']" +64,What is the difference in elevation between the respective peaks of Eggstock (Uri Alps) and Eggstock (Schwyzer Alps) in Switzerland? Convert the final figure to centimetres and round up to the nearest 1000.,"110,000cm.",https://en.wikipedia.org/wiki/Eggstock_(Uri_Alps),https://en.wikipedia.org/wiki/Eggstock_(Schwyzer_Alps),,,,,,,,,,Numerical reasoning | Multiple constraints | Post processing,"['https://en.wikipedia.org/wiki/Eggstock_(Uri_Alps)', 'https://en.wikipedia.org/wiki/Eggstock_(Schwyzer_Alps)']" +65,"How many films had the actress who played Trudi Frazer in ""Once Upon a Time in Hollywood"" acted in before?",3,https://en.wikipedia.org/wiki/Once_Upon_a_Time_in_Hollywood,https://en.wikipedia.org/wiki/Julia_Butters,,,,,,,,,,Numerical reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Once_Upon_a_Time_in_Hollywood', 'https://en.wikipedia.org/wiki/Julia_Butters']" +66,The founder of the production company at which Tim Allsop and Stewart Williams met received a bachelor's degree from a college in the state of New York. In what year was this college's sibling institution founded?,1701,https://en.wikipedia.org/wiki/Tim_Allsop_&_Stewart_Williams,https://en.wikipedia.org/wiki/Elisabeth_Murdoch_(businesswoman),https://en.wikipedia.org/wiki/Vassar_College,https://en.wikipedia.org/wiki/Yale_University,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Tim_Allsop_&_Stewart_Williams', 'https://en.wikipedia.org/wiki/Elisabeth_Murdoch_(businesswoman)', 'https://en.wikipedia.org/wiki/Vassar_College', 'https://en.wikipedia.org/wiki/Yale_University']" +67,Which player scored more than 15 goals in Eredevisie during the 21-22 season and had previously played for Auxerre?,Sébastien Haller scored 21 goals that season and previously played for Auxerre.,https://en.wikipedia.org/wiki/2021–22_Eredivisie,https://en.wikipedia.org/wiki/Sébastien_Haller,https://en.wikipedia.org/wiki/Loïs_Openda,https://en.wikipedia.org/wiki/Vangelis_Pavlidis,https://en.wikipedia.org/wiki/Ricky_van_Wolfswinkel,,,,,,,Tabular reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/2021–22_Eredivisie', 'https://en.wikipedia.org/wiki/Sébastien_Haller', 'https://en.wikipedia.org/wiki/Loïs_Openda', 'https://en.wikipedia.org/wiki/Vangelis_Pavlidis', 'https://en.wikipedia.org/wiki/Ricky_van_Wolfswinkel']" +68,"The latest game, as of August 4, 2024, from the creator of Kirby won an award at The Game Awards. What won Game of the Year the following year?",The Last of Us Part II,https://en.wikipedia.org/wiki/Kirby_(series),https://en.wikipedia.org/wiki/Masahiro_Sakurai,https://en.wikipedia.org/wiki/Super_Smash_Bros._Ultimate,https://en.wikipedia.org/wiki/The_Game_Awards_2020,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Kirby_(series)', 'https://en.wikipedia.org/wiki/Masahiro_Sakurai', 'https://en.wikipedia.org/wiki/Super_Smash_Bros._Ultimate', 'https://en.wikipedia.org/wiki/The_Game_Awards_2020']" +69,By how many years does the inception of the party to which former Finnish MP Lea Rakel Hiltunen last belonged predate the foundation of the Parliament of Finland itself?,7 years.,https://en.wikipedia.org/wiki/Rakel_Hiltunen,https://en.wikipedia.org/wiki/Parliament_of_Finland,https://en.wikipedia.org/wiki/Social_Democratic_Party_of_Finland,,,,,,,,,Numerical reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Rakel_Hiltunen', 'https://en.wikipedia.org/wiki/Parliament_of_Finland', 'https://en.wikipedia.org/wiki/Social_Democratic_Party_of_Finland']" +70,"According to the population data in their respective Wikipedia articles in August 2024, what is the difference between the population of Seattle, WA, and Portland, OR, according to the data from 2020?","84,512",https://en.wikipedia.org/wiki/Seattle,"https://en.wikipedia.org/wiki/Portland,_Oregon",,,,,,,,,,Numerical reasoning | Tabular reasoning,"['https://en.wikipedia.org/wiki/Seattle', 'https://en.wikipedia.org/wiki/Portland,_Oregon']" +71,"What was the Enhanced Fujita Scale rating of the 2011 tornado that hit the hometown of the band who had a controversial interview with Bryan Odell on June 3, 2012?","EF5- Joplin, Missouri",https://en.wikipedia.org/wiki/BryanStars,https://en.wikipedia.org/wiki/Never_Shout_Never,https://en.wikipedia.org/wiki/2011_Joplin_tornado,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/BryanStars', 'https://en.wikipedia.org/wiki/Never_Shout_Never', 'https://en.wikipedia.org/wiki/2011_Joplin_tornado']" +72,Which MP standing as the leader of a major party in the 2019 United Kingdom General Election was also an MP for Henley?,Boris Johnson was leader of the Conservative Party and a former MP for Henley.,https://en.wikipedia.org/wiki/2019_United_Kingdom_general_election,https://en.wikipedia.org/wiki/Boris_Johnson,https://en.wikipedia.org/wiki/Jeremy_Corbyn,https://en.wikipedia.org/wiki/Nicola_Sturgeon,https://en.wikipedia.org/wiki/Jo_Swinson,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/2019_United_Kingdom_general_election', 'https://en.wikipedia.org/wiki/Boris_Johnson', 'https://en.wikipedia.org/wiki/Jeremy_Corbyn', 'https://en.wikipedia.org/wiki/Nicola_Sturgeon', 'https://en.wikipedia.org/wiki/Jo_Swinson']" +73,Who was the author of the novel whose film adaptation lead singer Mark Arm took the name of his band from?,Raymond Friday Locke,https://en.wikipedia.org/wiki/Mark_Arm,https://en.wikipedia.org/wiki/Mudhoney,https://en.wikipedia.org/wiki/Mudhoney_(film),,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Mark_Arm', 'https://en.wikipedia.org/wiki/Mudhoney', 'https://en.wikipedia.org/wiki/Mudhoney_(film)']" +74,"As of August 3, 2024, how much taller was the tsunami wave of the most powerful earthquake in North America than the most powerful earthquake ever recorded in Japan?",26.5 meters of 87 feet,https://en.wikipedia.org/wiki/2011_T%C5%8Dhoku_earthquake_and_tsunami#Nuclear_power_plants,https://en.wikipedia.org/wiki/Lists_of_earthquakes#Largest_earthquakes_by_magnitude,https://en.wikipedia.org/wiki/1964_Alaska_earthquake,,,,,,,,,Numerical reasoning | Tabular reasoning,"['https://en.wikipedia.org/wiki/2011_T%C5%8Dhoku_earthquake_and_tsunami#Nuclear_power_plants', 'https://en.wikipedia.org/wiki/Lists_of_earthquakes#Largest_earthquakes_by_magnitude', 'https://en.wikipedia.org/wiki/1964_Alaska_earthquake']" +75,"How old would the 1975 winner of the Lenore Marshall Poetry Prize have been if they were still alive on the date when Rupi Kaur released her book titled, ""Milk and Honey""?",90,https://en.wikipedia.org/wiki/List_of_winners_of_the_Lenore_Marshall_Poetry_Prize,https://en.wikipedia.org/wiki/Cid_Corman,https://en.wikipedia.org/wiki/Milk_and_Honey_(poetry_collection),,,,,,,,,Numerical reasoning | Multiple constraints | Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/List_of_winners_of_the_Lenore_Marshall_Poetry_Prize', 'https://en.wikipedia.org/wiki/Cid_Corman', 'https://en.wikipedia.org/wiki/Milk_and_Honey_(poetry_collection)']" +76,A united states island that is only 90 miles away from Cuba has been the home of several famous people. In what year did the famous author who owns a book store there first start writing?,1959,"https://en.wikipedia.org/wiki/Key_West#:~:text=The%20southernmost%20location%20that%20the,apart%20at%20their%20closest%20points.",https://en.wikipedia.org/wiki/Judy_Blume,,,,,,,,,,Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/Key_West#:~:text=The%20southernmost%20location%20that%20the,apart%20at%20their%20closest%20points.', 'https://en.wikipedia.org/wiki/Judy_Blume']" +77,"According to the 2011 census, what is total population of the cities of the birthplaces of author Clive Barker, Prince William, and Sir Malcolm Stanley Bradbury? Round to the nearest 100,000.","11,300,000",https://en.wikipedia.org/wiki/Clive_Barker,"https://en.wikipedia.org/wiki/William,_Prince_of_Wales",https://en.wikipedia.org/wiki/Malcolm_Bradbury,https://en.wikipedia.org/wiki/England#Geography,,,,,,,,Numerical reasoning | Tabular reasoning | Post processing,"['https://en.wikipedia.org/wiki/Clive_Barker', 'https://en.wikipedia.org/wiki/William,_Prince_of_Wales', 'https://en.wikipedia.org/wiki/Malcolm_Bradbury', 'https://en.wikipedia.org/wiki/England#Geography']" +78,"Of the two wonders of the ancient world that were statues, how much shorter was the taller of the two compared to the tallest statue in Japan as of 2024?",222 ft,https://en.wikipedia.org/wiki/Seven_Wonders_of_the_Ancient_World#Wonders,https://en.wikipedia.org/wiki/Statue_of_Zeus_at_Olympia,https://en.wikipedia.org/wiki/Colossus_of_Rhodes,https://en.wikipedia.org/wiki/List_of_tallest_statues,,,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Seven_Wonders_of_the_Ancient_World#Wonders', 'https://en.wikipedia.org/wiki/Statue_of_Zeus_at_Olympia', 'https://en.wikipedia.org/wiki/Colossus_of_Rhodes', 'https://en.wikipedia.org/wiki/List_of_tallest_statues']" +79,"The actor known for playing Kenny ""Shammy"" Shamberg in Magnum P.I. was born how days before the Apollo 11 moon landing?",844 days.,https://en.wikipedia.org/wiki/Magnum_P.I._(2018_TV_series),https://en.wikipedia.org/wiki/Christopher_Thornton,https://en.wikipedia.org/wiki/Apollo_11#Mission,,,,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints | Post processing,"['https://en.wikipedia.org/wiki/Magnum_P.I._(2018_TV_series)', 'https://en.wikipedia.org/wiki/Christopher_Thornton', 'https://en.wikipedia.org/wiki/Apollo_11#Mission']" +80,"As of January 1, 2024, what was the warmest decade, since the 17th century, around the ocean that surrounds Litke Deep?",The warmest decade for the Arctic Ocean since the 17th century was during the period of 1995–2005.,https://en.wikipedia.org/wiki/Litke_Deep,https://en.wikipedia.org/wiki/Arctic_Ocean#Climate,https://en.wikipedia.org/wiki/Climate_change_in_the_Arctic,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Litke_Deep', 'https://en.wikipedia.org/wiki/Arctic_Ocean#Climate', 'https://en.wikipedia.org/wiki/Climate_change_in_the_Arctic']" +81,"Which of the bridges in Halifax, Nova Scotia is longer, and by how much? The MacKay, or the MacDonald? ",The MacDonald Bridge (1300 metres) is 100 metres longer than the MacKay (1200 metres).,https://en.wikipedia.org/wiki/Angus_L._Macdonald_Bridge,https://en.wikipedia.org/wiki/A._Murray_MacKay_Bridge,,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/Angus_L._Macdonald_Bridge', 'https://en.wikipedia.org/wiki/A._Murray_MacKay_Bridge']" +82,Which section of IPSC Australia Inc. is larger than Honshu and smaller than Sumatra by area?,Victoria and Tasmania,https://en.wikipedia.org/wiki/IPSC_Australia_Inc,https://en.wikipedia.org/wiki/Western_Australia,https://en.wikipedia.org/wiki/Victoria_(state),https://en.wikipedia.org/wiki/Tasmania,https://en.wikipedia.org/wiki/South_Australia,https://en.wikipedia.org/wiki/New_South_Wales,https://en.wikipedia.org/wiki/Australian_Capital_Territory,https://en.wikipedia.org/wiki/Queensland,https://en.wikipedia.org/wiki/Northern_Territory,https://en.wikipedia.org/wiki/Honshu,https://en.wikipedia.org/wiki/Sumatra,Numerical reasoning | Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/IPSC_Australia_Inc', 'https://en.wikipedia.org/wiki/Western_Australia', 'https://en.wikipedia.org/wiki/Victoria_(state)', 'https://en.wikipedia.org/wiki/Tasmania', 'https://en.wikipedia.org/wiki/South_Australia', 'https://en.wikipedia.org/wiki/New_South_Wales', 'https://en.wikipedia.org/wiki/Australian_Capital_Territory', 'https://en.wikipedia.org/wiki/Queensland', 'https://en.wikipedia.org/wiki/Northern_Territory', 'https://en.wikipedia.org/wiki/Honshu', 'https://en.wikipedia.org/wiki/Sumatra']" +83,"As of August 1, 2024, are there any cities in England that are more populated than Philadelphia, and which cities are they?","Yes, London",https://en.wikipedia.org/wiki/Philadelphia,https://en.wikipedia.org/wiki/List_of_ONS_built-up_areas_in_England_by_population,,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/Philadelphia', 'https://en.wikipedia.org/wiki/List_of_ONS_built-up_areas_in_England_by_population']" +84,Which event predates the other: the amalgamation of the Province of Betanzos with Mondonedo or the completion of a cathedral marking the destination of a prominent Spanish Catholic pilgrimage in the same province? Provide the year of the earlier event.,The completion of Santiago de Compostela Cathedral in 1211.,https://en.wikipedia.org/wiki/Province_of_A_Coru%C3%B1a,https://en.wikipedia.org/wiki/Santiago_de_Compostela_Cathedral,,,,,,,,,,Tabular reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/Province_of_A_Coru%C3%B1a', 'https://en.wikipedia.org/wiki/Santiago_de_Compostela_Cathedral']" +85,"How many films with titles including the letter ""g"" did the director of ""Sword of the Valiant"" direct after the release of said film, but before the year 2000?",1,https://en.wikipedia.org/wiki/Sword_of_the_Valiant,https://en.wikipedia.org/wiki/Stephen_Weeks,,,,,,,,,,Numerical reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Sword_of_the_Valiant', 'https://en.wikipedia.org/wiki/Stephen_Weeks']" +86,What medal did the woman who tied for 19th in the 2011 Pan American Games women's individual bowling event win as part of a women's bowling doubles team in the 2017 World Games?,Bronze medalist,https://en.wikipedia.org/wiki/Bowling_at_the_2011_Pan_American_Games_%E2%80%93_Women's_individual,https://en.wikipedia.org/wiki/Bowling_at_the_2017_World_Games,,,,,,,,,,Tabular reasoning,"[""https://en.wikipedia.org/wiki/Bowling_at_the_2011_Pan_American_Games_%E2%80%93_Women's_individual"", 'https://en.wikipedia.org/wiki/Bowling_at_the_2017_World_Games']" +87,Did the entomologist who first described lesticus purpurascens come from the same country as the entomologist who first described the subfamily of that species?,"Yes, Stefano Ludovico Straneo and Franco Andrea Bonelli are both Italian",https://en.wikipedia.org/wiki/Lesticus_purpurascens,https://en.wikipedia.org/wiki/Stefano_Ludovico_Straneo,https://en.wikipedia.org/wiki/Pterostichinae,https://en.wikipedia.org/wiki/Franco_Andrea_Bonelli,,,,,,,,Tabular reasoning,"['https://en.wikipedia.org/wiki/Lesticus_purpurascens', 'https://en.wikipedia.org/wiki/Stefano_Ludovico_Straneo', 'https://en.wikipedia.org/wiki/Pterostichinae', 'https://en.wikipedia.org/wiki/Franco_Andrea_Bonelli']" +88,"If you were to combine two words, the first of which has a species called Polytrichum piliferum, and the second which has a breed type called Fleckvieh, you'd get the name of what countries capital city?",Russia,https://en.wikipedia.org/wiki/Moss,https://en.wikipedia.org/w/index.php?search=Polytrichum+piliferum&title=Special:Search&profile=advanced&fulltext=1&ns0=1,https://en.wikipedia.org/wiki/Cattle,https://en.wikipedia.org/wiki/Fleckvieh,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Moss', 'https://en.wikipedia.org/w/index.php?search=Polytrichum+piliferum&title=Special:Search&profile=advanced&fulltext=1&ns0=1', 'https://en.wikipedia.org/wiki/Cattle', 'https://en.wikipedia.org/wiki/Fleckvieh']" +89,What was the birthday of the man who was mayor of New York City the year Snoopy debuted in the Macy's Thanksgiving Day Parade?,"November 24, 1921",https://en.wikipedia.org/wiki/Snoopy,https://en.wikipedia.org/wiki/List_of_mayors_of_New_York_City,https://en.wikipedia.org/wiki/John_Lindsay,,,,,,,,,Tabular reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Snoopy', 'https://en.wikipedia.org/wiki/List_of_mayors_of_New_York_City', 'https://en.wikipedia.org/wiki/John_Lindsay']" +90,What is the average distance for the left field line in MLB stadiums with a retractable roof as of August 2024? Round to the nearest whole number.,331 feet,https://en.wikipedia.org/wiki/List_of_current_Major_League_Baseball_stadiums,https://en.wikipedia.org/wiki/Rogers_Centre,https://en.wikipedia.org/wiki/Chase_Field,https://en.wikipedia.org/wiki/T-Mobile_Park,https://en.wikipedia.org/wiki/Minute_Maid_Park,,,,,,"https://en.wikipedia.org/wiki/American_Family_Field, https://en.wikipedia.org/wiki/LoanDepot_Park, https://en.wikipedia.org/wiki/Globe_Life_Field, ",Numerical reasoning | Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/List_of_current_Major_League_Baseball_stadiums', 'https://en.wikipedia.org/wiki/Rogers_Centre', 'https://en.wikipedia.org/wiki/Chase_Field', 'https://en.wikipedia.org/wiki/T-Mobile_Park', 'https://en.wikipedia.org/wiki/Minute_Maid_Park', 'https://en.wikipedia.org/wiki/American_Family_Field, https://en.wikipedia.org/wiki/LoanDepot_Park, https://en.wikipedia.org/wiki/Globe_Life_Field, ']" +91,"As of August 3, 2024, which rabbi worked for both Reform Congregation Keneseth Israel in Philadelphia and Congregation Beth Israel in West Hartford, Connecticut?",Abraham J. Feldman worked for both congregations serving as an interim associate rabbi at the Reform Congregation Keneseth Israel and the leader of the Congregation Beth Israel.,https://en.wikipedia.org/wiki/Reform_Congregation_Keneseth_Israel_(Philadelphia),"https://en.wikipedia.org/wiki/Congregation_Beth_Israel_(West_Hartford,_Connecticut)",,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Reform_Congregation_Keneseth_Israel_(Philadelphia)', 'https://en.wikipedia.org/wiki/Congregation_Beth_Israel_(West_Hartford,_Connecticut)']" +92,Where was the Winter Olympics held the year that the girl who was Wheaties first official spokeswoman turned 20 years old?,"Calgary, Alberta, Canada",https://en.wikipedia.org/wiki/1988_Winter_Olympics,https://en.wikipedia.org/wiki/Mary_Lou_Retton,,,,,,,,,,Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/1988_Winter_Olympics', 'https://en.wikipedia.org/wiki/Mary_Lou_Retton']" +93,How many of Hitler's three couriers of his last will and other political documents died after 1980?,1,https://en.wikipedia.org/wiki/Last_will_and_testament_of_Adolf_Hitler,https://en.wikipedia.org/wiki/Willy_Johannmeyer,https://en.wikipedia.org/wiki/Wilhelm_Zander,https://en.wikipedia.org/wiki/Heinz_Lorenz,,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/Last_will_and_testament_of_Adolf_Hitler', 'https://en.wikipedia.org/wiki/Willy_Johannmeyer', 'https://en.wikipedia.org/wiki/Wilhelm_Zander', 'https://en.wikipedia.org/wiki/Heinz_Lorenz']" +94,The inventor of the first true pinhole camera was also the first to correctly explain what theory?,The Theory of Vision,https://en.wikipedia.org/wiki/Photography,https://en.wikipedia.org/wiki/Ibn_al-Haytham,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Photography', 'https://en.wikipedia.org/wiki/Ibn_al-Haytham']" +95,"As of August 2024, the Atlanta Braves beat the Houston Astros the last time they won the World Series. How many years before this did Jackie Robinson join the Brooklyn Dodgers?",74 years,https://en.wikipedia.org/wiki/List_of_World_Series_champions#World_Series_results,https://en.wikipedia.org/wiki/2021_Atlanta_Braves_season,https://en.wikipedia.org/wiki/2021_Houston_Astros_season,https://en.wikipedia.org/wiki/Jackie_Robinson,,,,,,,,Numerical reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/List_of_World_Series_champions#World_Series_results', 'https://en.wikipedia.org/wiki/2021_Atlanta_Braves_season', 'https://en.wikipedia.org/wiki/2021_Houston_Astros_season', 'https://en.wikipedia.org/wiki/Jackie_Robinson']" +96,In which of the three Intertidal zones would you most likely find the Septifer bilocularis?,Low Intertidal Zone,https://en.wikipedia.org/wiki/Septifer_bilocularis,https://en.wikipedia.org/wiki/Mytilidae,https://en.wikipedia.org/wiki/Intertidal_zone,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Septifer_bilocularis', 'https://en.wikipedia.org/wiki/Mytilidae', 'https://en.wikipedia.org/wiki/Intertidal_zone']" +97,"In the Eurovision Song Contest 2024, one country scored a combined total (jury and televoting results) of 268 - as did another country in the contest the year before. Which countries are they?",Italy and Norway,https://en.wikipedia.org/wiki/Eurovision_Song_Contest_2024,https://en.wikipedia.org/wiki/Eurovision_Song_Contest_2023,,,,,,,,,,Tabular reasoning,"['https://en.wikipedia.org/wiki/Eurovision_Song_Contest_2024', 'https://en.wikipedia.org/wiki/Eurovision_Song_Contest_2023']" +98,"The quarterback who was selected first overall in the 1998 NFL draft, won the Superbowl with 2 different teams, both named after what type of animal?",A horse,https://en.wikipedia.org/wiki/1998_NFL_draft#Player_selections,https://en.wikipedia.org/wiki/Peyton_Manning,https://en.wikipedia.org/wiki/Colt,https://en.wikipedia.org/w/index.php?title=Bronco&redirect=no,,,,,,,,Tabular reasoning,"['https://en.wikipedia.org/wiki/1998_NFL_draft#Player_selections', 'https://en.wikipedia.org/wiki/Peyton_Manning', 'https://en.wikipedia.org/wiki/Colt', 'https://en.wikipedia.org/w/index.php?title=Bronco&redirect=no']" +99,"As of 1st August 2024, How much younger is the current youngest US House Representative than the American folk hero who died at the Alamo when they were elected to the US House of Representatives?",14 years younger,https://en.wikipedia.org/wiki/List_of_current_members_of_the_United_States_House_of_Representatives,https://en.wikipedia.org/wiki/Battle_of_the_Alamo,https://en.wikipedia.org/wiki/Davy_Crockett,,,,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/List_of_current_members_of_the_United_States_House_of_Representatives', 'https://en.wikipedia.org/wiki/Battle_of_the_Alamo', 'https://en.wikipedia.org/wiki/Davy_Crockett']" +100,"Who was older, the guitar player for the Dugites from 1982-1983 or the lead singer of The Sports?",Andrew Pendlebury,https://en.wikipedia.org/wiki/The_Dugites,https://en.wikipedia.org/wiki/Andrew_Pendlebury,https://en.wikipedia.org/wiki/Stephen_Cummings,,,,,,,,,Numerical reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/The_Dugites', 'https://en.wikipedia.org/wiki/Andrew_Pendlebury', 'https://en.wikipedia.org/wiki/Stephen_Cummings']" +101,"In the first movie that Emma Stone won an Academy Award for Best Actress in, did her costar win an Academy Award for Best Actor?","Ryan Gosling was nominated for an Academy Award for Best Actor in La La Land, but didn't win.",https://en.wikipedia.org/wiki/Emma_Stone,https://en.wikipedia.org/wiki/La_La_Land#Cast,https://en.wikipedia.org/wiki/89th_Academy_Awards,,,,,,,,,Multiple constraints | Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/Emma_Stone', 'https://en.wikipedia.org/wiki/La_La_Land#Cast', 'https://en.wikipedia.org/wiki/89th_Academy_Awards']" +102,"As of August 4, 2024, what is the first initial and surname of the cricketer who became the top-rated test batsman in the 2020s, is the fastest player of their country to 6 1000 run milestones in tests, and became their country's all-time leading run scorer in tests in the same year?",K. Williamson,https://en.wikipedia.org/wiki/ICC_men%27s_player_rankings#Top_10_Test_batsmen,https://en.wikipedia.org/wiki/Kane_Williamson,https://en.wikipedia.org/wiki/List_of_New_Zealand_Test_cricket_records#Most_career_runs,,,,,,,,,Tabular reasoning | Multiple constraints | Post processing,"['https://en.wikipedia.org/wiki/ICC_men%27s_player_rankings#Top_10_Test_batsmen', 'https://en.wikipedia.org/wiki/Kane_Williamson', 'https://en.wikipedia.org/wiki/List_of_New_Zealand_Test_cricket_records#Most_career_runs']" +103,"As of 2024, what percentage of Afroasiatic language speakers speak Central Atlas Tamazight?",0.49%,https://en.wikipedia.org/wiki/Afroasiatic_languages,https://en.wikipedia.org/wiki/Central_Atlas_Tamazight,,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/Afroasiatic_languages', 'https://en.wikipedia.org/wiki/Central_Atlas_Tamazight']" +104,"""The Terminator"" was released on October 26th exactly how many years after the famous gunfight at the O.K. Corral occurred?",103,https://en.wikipedia.org/wiki/Gunfight_at_the_O.K._Corral,https://en.wikipedia.org/wiki/The_Terminator,,,,,,,,,,Numerical reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Gunfight_at_the_O.K._Corral', 'https://en.wikipedia.org/wiki/The_Terminator']" +105,"If an Ixodes scapularis tick in its nymph stage feeds on a host in the Spring, how many seasons later is it most likely to transmit Lyme disease if it becomes an adult in the same year?",Two seasons.,https://en.wikipedia.org/wiki/Tick,https://en.wikipedia.org/wiki/Lyme_disease,,,,,,,,,,Numerical reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/Tick', 'https://en.wikipedia.org/wiki/Lyme_disease']" +106,What percentage of his total league appearances did footballer Derek Smith (born 1946) make with the team whose original name is shared by a bird impressionist born in the nineteenth century? Give your answer to two decimal places.,95.35%,"https://en.wikipedia.org/wiki/Derek_Smith_(footballer,_born_1946)",https://en.wikipedia.org/wiki/Tranmere_Rovers_F.C.,https://en.wikipedia.org/wiki/Ellesmere_Port_Town_F.C.,https://en.wikipedia.org/wiki/Joe_Belmont_(bird_impressionist),,,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints | Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/Derek_Smith_(footballer,_born_1946)', 'https://en.wikipedia.org/wiki/Tranmere_Rovers_F.C.', 'https://en.wikipedia.org/wiki/Ellesmere_Port_Town_F.C.', 'https://en.wikipedia.org/wiki/Joe_Belmont_(bird_impressionist)']" +107,Was the founder of the bank that was established 42 years before the National Banking Act was expanded to include the Territory of Hawai'i still alive when it acquired Pioneer Federal Savings Bank?,No,https://en.wikipedia.org/wiki/U.S._national_banks_of_Hawaii,https://en.wikipedia.org/wiki/Charles_Reed_Bishop,https://en.wikipedia.org/wiki/First_Hawaiian_Bank,,,,,,,,,Numerical reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/U.S._national_banks_of_Hawaii', 'https://en.wikipedia.org/wiki/Charles_Reed_Bishop', 'https://en.wikipedia.org/wiki/First_Hawaiian_Bank']" +108,"If you take the height of the Eiffel Tower in metres, add the number of arrondissements in Paris and subtract the street number of the official residence of the French prime minister, is the final answer a prime number?",Yes,https://en.wikipedia.org/wiki/Eiffel_Tower,https://en.wikipedia.org/wiki/Arrondissements_of_Paris,https://en.wikipedia.org/wiki/H%C3%B4tel_Matignon,,,,,,,,,Numerical reasoning | Post processing,"['https://en.wikipedia.org/wiki/Eiffel_Tower', 'https://en.wikipedia.org/wiki/Arrondissements_of_Paris', 'https://en.wikipedia.org/wiki/H%C3%B4tel_Matignon']" +109,"Of the 3 largest canary islands, which has the hottest mean temperature in July as of 2024?",Tenerife,https://en.wikipedia.org/wiki/Canary_Islands,https://en.wikipedia.org/wiki/Tenerife,https://en.wikipedia.org/wiki/Fuerteventura,https://en.wikipedia.org/wiki/Gran_Canaria,,,,,,,,Numerical reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Canary_Islands', 'https://en.wikipedia.org/wiki/Tenerife', 'https://en.wikipedia.org/wiki/Fuerteventura', 'https://en.wikipedia.org/wiki/Gran_Canaria']" +110,"As of 1st August 2024, Are the actors who play Summer and Luke in the OC in anything else together?","Yes, Nashville series 5.",https://en.wikipedia.org/wiki/The_O.C.,https://en.wikipedia.org/wiki/Rachel_Bilson,https://en.wikipedia.org/wiki/Chris_Carmack,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/The_O.C.', 'https://en.wikipedia.org/wiki/Rachel_Bilson', 'https://en.wikipedia.org/wiki/Chris_Carmack']" +111,How many years was the first vessel Stephen Etnier commanded constructed after the novel that first inspired him to pursue painting was published?,7,https://en.wikipedia.org/wiki/Stephen_Etnier,https://en.wikipedia.org/wiki/The_Moon_and_Sixpence,https://en.wikipedia.org/wiki/USS_Mizpah,,,,,,,,,Numerical reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Stephen_Etnier', 'https://en.wikipedia.org/wiki/The_Moon_and_Sixpence', 'https://en.wikipedia.org/wiki/USS_Mizpah']" +112,"As of the financial year ending July 31st 2023, what was the size of the endowment at the university attended by rugby player Fred McLeod?",£559.8 million,https://en.wikipedia.org/wiki/Fred_McLeod_(rugby_union),https://en.wikipedia.org/wiki/University_of_Edinburgh,,,,,,,,,,Tabular reasoning,"['https://en.wikipedia.org/wiki/Fred_McLeod_(rugby_union)', 'https://en.wikipedia.org/wiki/University_of_Edinburgh']" +113,What medal was the captain of The RMS Titanic awarded by King Edward VII?,The Transport Medal,https://en.wikipedia.org/wiki/Titanic,https://en.wikipedia.org/wiki/Edward_Smith_(sea_captain),,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Titanic', 'https://en.wikipedia.org/wiki/Edward_Smith_(sea_captain)']" +114,What painting was stolen from The Louvre exactly 56 years before the birth of activist and songwriter Serj Tankian?,The Mona Lisa,"https://en.wikipedia.org/wiki/Mona_Lisa#Refuge,_theft,_and_vandalism",https://en.wikipedia.org/wiki/Serj_Tankian,,,,,,,,,,Numerical reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/Mona_Lisa#Refuge,_theft,_and_vandalism', 'https://en.wikipedia.org/wiki/Serj_Tankian']" +115,"As of August 1, 2024, which player who scored more than 10 goals in the 2022 Argentine Premier League season also played for Elche in Spain?",Franco Cristaldo scored more than 10 goals that year and also played for Elche.,https://en.wikipedia.org/wiki/2022_Argentine_Primera_División,https://en.wikipedia.org/wiki/Mateo_Retegui,https://en.wikipedia.org/wiki/Franco_Cristaldo,https://en.wikipedia.org/wiki/Enzo_Copetti,,,,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/2022_Argentine_Primera_División', 'https://en.wikipedia.org/wiki/Mateo_Retegui', 'https://en.wikipedia.org/wiki/Franco_Cristaldo', 'https://en.wikipedia.org/wiki/Enzo_Copetti']" +116,"As of 2024, how many of the Star Wars actors whose first or last names are 'Jack' have starred in more than 2 Star wars movies?",2,https://en.wikipedia.org/wiki/List_of_Star_Wars_film_actors#Introduced_in_The_Skywalker_Saga,https://en.wikipedia.org/wiki/Jack_Purvis_(actor),https://en.wikipedia.org/wiki/Jack_Thompson_(actor),https://en.wikipedia.org/wiki/Andrew_Jack_(dialect_coach),,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/List_of_Star_Wars_film_actors#Introduced_in_The_Skywalker_Saga', 'https://en.wikipedia.org/wiki/Jack_Purvis_(actor)', 'https://en.wikipedia.org/wiki/Jack_Thompson_(actor)', 'https://en.wikipedia.org/wiki/Andrew_Jack_(dialect_coach)']" +117,"I am thinking of a province that has the smallest land area in it's particular country, but also has the the 10th largest population. This country has 10 provinces. This province joined the country in 1873. What is the scientific name of the provincial flower?",Cypripedium Acaule,https://en.wikipedia.org/wiki/Province,https://en.wikipedia.org/wiki/Provinces_and_territories_of_Canada,https://en.wikipedia.org/wiki/Prince_Edward_Island,https://en.wikipedia.org/wiki/Cypripedium_acaule,,,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Province', 'https://en.wikipedia.org/wiki/Provinces_and_territories_of_Canada', 'https://en.wikipedia.org/wiki/Prince_Edward_Island', 'https://en.wikipedia.org/wiki/Cypripedium_acaule']" +118,"As of 1st January 2023, If I am 7 years younger than the eldest granddaughter of the female monarch with the longest reign in confirmed history was at the time of the monarch's death, how old am I?",34,https://en.wikipedia.org/wiki/List_of_longest-reigning_monarchs,https://en.wikipedia.org/wiki/Elizabeth_II,"https://en.wikipedia.org/wiki/Anne,_Princess_Royal",https://en.wikipedia.org/wiki/Zara_Tindall,https://en.wikipedia.org/wiki/Charles_III,"https://en.wikipedia.org/wiki/Prince_Andrew,_Duke_of_York","https://en.wikipedia.org/wiki/Prince_Edward,_Duke_of_Edinburgh",,,,,Numerical reasoning | Tabular reasoning | Multiple constraints | Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/List_of_longest-reigning_monarchs', 'https://en.wikipedia.org/wiki/Elizabeth_II', 'https://en.wikipedia.org/wiki/Anne,_Princess_Royal', 'https://en.wikipedia.org/wiki/Zara_Tindall', 'https://en.wikipedia.org/wiki/Charles_III', 'https://en.wikipedia.org/wiki/Prince_Andrew,_Duke_of_York', 'https://en.wikipedia.org/wiki/Prince_Edward,_Duke_of_Edinburgh']" +119,The Office is an American mockumentary sitcom television series that first aired in 2005. Who won the Academy Award for Best Director the same year that the show had its series finale?,Ang Lee won the award for best director for Life of Pi.,https://en.wikipedia.org/wiki/The_Office_(American_TV_series),https://en.wikipedia.org/wiki/85th_Academy_Awards,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/The_Office_(American_TV_series)', 'https://en.wikipedia.org/wiki/85th_Academy_Awards']" +120,"In square KM, how much bigger is the total area of La Casita-Garciasville, Texas compared to that of the independent state that was recognised in the 1929 Lateran Treaty?",10.81,"https://en.wikipedia.org/wiki/La_Casita-Garciasville,_Texas",https://en.wikipedia.org/wiki/Lateran_Treaty,https://en.wikipedia.org/wiki/Vatican_City,,,,,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/La_Casita-Garciasville,_Texas', 'https://en.wikipedia.org/wiki/Lateran_Treaty', 'https://en.wikipedia.org/wiki/Vatican_City']" +121,"Which fast food restaurant opened first, McDonald's, Wendy's or In-and-Out?",McDonald's in 1940,https://en.wikipedia.org/wiki/McDonald%27s,https://en.wikipedia.org/wiki/In-N-Out_Burger,https://en.wikipedia.org/wiki/Wendy%27s,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/McDonald%27s', 'https://en.wikipedia.org/wiki/In-N-Out_Burger', 'https://en.wikipedia.org/wiki/Wendy%27s']" +122,How much wider in centimeters is the painting that inspired a Stephen Sondheim musical than the 10th most expensive painting ever sold?,211.2 cm,https://en.wikipedia.org/wiki/Works_of_Stephen_Sondheim,https://en.wikipedia.org/wiki/A_Sunday_Afternoon_on_the_Island_of_La_Grande_Jatte,https://en.wikipedia.org/wiki/List_of_most_expensive_paintings,"https://en.wikipedia.org/wiki/The_Standard_Bearer_(Rembrandt,_1636)",,,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Works_of_Stephen_Sondheim', 'https://en.wikipedia.org/wiki/A_Sunday_Afternoon_on_the_Island_of_La_Grande_Jatte', 'https://en.wikipedia.org/wiki/List_of_most_expensive_paintings', 'https://en.wikipedia.org/wiki/The_Standard_Bearer_(Rembrandt,_1636)']" +123,"What is the birthplace and hometown of the winning goal scorer of the 2010 Vancouver Olympics, Men's Ice Hockey event?",Halifax,https://en.wikipedia.org/wiki/Ice_hockey_at_the_2010_Winter_Olympics_%E2%80%93_Men%27s_tournament,https://en.wikipedia.org/wiki/Sidney_Crosby,,,,,,,,,,Tabular reasoning,"['https://en.wikipedia.org/wiki/Ice_hockey_at_the_2010_Winter_Olympics_%E2%80%93_Men%27s_tournament', 'https://en.wikipedia.org/wiki/Sidney_Crosby']" +124,Who won the World series the year Happy Days premiered?,The Oakland Athletics,https://en.wikipedia.org/wiki/Happy_Days#Characters,https://en.wikipedia.org/wiki/1974_World_Series,,,,,,,,,,Numerical reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Happy_Days#Characters', 'https://en.wikipedia.org/wiki/1974_World_Series']" +125,How many days after the United States release of the record holder for largest sweep at the 2004 Oscars was the death of that movie's cinematographer?,4149 days,https://en.wikipedia.org/wiki/List_of_Academy_Award_records,https://en.wikipedia.org/wiki/The_Lord_of_the_Rings:_The_Return_of_the_King,https://en.wikipedia.org/wiki/Andrew_Lesnie,,,,,,,,,Numerical reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/List_of_Academy_Award_records', 'https://en.wikipedia.org/wiki/The_Lord_of_the_Rings:_The_Return_of_the_King', 'https://en.wikipedia.org/wiki/Andrew_Lesnie']" +126,"In the Belgian capital, there is a street named after King Leopold II's oldest daughter which is lined with chestnut trees and is the home to many embassies. What is the capital of the country whose embassy is found at number 425?",Zagreb,https://en.wikipedia.org/wiki/Belgium,https://en.wikipedia.org/wiki/City_of_Brussels,https://en.wikipedia.org/wiki/Avenue_Louise,https://en.wikipedia.org/wiki/Croatia,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Belgium', 'https://en.wikipedia.org/wiki/City_of_Brussels', 'https://en.wikipedia.org/wiki/Avenue_Louise', 'https://en.wikipedia.org/wiki/Croatia']" +127,I'm thinking of a man whose first name is Hart. He acted in a miniseries (based on a historical novel about WW2 with three words in its title) by an author who also wrote a book about a kid with the last name of Bookbinder.,Hart Bochner,https://en.wikipedia.org/wiki/City_Boy:_The_Adventures_of_Herbie_Bookbinder,https://en.wikipedia.org/wiki/Herman_Wouk,https://en.wikipedia.org/wiki/War_and_Remembrance_(miniseries),https://en.wikipedia.org/wiki/Hart_Bochner,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/City_Boy:_The_Adventures_of_Herbie_Bookbinder', 'https://en.wikipedia.org/wiki/Herman_Wouk', 'https://en.wikipedia.org/wiki/War_and_Remembrance_(miniseries)', 'https://en.wikipedia.org/wiki/Hart_Bochner']" +128,Name the teams in alphabetical order that every AL MVP from Texas Rangers retired from as of August 2024.,"Cleveland Indians, New York Yankees, Texas Rangers, Toronto Blue Jays",https://en.wikipedia.org/wiki/Texas_Rangers_award_winners_and_league_leaders,https://en.wikipedia.org/wiki/Jeff_Burroughs,https://en.wikipedia.org/wiki/Juan_Gonz%C3%A1lez_(baseball),https://en.wikipedia.org/wiki/Iv%C3%A1n_Rodr%C3%ADguez,https://en.wikipedia.org/wiki/Alex_Rodriguez,https://en.wikipedia.org/wiki/Josh_Hamilton,,,,,,Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/Texas_Rangers_award_winners_and_league_leaders', 'https://en.wikipedia.org/wiki/Jeff_Burroughs', 'https://en.wikipedia.org/wiki/Juan_Gonz%C3%A1lez_(baseball)', 'https://en.wikipedia.org/wiki/Iv%C3%A1n_Rodr%C3%ADguez', 'https://en.wikipedia.org/wiki/Alex_Rodriguez', 'https://en.wikipedia.org/wiki/Josh_Hamilton']" +129,What films did Big Hit Music's 7-member boy group release in the year that the company's 5-member boy group first debuted?,Love Yourself in Seoul (2019) and Bring the Soul: The Movie (2019),https://en.wikipedia.org/wiki/Big_Hit_Music#Groups,https://en.wikipedia.org/wiki/BTS#,https://en.wikipedia.org/wiki/Tomorrow_X_Together,,,,,,,,,Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Big_Hit_Music#Groups', 'https://en.wikipedia.org/wiki/BTS#', 'https://en.wikipedia.org/wiki/Tomorrow_X_Together']" +130,What year was the University that gave Taylor Swift an honorary doctorate founded?,1831 (New York University),https://en.wikipedia.org/wiki/List_of_awards_and_nominations_received_by_Taylor_Swift#Honorary_degree,https://en.wikipedia.org/wiki/New_York_University,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/List_of_awards_and_nominations_received_by_Taylor_Swift#Honorary_degree', 'https://en.wikipedia.org/wiki/New_York_University']" +131,What is the etymology of the name of the province to the east of the province in which Hazrati Sultan District is located?,"The Dari name 'Baghlan' comes from the Bactrian 'Bagolango', meaning 'image-temple'",https://en.wikipedia.org/wiki/Hazrati_Sultan_District,https://en.wikipedia.org/wiki/Samangan_Province,https://en.wikipedia.org/wiki/Baghlan_Province,https://en.wikipedia.org/wiki/Kushan_Empire,https://en.wikipedia.org/wiki/Bactrian_language,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Hazrati_Sultan_District', 'https://en.wikipedia.org/wiki/Samangan_Province', 'https://en.wikipedia.org/wiki/Baghlan_Province', 'https://en.wikipedia.org/wiki/Kushan_Empire', 'https://en.wikipedia.org/wiki/Bactrian_language']" +132,"Consider the number of months lapsing between the major earthquake that caused widespread destruction around the Bay of Naples in 62 CE to the eruption of Mount Vesuvius in 79 CE which buried Pompeii in ash. If Mount Vesuvius were to have erupted once whenever that number of months came to pass between its 79 CE eruption and the date on which ""Pompeii"" by Bastille was officially released, how many times would the volcano have erupted between those two events?",109 times,https://en.wikipedia.org/wiki/Eruption_of_Mount_Vesuvius_in_79_AD,https://en.wikipedia.org/wiki/Pompeii_(song),,,,,,,,,,Numerical reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/Eruption_of_Mount_Vesuvius_in_79_AD', 'https://en.wikipedia.org/wiki/Pompeii_(song)']" +133,"Multiple the number of Tony's won by the guest host of SNL 12/6/1997 by the number of Oscar nominations received by the 2023 film directed by Greta Gerwig. Then divide this number by the number of Grammy's won by the band behind the 1979 album ""Tusk"".",12,https://en.wikipedia.org/wiki/Saturday_Night_Live_season_23,https://en.wikipedia.org/wiki/Nathan_Lane,https://en.wikipedia.org/wiki/List_of_awards_and_nominations_received_by_Nathan_Lane,https://en.wikipedia.org/wiki/Barbie_(film),https://en.wikipedia.org/wiki/List_of_accolades_received_by_Barbie_(film),https://en.wikipedia.org/wiki/Tusk_(album),https://en.wikipedia.org/wiki/Fleetwood_Mac#Grammy_Awards,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints | Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/Saturday_Night_Live_season_23', 'https://en.wikipedia.org/wiki/Nathan_Lane', 'https://en.wikipedia.org/wiki/List_of_awards_and_nominations_received_by_Nathan_Lane', 'https://en.wikipedia.org/wiki/Barbie_(film)', 'https://en.wikipedia.org/wiki/List_of_accolades_received_by_Barbie_(film)', 'https://en.wikipedia.org/wiki/Tusk_(album)', 'https://en.wikipedia.org/wiki/Fleetwood_Mac#Grammy_Awards']" +134,"On March 3rd during the year of Mariah Carey's birth, a famous space launch occurred and the mission lasted for how many days?",10 days,https://en.wikipedia.org/wiki/Mariah_Carey,https://en.wikipedia.org/wiki/1969,https://en.wikipedia.org/wiki/Apollo_9,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Mariah_Carey', 'https://en.wikipedia.org/wiki/1969', 'https://en.wikipedia.org/wiki/Apollo_9']" +135,What is the Chinese name for the bodhisattva that the Sensoji temple is dedicated to?,Guanyin,https://en.wikipedia.org/wiki/Sens%C5%8D-ji,https://en.wikipedia.org/wiki/Avalokite%C5%9Bvara,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Sens%C5%8D-ji', 'https://en.wikipedia.org/wiki/Avalokite%C5%9Bvara']" +136,Which president of the United States is represented by the sum of the ordinal numbers of the presidencies of the four men depicted on Mount Rushmore?,Joe Biden,https://en.wikipedia.org/wiki/Mount_Rushmore#History,https://en.wikipedia.org/wiki/George_Washington,https://en.wikipedia.org/wiki/Thomas_Jefferson,https://en.wikipedia.org/wiki/Theodore_Roosevelt,https://en.wikipedia.org/wiki/Abraham_Lincoln,https://en.wikipedia.org/wiki/Presidency_of_Joe_Biden,https://en.wikipedia.org/wiki/Ordinal_number,,,,,Numerical reasoning | Post processing,"['https://en.wikipedia.org/wiki/Mount_Rushmore#History', 'https://en.wikipedia.org/wiki/George_Washington', 'https://en.wikipedia.org/wiki/Thomas_Jefferson', 'https://en.wikipedia.org/wiki/Theodore_Roosevelt', 'https://en.wikipedia.org/wiki/Abraham_Lincoln', 'https://en.wikipedia.org/wiki/Presidency_of_Joe_Biden', 'https://en.wikipedia.org/wiki/Ordinal_number']" +137,"If we consider their inception being the date of declared independence, how many years older is the US than Mexico?",45 years,https://en.wikipedia.org/wiki/Mexico,https://en.wikipedia.org/wiki/United_States,,,,,,,,,,Numerical reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/Mexico', 'https://en.wikipedia.org/wiki/United_States']" +138,The manager of the Schenectady Blue Jays in 1953 also played Major League Baseball for which teams?,"Skeeter Newsome - Philadelphia Athletics, Boston Red Sox, and Philadelphia Phillies",https://en.wikipedia.org/wiki/Schenectady_Blue_Jays,https://en.wikipedia.org/wiki/Skeeter_Newsome,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Schenectady_Blue_Jays', 'https://en.wikipedia.org/wiki/Skeeter_Newsome']" +139,"What is the name of the play written in May 2016 by a playwright who won the MacArthur Fellowship the same year as the poet who wrote ""Postcolonial Love Poem""?",Skeleton Crew,https://en.wikipedia.org/wiki/Postcolonial_Love_Poem,https://en.wikipedia.org/wiki/Natalie_Diaz,https://en.wikipedia.org/wiki/MacArthur_Fellows_Program#,https://en.wikipedia.org/wiki/Dominique_Morisseau,,,,,,,,Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Postcolonial_Love_Poem', 'https://en.wikipedia.org/wiki/Natalie_Diaz', 'https://en.wikipedia.org/wiki/MacArthur_Fellows_Program#', 'https://en.wikipedia.org/wiki/Dominique_Morisseau']" +140,What is the birth date of the person picked right after Lee Vaughn in the 1997 NFL draft?,"August 24, 1974",https://en.wikipedia.org/wiki/Lee_Vaughn,https://en.wikipedia.org/wiki/1997_NFL_draft#Round_6,https://en.wikipedia.org/wiki/Tony_McCombs,,,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/Lee_Vaughn', 'https://en.wikipedia.org/wiki/1997_NFL_draft#Round_6', 'https://en.wikipedia.org/wiki/Tony_McCombs']" +141,"Suppose Egon Sendler's book ""Les mystères du Christ: Icônes de la liturgie"" was written in the same year as the birth of Nemanja Markovic. How old would the book be when the New Hampshire state election results for the Democratic party were 53.9% and 125,822 votes?",51 years.,https://en.wikipedia.org/wiki/Egon_Sendler,https://en.wikipedia.org/wiki/Elections_in_New_Hampshire,https://en.wikipedia.org/wiki/Nemanja_Markovi%C4%87,,,,,,,,,Numerical reasoning | Tabular reasoning,"['https://en.wikipedia.org/wiki/Egon_Sendler', 'https://en.wikipedia.org/wiki/Elections_in_New_Hampshire', 'https://en.wikipedia.org/wiki/Nemanja_Markovi%C4%87']" +142,"As of August 5, 2024, what is the name of the federal law that was found to be violated by the company that Brian Bergstein is employed by?",The Sherman Antitrust Act,https://en.wikipedia.org/wiki/Brian_Bergstein,https://en.wikipedia.org/wiki/Associated_Press,https://en.wikipedia.org/wiki/Associated_Press_v._United_States,https://en.wikipedia.org/wiki/Sherman_Antitrust_Act,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Brian_Bergstein', 'https://en.wikipedia.org/wiki/Associated_Press', 'https://en.wikipedia.org/wiki/Associated_Press_v._United_States', 'https://en.wikipedia.org/wiki/Sherman_Antitrust_Act']" +143,How many more votes did the Conservatives receive in the Highlands and Islands region in the 2021 Scottish Parliamentary Elections than in 2016?,"16,086",https://en.wikipedia.org/wiki/2021_Scottish_Parliament_election,https://en.wikipedia.org/wiki/2016_Scottish_Parliament_election,,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/2021_Scottish_Parliament_election', 'https://en.wikipedia.org/wiki/2016_Scottish_Parliament_election']" +144,What lake in Isreal supports a population of the state bird of the Indian state Haryana?,The Sea of Galilee,https://en.wikipedia.org/wiki/Sea_of_Galilee,https://en.wikipedia.org/wiki/Black_francolin,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Sea_of_Galilee', 'https://en.wikipedia.org/wiki/Black_francolin']" +145,"As of August 3, 2024, what is the sum of the birth years of every tennis player to both represent the country that tennis was first played and complete a Grand Slam.",5980,https://en.wikipedia.org/wiki/Tennis,https://en.wikipedia.org/wiki/Grand_Slam_(tennis),https://en.wikipedia.org/wiki/Jordanne_Whiley,https://en.wikipedia.org/wiki/Gordon_Reid_(tennis),https://en.wikipedia.org/wiki/Alfie_Hewett,,,,,,,Tabular reasoning | Multiple constraints | Post processing,"['https://en.wikipedia.org/wiki/Tennis', 'https://en.wikipedia.org/wiki/Grand_Slam_(tennis)', 'https://en.wikipedia.org/wiki/Jordanne_Whiley', 'https://en.wikipedia.org/wiki/Gordon_Reid_(tennis)', 'https://en.wikipedia.org/wiki/Alfie_Hewett']" +146,Rosie Ruiz was disqualified from the Boston Marathon. The subsequent winner placed in what position of the 1988 Grandma's Marathon?,Jacqueline Gareau placed first in the 1988 Grandma's Marathon.,"https://en.wikipedia.org/wiki/Boston_Marathon#Rosie_Ruiz,_the_impostor",https://en.wikipedia.org/wiki/Jacqueline_Gareau,,,,,,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Boston_Marathon#Rosie_Ruiz,_the_impostor', 'https://en.wikipedia.org/wiki/Jacqueline_Gareau']" +147,"On Nov 15, 2017 a painting was sold for US $450 million setting a new record for the most expensive painting ever sold at public auction. What year was the auction house where this purchase took place founded?",1766,https://en.wikipedia.org/wiki/List_of_most_expensive_paintings,https://en.wikipedia.org/wiki/Christie%27s,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/List_of_most_expensive_paintings', 'https://en.wikipedia.org/wiki/Christie%27s']" +148,"A disease that had millions of dollars raised for on April 20, 1992, was first recognized by the Center for Disease Control and Prevention (CDC) in what year?",1981,https://en.wikipedia.org/wiki/1992,https://en.wikipedia.org/wiki/HIV/AIDS,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/1992', 'https://en.wikipedia.org/wiki/HIV/AIDS']" +149,"As of July 1, 2023, what is the total number of letters in the names of the capital cities of the 5 most populated countries in the world?",43,https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations),https://en.wikipedia.org/wiki/India,https://en.wikipedia.org/wiki/China,https://en.wikipedia.org/wiki/United_States,https://en.wikipedia.org/wiki/Indonesia,https://en.wikipedia.org/wiki/Pakistan,,,,,,Numerical reasoning | Post processing,"['https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)', 'https://en.wikipedia.org/wiki/India', 'https://en.wikipedia.org/wiki/China', 'https://en.wikipedia.org/wiki/United_States', 'https://en.wikipedia.org/wiki/Indonesia', 'https://en.wikipedia.org/wiki/Pakistan']" +150,"Where was the rapper behind the song ""Hind's Hall"" born? ","Seattle, Washington",https://en.wikipedia.org/wiki/Hind%27s_Hall,https://en.wikipedia.org/wiki/Macklemore,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Hind%27s_Hall', 'https://en.wikipedia.org/wiki/Macklemore']" +151,"Based on the information available on Wikipedia on August 4, 2024 at 2:42 AM Greenwich Mean Time, which of the following areas, Minamidaitōjima, Nuapada district, or Vostochnaya Niva has the highest population and how many more citizens does it contain than the other two provided cities?","The Nuapada district contains 608,269 more people than the other two areas combined.",https://en.wikipedia.org/wiki/Nuapada_district,https://en.wikipedia.org/wiki/Vostochnaya_Niva,https://en.wikipedia.org/wiki/Minamidait%C5%8Djima,,,,,,,,,Numerical reasoning | Post processing,"['https://en.wikipedia.org/wiki/Nuapada_district', 'https://en.wikipedia.org/wiki/Vostochnaya_Niva', 'https://en.wikipedia.org/wiki/Minamidait%C5%8Djima']" +152,What are the first three letters of the capital city of the country where Shakespeare's longest play is set?,Cop,https://en.wikipedia.org/wiki/Hamlet,https://en.wikipedia.org/wiki/Denmark,,,,,,,,,,Multiple constraints | Post processing,"['https://en.wikipedia.org/wiki/Hamlet', 'https://en.wikipedia.org/wiki/Denmark']" +153,How many of Ron Hutchinson's teams won League Championships while he was on them?,"Of the six hockey teams that Hutchinson played on during his career, two, the Flin Flon Bombers and the Vancouver Canucks, took home League Championships during his time on the roster. ",https://en.wikipedia.org/wiki/Ron_Hutchinson_(ice_hockey),https://en.wikipedia.org/wiki/Flin_Flon_Bombers,https://en.wikipedia.org/wiki/Vancouver_Canucks_(WHL),https://en.wikipedia.org/wiki/New_York_Rangers,https://en.wikipedia.org/wiki/Seattle_Totems,https://en.wikipedia.org/wiki/Charlotte_Checkers_(1956%E2%80%931977),https://en.wikipedia.org/wiki/Cranbrook_Royals,,,,,Tabular reasoning,"['https://en.wikipedia.org/wiki/Ron_Hutchinson_(ice_hockey)', 'https://en.wikipedia.org/wiki/Flin_Flon_Bombers', 'https://en.wikipedia.org/wiki/Vancouver_Canucks_(WHL)', 'https://en.wikipedia.org/wiki/New_York_Rangers', 'https://en.wikipedia.org/wiki/Seattle_Totems', 'https://en.wikipedia.org/wiki/Charlotte_Checkers_(1956%E2%80%931977)', 'https://en.wikipedia.org/wiki/Cranbrook_Royals']" +154,What Pink Floyd album came out the year Pablo Picasso died?,Dark Side of the Moon,https://en.wikipedia.org/wiki/Pablo_Picasso,https://en.wikipedia.org/wiki/The_Dark_Side_of_the_Moon,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Pablo_Picasso', 'https://en.wikipedia.org/wiki/The_Dark_Side_of_the_Moon']" +155,"As of August 1 2024, what books has the author of the Harry Potter series written under an alias?","The Cuckoo's Calling, The Silkworm, Career of Evil, Lethal White, Troubled Blood, The Ink Black Heart, The Running Grave ",https://en.wikipedia.org/wiki/Harry_Potter,https://en.wikipedia.org/wiki/J._K._Rowling,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Harry_Potter', 'https://en.wikipedia.org/wiki/J._K._Rowling']" +156,What football team did Travis Kelce play for the year Taylor Swift's VMA acceptance speech was interrupted by Kanye West?,The University of Cincinnati Bearcats,https://en.wikipedia.org/wiki/Taylor_Swift,https://en.wikipedia.org/wiki/Travis_Kelce,,,,,,,,,,Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Taylor_Swift', 'https://en.wikipedia.org/wiki/Travis_Kelce']" +157,What horror movie remake did the director who was the first to attempt and failed to make Brokeback Mountian into a film direct in the 90's?,Psycho,https://en.wikipedia.org/wiki/Brokeback_Mountain,https://en.wikipedia.org/wiki/Gus_Van_Sant,https://en.wikipedia.org/wiki/Psycho_(1998_film),,,,,,,,,Tabular reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/Brokeback_Mountain', 'https://en.wikipedia.org/wiki/Gus_Van_Sant', 'https://en.wikipedia.org/wiki/Psycho_(1998_film)']" +158,"As of 2024, how many total Academy award nominations has the the man who won the Academy award for best actor one year before 1999 received?",12,https://en.wikipedia.org/wiki/70th_Academy_Awards,https://en.wikipedia.org/wiki/Jack_Nicholson,,,,,,,,,,Numerical reasoning | Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/70th_Academy_Awards', 'https://en.wikipedia.org/wiki/Jack_Nicholson']" +159,"What is the difference between the fastest recorded swimming speed of a fish, and the fastest record for swimming the 50m freestyle in the 2020 Tokyo Olympics in meters per second?",34.30 m/s,https://en.wikipedia.org/wiki/Fastest_animals#Fish,https://en.wikipedia.org/wiki/List_of_Olympic_records_in_swimming,,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/Fastest_animals#Fish', 'https://en.wikipedia.org/wiki/List_of_Olympic_records_in_swimming']" +160,How many years separate the birth of Alexander Graham Bell and the birth of Charles Dickens?,35,https://en.wikipedia.org/wiki/Alexander_Graham_Bell,https://en.wikipedia.org/wiki/Charles_Dickens,,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/Alexander_Graham_Bell', 'https://en.wikipedia.org/wiki/Charles_Dickens']" +161,Stamatios Krimigis was named after an asteroid by the IAU. The observing site that found this asteroid is part of a large observatory known for discovering a planet. What is the difference in years between the discovery of this asteroid and the discovery of the planet?,49 years,https://en.wikipedia.org/wiki/Stamatios_Krimigis,https://en.wikipedia.org/wiki/List_of_minor_planets:_8001%E2%80%939000#323c,https://en.wikipedia.org/wiki/Anderson_Mesa_Station,https://en.wikipedia.org/wiki/Lowell_Observatory,https://en.wikipedia.org/wiki/Pluto,,,,,,,Numerical reasoning | Tabular reasoning,"['https://en.wikipedia.org/wiki/Stamatios_Krimigis', 'https://en.wikipedia.org/wiki/List_of_minor_planets:_8001%E2%80%939000#323c', 'https://en.wikipedia.org/wiki/Anderson_Mesa_Station', 'https://en.wikipedia.org/wiki/Lowell_Observatory', 'https://en.wikipedia.org/wiki/Pluto']" +162,What two buildings can be said to have introduced the onset of the architectural style of Big Ben in London?,Dromore Cathedral and The Great Hall of Lambeth Palace,https://en.wikipedia.org/wiki/Big_Ben#Design,https://en.wikipedia.org/wiki/Gothic_Revival_architecture#Roots,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Big_Ben#Design', 'https://en.wikipedia.org/wiki/Gothic_Revival_architecture#Roots']" +163,How many years after Ghengis Khan died did World War II begin?,712 years,https://en.wikipedia.org/wiki/Genghis_Khan,https://en.wikipedia.org/wiki/World_War_II,,,,,,,,,,Numerical reasoning | Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/Genghis_Khan', 'https://en.wikipedia.org/wiki/World_War_II']" +164,"Among the singers in the 1985 version of ""That's What Friends Are For,"" which one was born in the Peach State?",Gladys Knight,https://en.wikipedia.org/wiki/That%27s_What_Friends_Are_For,https://en.wikipedia.org/wiki/Dionne_Warwick,https://en.wikipedia.org/wiki/Elton_John,https://en.wikipedia.org/wiki/Stevie_Wonder,https://en.wikipedia.org/wiki/Gladys_Knight,https://en.wikipedia.org/wiki/Georgia_(U.S._state),,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/That%27s_What_Friends_Are_For', 'https://en.wikipedia.org/wiki/Dionne_Warwick', 'https://en.wikipedia.org/wiki/Elton_John', 'https://en.wikipedia.org/wiki/Stevie_Wonder', 'https://en.wikipedia.org/wiki/Gladys_Knight', 'https://en.wikipedia.org/wiki/Georgia_(U.S._state)']" +165,On what station did the television show that started in 1993 and had a star who shared a name with the third wife of King Henry VIII first run?,CBS,https://en.wikipedia.org/wiki/Jane_Seymour,https://en.wikipedia.org/wiki/Jane_Seymour_(actress),"https://en.wikipedia.org/wiki/Dr._Quinn,_Medicine_Woman",,,,,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Jane_Seymour', 'https://en.wikipedia.org/wiki/Jane_Seymour_(actress)', 'https://en.wikipedia.org/wiki/Dr._Quinn,_Medicine_Woman']" +166,"As of January 1, 2024, are any members of Vampire Weekend Capricorn? If no, what are their signs?","No. Ezra Koenig is Aries, Chris Baio is Scorpio, and Chris Tomson is Pisces.",https://en.wikipedia.org/wiki/Vampire_Weekend,https://en.wikipedia.org/wiki/Ezra_Koenig,https://en.wikipedia.org/wiki/Chris_Baio,https://en.wikipedia.org/wiki/Chris_Tomson,https://en.wikipedia.org/wiki/Astrological_sign,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Vampire_Weekend', 'https://en.wikipedia.org/wiki/Ezra_Koenig', 'https://en.wikipedia.org/wiki/Chris_Baio', 'https://en.wikipedia.org/wiki/Chris_Tomson', 'https://en.wikipedia.org/wiki/Astrological_sign']" +167,"Who is married to the actor who plays Imdad Khan in the film version of The Wonderful Story of Henry Sugar, as of August 1, 2024 ?",Daniela Lavender,https://en.wikipedia.org/wiki/The_Wonderful_Story_of_Henry_Sugar_(film),https://en.wikipedia.org/wiki/Ben_Kingsley,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/The_Wonderful_Story_of_Henry_Sugar_(film)', 'https://en.wikipedia.org/wiki/Ben_Kingsley']" +168,"What is the difference in mean flow rate (in cubic feet per second) between the River Avon at Great Somerford and its tributary, the River Marden?",75.5 cubic ft/s,"https://en.wikipedia.org/wiki/River_Avon,_Bristol#Hydrology_and_water_quality",https://en.wikipedia.org/wiki/River_Marden,,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/River_Avon,_Bristol#Hydrology_and_water_quality', 'https://en.wikipedia.org/wiki/River_Marden']" +169,"I am thinking of a country. A former member of Swedish Parliament during 2002 – 2006 was born there. English is the official language but many other languages are spoken there. The Trans–West African Coastal Highway passes through this country. In June 2020, Democratic Party leaders in the United States caused controversy by wearing stoles made of cloth from this country.",Ghana,https://en.wikipedia.org/wiki/Joe_Frans_(politician),https://en.wikipedia.org/wiki/Languages_of_Ghana,https://en.wikipedia.org/wiki/Trans%E2%80%93West_African_Coastal_Highway,https://en.wikipedia.org/wiki/Kente_cloth#cite_note-CNN-2020-06-08-18,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Joe_Frans_(politician)', 'https://en.wikipedia.org/wiki/Languages_of_Ghana', 'https://en.wikipedia.org/wiki/Trans%E2%80%93West_African_Coastal_Highway', 'https://en.wikipedia.org/wiki/Kente_cloth#cite_note-CNN-2020-06-08-18']" +170,"Which film, based loosely on the story of Frederick 'Fritz' Niland, won the Golden Globe for best drama at the 56th Golden Globes in 1999?","Saving Private Ryan was loosely based on the story of Frederick 'Fritz' Niland, and won the Golden Globe for best drama at the 56th Golden Globes in 1999.",https://en.wikipedia.org/wiki/Niland_brothers,https://en.wikipedia.org/wiki/Saving_Private_Ryan#Reception,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Niland_brothers', 'https://en.wikipedia.org/wiki/Saving_Private_Ryan#Reception']" +171,Which player scored 20 goals in the English Premier League in the 2006-2007 season and won 'Chelsea Players Player of the Year' award in 2007?,Didier Drogba scored 20 goals in the the 2006-2007 English Premier League season and won the 'Chelsea Players Player of the Year' award in 2007.,https://en.wikipedia.org/wiki/2006–07_FA_Premier_League,https://en.wikipedia.org/wiki/Didier_Drogba#Honours,,,,,,,,,,Tabular reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/2006–07_FA_Premier_League', 'https://en.wikipedia.org/wiki/Didier_Drogba#Honours']" +172,"Which Northern Irish footballer who started in the 90s played for 9 English clubs, 8 of which were FA Cup winners?",Keith Gillespie,https://en.wikipedia.org/wiki/List_of_Northern_Ireland_international_footballers#List_of_players,https://en.wikipedia.org/wiki/Keith_Gillespie,https://en.wikipedia.org/wiki/Manchester_United_F.C.,https://en.wikipedia.org/wiki/Wigan_Athletic_F.C.,https://en.wikipedia.org/wiki/Newcastle_United_F.C.,https://en.wikipedia.org/wiki/Blackburn_Rovers_F.C.,https://en.wikipedia.org/wiki/Leicester_City_F.C.,https://en.wikipedia.org/wiki/Sheffield_United_F.C.,https://en.wikipedia.org/wiki/Charlton_Athletic_F.C.,https://en.wikipedia.org/wiki/Bradford_City_A.F.C.,https://en.wikipedia.org/wiki/Darlington_F.C.,Multiple constraints,"['https://en.wikipedia.org/wiki/List_of_Northern_Ireland_international_footballers#List_of_players', 'https://en.wikipedia.org/wiki/Keith_Gillespie', 'https://en.wikipedia.org/wiki/Manchester_United_F.C.', 'https://en.wikipedia.org/wiki/Wigan_Athletic_F.C.', 'https://en.wikipedia.org/wiki/Newcastle_United_F.C.', 'https://en.wikipedia.org/wiki/Blackburn_Rovers_F.C.', 'https://en.wikipedia.org/wiki/Leicester_City_F.C.', 'https://en.wikipedia.org/wiki/Sheffield_United_F.C.', 'https://en.wikipedia.org/wiki/Charlton_Athletic_F.C.', 'https://en.wikipedia.org/wiki/Bradford_City_A.F.C.', 'https://en.wikipedia.org/wiki/Darlington_F.C.']" +173,"In roman numerals, how many nations competed in the Olympic Games where the most gold medals was won by an athlete at a single Olympic Games, as of 1st July 2024?",CCIV,https://en.wikipedia.org/wiki/List_of_multiple_Olympic_gold_medalists_at_a_single_Games,https://en.wikipedia.org/wiki/2008_Summer_Olympics,,,,,,,,,,Tabular reasoning | Post processing,"['https://en.wikipedia.org/wiki/List_of_multiple_Olympic_gold_medalists_at_a_single_Games', 'https://en.wikipedia.org/wiki/2008_Summer_Olympics']" +174,"Which jetliner first flown on June 12th, 1994 is also widely used, as of August 3, 2024, by an airline using the ICE system for entertainment on board?","The Boeing 777 was first flown on the 12th of June, 1994 and is widely used by Emirates, which uses the ICE system on board.",https://en.wikipedia.org/wiki/Boeing_777,https://en.wikipedia.org/wiki/Emirates_(airline)#Services,,,,,,,,,,Multiple constraints | Post processing,"['https://en.wikipedia.org/wiki/Boeing_777', 'https://en.wikipedia.org/wiki/Emirates_(airline)#Services']" +175,"If Andrew Fluegelman's suggested donation for his freeware program were paid at a rate of 1 per every day, how much money would he have made during his fast?","$1,225.00",https://en.wikipedia.org/wiki/PC-Talk,https://en.wikipedia.org/wiki/Andrew_Fluegelman,,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/PC-Talk', 'https://en.wikipedia.org/wiki/Andrew_Fluegelman']" +176,"For the year 2020, what was the difference in total fertility rate (TFR) for East Timor and Japan?",1.92,https://en.wikipedia.org/wiki/Demographics_of_East_Timor,https://en.wikipedia.org/wiki/Demographics_of_Japan,,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/Demographics_of_East_Timor', 'https://en.wikipedia.org/wiki/Demographics_of_Japan']" +177,"As of August 1, 2024, if you add together the age of Ana Ollo Hualde and the age of the country of Israel what number do you get when you subtract 35 from your answer?",100 (59 + 76) - 35,https://en.wikipedia.org/wiki/Ana_Ollo,https://en.wikipedia.org/wiki/Israel,,,,,,,,,,Numerical reasoning | Post processing,"['https://en.wikipedia.org/wiki/Ana_Ollo', 'https://en.wikipedia.org/wiki/Israel']" +178,"I'm trying to show my daughter some of the movies I grew up with. There's one a really want to show her but I can't remember the name of it. I remember that the male lead also played Fred in a live action Scooby Doo movie and the main girl was in this crazy anti-drug commercial in the 90s where she used a pan to destroy a bunch of things in a kitchen and said ""This is your brain on drugs..."". The movie is about a guy who makes a bet with his friend that he can turn an unpopular girl into prom queen. Can you tell me the name of the film?",*She's All That*,https://en.wikipedia.org/wiki/Scooby-Doo_(film),https://en.wikipedia.org/wiki/Freddie_Prinze_Jr.,https://en.wikipedia.org/wiki/This_Is_Your_Brain_on_Drugs,https://en.wikipedia.org/wiki/Rachael_Leigh_Cook,https://en.wikipedia.org/wiki/She%27s_All_That,,,,,,,Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Scooby-Doo_(film)', 'https://en.wikipedia.org/wiki/Freddie_Prinze_Jr.', 'https://en.wikipedia.org/wiki/This_Is_Your_Brain_on_Drugs', 'https://en.wikipedia.org/wiki/Rachael_Leigh_Cook', 'https://en.wikipedia.org/wiki/She%27s_All_That']" +179,"Which major city in Europe can be reached from New York City, if you use the total distance calculated through GPA coordinates (give or take 18 miles) from West Quoddy Light in Maine to Cape Sarichef Light in Alaska?","Prague, Czech Republic",https://en.wikipedia.org/wiki/West_Quoddy_Head_Light,https://en.wikipedia.org/wiki/Cape_Sarichef_Light,https://en.wikipedia.org/wiki/Module:Location_map/data/USA_New_York_City,"https://en.wikipedia.org/wiki/Prague#:~:text=Prague%20is%20located%20approximately%20at,N%2014%C2%B025%E2%80%B2E.",,,,,,,,Numerical reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/West_Quoddy_Head_Light', 'https://en.wikipedia.org/wiki/Cape_Sarichef_Light', 'https://en.wikipedia.org/wiki/Module:Location_map/data/USA_New_York_City', 'https://en.wikipedia.org/wiki/Prague#:~:text=Prague%20is%20located%20approximately%20at,N%2014%C2%B025%E2%80%B2E.']" +180,When did the actress with multiple sclerosis who starred in the comedy about killing her husband receive a star on the Hollywood Walk of Fame?,"November 14, 2022",https://en.wikipedia.org/wiki/Dead_to_Me_(TV_series),https://en.wikipedia.org/wiki/Christina_Applegate,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Dead_to_Me_(TV_series)', 'https://en.wikipedia.org/wiki/Christina_Applegate']" +181,What is so distinctive about the label design of an Australian record label which was purchased in 1960 by another record label which produced the debut album for The Clash?,The octagonal shape,https://en.wikipedia.org/wiki/The_Clash,https://en.wikipedia.org/wiki/The_Clash_(album),https://en.wikipedia.org/wiki/Columbia_Records,https://en.wikipedia.org/wiki/Australian_Record_Company,https://en.wikipedia.org/wiki/Coronet_Records,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/The_Clash', 'https://en.wikipedia.org/wiki/The_Clash_(album)', 'https://en.wikipedia.org/wiki/Columbia_Records', 'https://en.wikipedia.org/wiki/Australian_Record_Company', 'https://en.wikipedia.org/wiki/Coronet_Records']" +182,"Concerning the 2007 book by Sofi Oksanen, the novel was described as ""not shrink from depicting rape, torture or murder."" In what year was the publication that quoted this founded?",1872,https://en.wikipedia.org/wiki/Sofi_Oksanen,https://en.wikipedia.org/wiki/Winnipeg_Free_Press,https://en.wikipedia.org/wiki/Purge_(novel),,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Sofi_Oksanen', 'https://en.wikipedia.org/wiki/Winnipeg_Free_Press', 'https://en.wikipedia.org/wiki/Purge_(novel)']" +183,"How many months, rounded to the nearest whole number, did it take to construct the tallest building in the world as of January 1, 2024?",69,https://en.wikipedia.org/wiki/List_of_tallest_buildings_and_structures,https://en.wikipedia.org/wiki/Burj_Khalifa,,,,,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/List_of_tallest_buildings_and_structures', 'https://en.wikipedia.org/wiki/Burj_Khalifa']" +184,"Which player that scored more than 20 goals in the 2020-2021 Bundesliga season went on to play for a Red Bull owned club, as of August 1, 2024?",Andre Silva went on to play for RB Leipzig.,https://en.wikipedia.org/wiki/2020–21_Bundesliga,https://en.wikipedia.org/wiki/Erling_Haaland,"https://en.wikipedia.org/wiki/André_Silva_(footballer,_born_1995)",https://en.wikipedia.org/wiki/Robert_Lewandowski,,,,,,,,Tabular reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/2020–21_Bundesliga', 'https://en.wikipedia.org/wiki/Erling_Haaland', 'https://en.wikipedia.org/wiki/André_Silva_(footballer,_born_1995)', 'https://en.wikipedia.org/wiki/Robert_Lewandowski']" +185,What actor who starred in the People's Choice Award for Favorite Comedic Movie 1993 later became a U.S. President?,"The actor who starred in the People's Choice Award for Favorite Comedic Movie in 1993 who later became the 45th President of the United States, Donald Trump.",https://en.wikipedia.org/wiki/19th_People%27s_Choice_Awards#Awards,https://en.wikipedia.org/wiki/Home_Alone_2:_Lost_in_New_York#Cast,https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States#Presidents,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/19th_People%27s_Choice_Awards#Awards', 'https://en.wikipedia.org/wiki/Home_Alone_2:_Lost_in_New_York#Cast', 'https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States#Presidents']" +186,"If you divide the number of Papuan tribes in the Sarmi and Keerom Regencies of Papua province in Indonesia as of 2024 by the number of indigenous tribes in Brazil whose names, as they are typically written, begin with letters W, X, Y, or Z as of 2024, what is the answer to the fifth decimal place, rounding up?",1.82143,https://en.wikipedia.org/wiki/Indigenous_people_of_New_Guinea,https://en.wikipedia.org/wiki/List_of_indigenous_peoples_of_Brazil,,,,,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Indigenous_people_of_New_Guinea', 'https://en.wikipedia.org/wiki/List_of_indigenous_peoples_of_Brazil']" +187,What member of the 1992 Unified Olympic women's gymnastics team scored a 9.975 in the qualifier for floor and competed under three different flags in her Olympic career?,Svetlana Boginskaya,https://en.wikipedia.org/wiki/Svetlana_Boginskaya,https://en.wikipedia.org/wiki/Gymnastics_at_the_1992_Summer_Olympics_%E2%80%93_Women%27s_artistic_team_all-around,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Svetlana_Boginskaya', 'https://en.wikipedia.org/wiki/Gymnastics_at_the_1992_Summer_Olympics_%E2%80%93_Women%27s_artistic_team_all-around']" +188,"In the country where Haribomo is located, what is the largest ethnic group as of August 3, 2024?",Harimbo is located in Mali where the Bambara are the largest ethnic group.,https://en.wikipedia.org/wiki/Haribomo,https://en.wikipedia.org/wiki/Mali#Ethnic_groups,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Haribomo', 'https://en.wikipedia.org/wiki/Mali#Ethnic_groups']" +189,What was the first elected position of the official that was in the office before Steven C. Johnson became the 42nd Kansas State Treasurer?,Wichita School Board member,https://en.wikipedia.org/wiki/Kansas_State_Treasurer,https://en.wikipedia.org/wiki/Lynn_Rogers_(politician)#Career,,,,,,,,,,Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Kansas_State_Treasurer', 'https://en.wikipedia.org/wiki/Lynn_Rogers_(politician)#Career']" +190,As of 1st june 2024 Which Jonas brother has a wife who narrated a nature documentary released under the Disneynature label?,Nick,https://en.wikipedia.org/wiki/Jonas_Brothers#Members,https://en.wikipedia.org/wiki/Nick_Jonas#Personal_life,https://en.wikipedia.org/wiki/Joe_Jonas#Personal_life,https://en.wikipedia.org/wiki/Kevin_Jonas#Personal_life,https://en.wikipedia.org/wiki/Disneynature#Filmography,https://en.wikipedia.org/wiki/Priyanka_Chopra,,,,,,Multiple constraints | Post processing,"['https://en.wikipedia.org/wiki/Jonas_Brothers#Members', 'https://en.wikipedia.org/wiki/Nick_Jonas#Personal_life', 'https://en.wikipedia.org/wiki/Joe_Jonas#Personal_life', 'https://en.wikipedia.org/wiki/Kevin_Jonas#Personal_life', 'https://en.wikipedia.org/wiki/Disneynature#Filmography', 'https://en.wikipedia.org/wiki/Priyanka_Chopra']" +191,"I'm thinking of a famous house, can you tell me which one from these clues? * The author of a philosophical work whose frontispiece was designed by Abraham Bosse spent time here. * The son of Francis Talbot and Mary Dacre used this house as a royal jail. ",Chatsworth House,https://en.wikipedia.org/wiki/Chatsworth_House,https://en.wikipedia.org/wiki/Thomas_Hobbes,https://en.wikipedia.org/wiki/Leviathan_(Hobbes_book),https://en.wikipedia.org/wiki/Abraham_Bosse,"https://en.wikipedia.org/wiki/George_Talbot,_6th_Earl_of_Shrewsbury",,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Chatsworth_House', 'https://en.wikipedia.org/wiki/Thomas_Hobbes', 'https://en.wikipedia.org/wiki/Leviathan_(Hobbes_book)', 'https://en.wikipedia.org/wiki/Abraham_Bosse', 'https://en.wikipedia.org/wiki/George_Talbot,_6th_Earl_of_Shrewsbury']" +192,"Out of all of the sovereign states with U.N. membership as of January 1, 2024, that recognize Abkhazia as a sovereign state, how many of them have active volcanoes?","3, Russia, Nicaragua, and Syria.",https://en.wikipedia.org/wiki/List_of_sovereign_states,https://en.wikipedia.org/wiki/Lists_of_volcanoes,,,,,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/List_of_sovereign_states', 'https://en.wikipedia.org/wiki/Lists_of_volcanoes']" +193,Do the timelines in the stories of Nier: Automata and Nier Replicant intersect?,"No, they do not intersect.",https://en.wikipedia.org/wiki/Nier,https://en.wikipedia.org/wiki/Nier:_Automata,,,,,,,,,,Numerical reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/Nier', 'https://en.wikipedia.org/wiki/Nier:_Automata']" +194,"As of August 3, 2024, what is the capital of the 7th largest country in Asia?",The capital of the 7th largest country in Asia is Ulaanbaatar,https://en.wikipedia.org/wiki/List_of_Asian_countries_by_area,https://en.wikipedia.org/wiki/Mongolia,,,,,,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/List_of_Asian_countries_by_area', 'https://en.wikipedia.org/wiki/Mongolia']" +195,"As of August 3, 2024, what is the biggest religion in the country who has the best democracy in 2023, according to the democracy index?",The Evangelical Lutheran Church of Norway,https://en.wikipedia.org/wiki/The_Economist_Democracy_Index,https://en.wikipedia.org/wiki/Norway,,,,,,,,,,Tabular reasoning,"['https://en.wikipedia.org/wiki/The_Economist_Democracy_Index', 'https://en.wikipedia.org/wiki/Norway']" +196,"Ben Darwin, former Australian rugby union footballer, graduated from the Australian Institute of Sport (AIS). How many years after Darwin's birth was the headquarters for the AIS opened? Ignore the month either event occurred.",5,https://en.wikipedia.org/wiki/Ben_Darwin,https://en.wikipedia.org/wiki/Australian_Institute_of_Sport,,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/Ben_Darwin', 'https://en.wikipedia.org/wiki/Australian_Institute_of_Sport']" +197,"What city does the band whose song spent the most weeks at No. 1 on the Billboard Hot Rock & Alternative Songs chart as of August 1, 2024 originate from?","Las Vegas, Nevada",https://en.wikipedia.org/wiki/Hot_Rock_%26_Alternative_Songs,https://en.wikipedia.org/wiki/High_Hopes_(Panic!_at_the_Disco_song),https://en.wikipedia.org/wiki/Panic!_at_the_Disco,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Hot_Rock_%26_Alternative_Songs', 'https://en.wikipedia.org/wiki/High_Hopes_(Panic!_at_the_Disco_song)', 'https://en.wikipedia.org/wiki/Panic!_at_the_Disco']" +198,Why didn't Harvard have calculus classes when it first opened?,Calculus was not invented yet.,https://en.wikipedia.org/wiki/Harvard_University,https://en.wikipedia.org/wiki/Calculus,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Harvard_University', 'https://en.wikipedia.org/wiki/Calculus']" +199,Which Lord of Montpellier had a child named Tortoseta?,William VIII of Montpellier,https://en.wikipedia.org/wiki/Lords_of_Montpellier,https://en.wikipedia.org/wiki/William_I_of_Montpellier,https://en.wikipedia.org/wiki/William_II_of_Montpellier,https://en.wikipedia.org/wiki/William_III_of_Montpellier,https://en.wikipedia.org/wiki/William_V_of_Montpellier,https://en.wikipedia.org/wiki/William_IV_of_Montpellier,https://en.wikipedia.org/wiki/William_VI_of_Montpellier,https://en.wikipedia.org/wiki/William_VII_of_Montpellier,https://en.wikipedia.org/wiki/William_VIII_of_Montpellier,https://en.wikipedia.org/wiki/William_IX_of_Montpellier,"https://en.wikipedia.org/wiki/Marie_of_Montpellier, https://en.wikipedia.org/wiki/James_I_of_Aragon, https://en.wikipedia.org/wiki/James_II_of_Majorca, https://en.wikipedia.org/wiki/Sancho_of_Majorca, https://en.wikipedia.org/wiki/James_III_of_Majorca",Multiple constraints,"['https://en.wikipedia.org/wiki/Lords_of_Montpellier', 'https://en.wikipedia.org/wiki/William_I_of_Montpellier', 'https://en.wikipedia.org/wiki/William_II_of_Montpellier', 'https://en.wikipedia.org/wiki/William_III_of_Montpellier', 'https://en.wikipedia.org/wiki/William_V_of_Montpellier', 'https://en.wikipedia.org/wiki/William_IV_of_Montpellier', 'https://en.wikipedia.org/wiki/William_VI_of_Montpellier', 'https://en.wikipedia.org/wiki/William_VII_of_Montpellier', 'https://en.wikipedia.org/wiki/William_VIII_of_Montpellier', 'https://en.wikipedia.org/wiki/William_IX_of_Montpellier', 'https://en.wikipedia.org/wiki/Marie_of_Montpellier, https://en.wikipedia.org/wiki/James_I_of_Aragon, https://en.wikipedia.org/wiki/James_II_of_Majorca, https://en.wikipedia.org/wiki/Sancho_of_Majorca, https://en.wikipedia.org/wiki/James_III_of_Majorca']" +200,"As of August 3rd 2024, how many Emmy Award nominations does the main cast member that was introduced in Season 2 of It's Always Sunny in Philadelphia have?",5,https://en.wikipedia.org/wiki/It%27s_Always_Sunny_in_Philadelphia,https://en.wikipedia.org/wiki/Danny_DeVito#Acting_credits_and_accolades,https://en.wikipedia.org/wiki/List_of_awards_and_nominations_received_by_Danny_DeVito,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/It%27s_Always_Sunny_in_Philadelphia', 'https://en.wikipedia.org/wiki/Danny_DeVito#Acting_credits_and_accolades', 'https://en.wikipedia.org/wiki/List_of_awards_and_nominations_received_by_Danny_DeVito']" +201,What is the atomic number in roman numerals for the element that has the same symbol as the first two letters of the 23rd most populous city (as of 2024) of the country represented between Italy and China at the World Showcase in Epcot?,LXXIX,https://en.wikipedia.org/wiki/Epcot,https://en.wikipedia.org/wiki/List_of_cities_in_Germany_by_population,https://en.wikipedia.org/wiki/Gold,,,,,,,,,Tabular reasoning | Multiple constraints | Post processing,"['https://en.wikipedia.org/wiki/Epcot', 'https://en.wikipedia.org/wiki/List_of_cities_in_Germany_by_population', 'https://en.wikipedia.org/wiki/Gold']" +202,"The National Peace Corps Association was founded in 1979 by a United States politician. This founder then appointed the very first director of the Peace Corps, his brother-in-law. What is the first name of this director?",Robert,https://en.wikipedia.org/wiki/National_Peace_Corps_Association,https://en.wikipedia.org/wiki/John_F._Kennedy,https://en.wikipedia.org/wiki/Sargent_Shriver,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/National_Peace_Corps_Association', 'https://en.wikipedia.org/wiki/John_F._Kennedy', 'https://en.wikipedia.org/wiki/Sargent_Shriver']" +203,"As of August 3, 2024, what is the hometown of the captain of the team that won the Stanley Cup three years before 2017?","Manhattan Beach, CA",https://en.m.wikipedia.org/wiki/2014_Stanley_Cup_Finals,https://en.m.wikipedia.org/wiki/Dustin_Brown_(ice_hockey),,,,,,,,,,Numerical reasoning | Post processing,"['https://en.m.wikipedia.org/wiki/2014_Stanley_Cup_Finals', 'https://en.m.wikipedia.org/wiki/Dustin_Brown_(ice_hockey)']" +204,"Tell me the names of the two famous people I'm thinking of by using the following clues: They both have the initials M.T. Both were known by nicknames that included the word ""Iron"" One became the world champion in his sport while the other was her country's political leader",Mike Tyson and Margaret Thatcher,https://en.wikipedia.org/wiki/Mike_Tyson,https://en.wikipedia.org/wiki/Margaret_Thatcher,,,,,,,,,,Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Mike_Tyson', 'https://en.wikipedia.org/wiki/Margaret_Thatcher']" +205,"If we added the sum of all ages as of 2010 (assuming they were still alive) of the inventors of the cotton gin, vacuum pump, and commercial toilet paper (ignoring the month) and then subtracted the ages of the inventors of the safety pin and the sewing machine what number would we have?",622,"https://en.wikipedia.org/wiki/Cotton_gin#:~:text=A%20cotton%20gin%E2%80%94meaning%20%22cotton,productivity%20than%20manual%20cotton%20separation.",https://en.wikipedia.org/wiki/Pneumatics,https://en.wikipedia.org/wiki/Joseph_Gayetty,https://en.wikipedia.org/wiki/Walter_Hunt_(inventor),,,,,,,,Numerical reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/Cotton_gin#:~:text=A%20cotton%20gin%E2%80%94meaning%20%22cotton,productivity%20than%20manual%20cotton%20separation.', 'https://en.wikipedia.org/wiki/Pneumatics', 'https://en.wikipedia.org/wiki/Joseph_Gayetty', 'https://en.wikipedia.org/wiki/Walter_Hunt_(inventor)']" +206,"What are the combined ages of the Guildford 4 at the time of their trial, divided by the number of the Birmingham 6 who were originally from the capital of Northern Ireland? Round it and provide the answer in binary.",10001,https://en.wikipedia.org/wiki/Guildford_Four_and_Maguire_Seven,https://en.wikipedia.org/wiki/Birmingham_Six,https://en.wikipedia.org/wiki/Northern_Ireland,,,,,,,,,Numerical reasoning | Post processing,"['https://en.wikipedia.org/wiki/Guildford_Four_and_Maguire_Seven', 'https://en.wikipedia.org/wiki/Birmingham_Six', 'https://en.wikipedia.org/wiki/Northern_Ireland']" +207,Who was the King of Siam during the 6th deadliest single-day terrorist attack in U.S. history?,King Prajadhipok,https://en.wikipedia.org/wiki/Terrorism_in_the_United_States#Deadliest_attacks,https://en.wikipedia.org/wiki/Supreme_Council_of_State_of_Siam,,,,,,,,,,Tabular reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/Terrorism_in_the_United_States#Deadliest_attacks', 'https://en.wikipedia.org/wiki/Supreme_Council_of_State_of_Siam']" +208,"Roberto Álamo starred in a film with Inma Cuesta, I think it was released in 2021 but I can't remember the name. What was the movie called and who did he play?",El páramo / The Wasteland. He played Salvador.,https://en.wikipedia.org/wiki/Roberto_%C3%81lamo,https://en.wikipedia.org/wiki/Inma_Cuesta,https://en.wikipedia.org/wiki/The_Wasteland_(2021_film),,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Roberto_%C3%81lamo ', 'https://en.wikipedia.org/wiki/Inma_Cuesta', 'https://en.wikipedia.org/wiki/The_Wasteland_(2021_film)']" +209,"What was the age difference, in years, between the seventh former President of Murray State University and the comic book artist who worked on *Superman & Bugs Bunny* and *New Guardians* when the comic book artist graduated from college?",7 years,https://en.wikipedia.org/wiki/Superman_%26_Bugs_Bunny,https://en.wikipedia.org/wiki/New_Guardians,https://en.wikipedia.org/wiki/Joe_Staton,https://en.wikipedia.org/wiki/Constantine_W._Curris,https://en.wikipedia.org/wiki/Murray_State_University#Former_Presidents_of_the_University,,,,,,,Numerical reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/Superman_%26_Bugs_Bunny', 'https://en.wikipedia.org/wiki/New_Guardians', 'https://en.wikipedia.org/wiki/Joe_Staton', 'https://en.wikipedia.org/wiki/Constantine_W._Curris', 'https://en.wikipedia.org/wiki/Murray_State_University#Former_Presidents_of_the_University']" +210,"Of the top 3 women's WTA singles ranking as of 29th July 2024, which has a father who was an ice hockey player?",Aryna Sablenka,https://en.wikipedia.org/wiki/Women%27s_Tennis_Association,https://en.wikipedia.org/wiki/Iga_%C5%9Awi%C4%85tek,https://en.wikipedia.org/wiki/Aryna_Sabalenka,https://en.wikipedia.org/wiki/Coco_Gauff,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Women%27s_Tennis_Association', 'https://en.wikipedia.org/wiki/Iga_%C5%9Awi%C4%85tek', 'https://en.wikipedia.org/wiki/Aryna_Sabalenka', 'https://en.wikipedia.org/wiki/Coco_Gauff']" +211,How many times could Usain Bolt span the length of a standard Olympic-sized swimming pool if Tom Daley was standing on his head? Please answer to a whole Usain or Tom without exceeding the length of the pool.,13,https://en.wikipedia.org/wiki/Olympic-size_swimming_pool,https://en.wikipedia.org/wiki/Usain_Bolt,https://en.wikipedia.org/wiki/Tom_Daley,,,,,,,,,Numerical reasoning | Post processing,"['https://en.wikipedia.org/wiki/Olympic-size_swimming_pool', 'https://en.wikipedia.org/wiki/Usain_Bolt', 'https://en.wikipedia.org/wiki/Tom_Daley']" +212,Who was Prime Minister when Will Young won Pop Idol?,Tony Blair,https://en.wikipedia.org/wiki/Pop_Idol,https://en.wikipedia.org/wiki/Timeline_of_prime_ministers_of_Great_Britain_and_the_United_Kingdom,,,,,,,,,,Multiple constraints | Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/Pop_Idol', 'https://en.wikipedia.org/wiki/Timeline_of_prime_ministers_of_Great_Britain_and_the_United_Kingdom']" +213,"I'm thinking of a diving duck, the largest found in North America, that breeds in the Prairie Pothole Region. This duck, smothered in blackcurrant sauce, was once described by Edith Wharton as an ""especially luxurious dinner"" which was served in New York City in the 1870s. Which duck am I thinking of?",The canvasback.,https://en.wikipedia.org/wiki/Diving_duck,https://en.wikipedia.org/wiki/Prairie_Pothole_Region,https://en.wikipedia.org/wiki/Canvasback,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Diving_duck', 'https://en.wikipedia.org/wiki/Prairie_Pothole_Region', 'https://en.wikipedia.org/wiki/Canvasback']" +214,"I'm thinking of an Actor that won an Oscar for Best Supporting Actor two years after being nominated for the same award, but not winning. The same year that the actor won the Oscar, the Actor also won a BAFTA in the same category and for the same film. The Actor also won a Tony award in 1974.",Christopher Plummer,https://en.wikipedia.org/wiki/28th_Tony_Awards,https://en.wikipedia.org/wiki/Academy_Award_for_Best_Supporting_Actor,https://en.wikipedia.org/wiki/BAFTA_Award_for_Best_Actor_in_a_Supporting_Role,https://en.wikipedia.org/wiki/List_of_awards_and_nominations_received_by_Christopher_Plummer,,,,,,,,Tabular reasoning | Multiple constraints | Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/28th_Tony_Awards', 'https://en.wikipedia.org/wiki/Academy_Award_for_Best_Supporting_Actor', 'https://en.wikipedia.org/wiki/BAFTA_Award_for_Best_Actor_in_a_Supporting_Role', 'https://en.wikipedia.org/wiki/List_of_awards_and_nominations_received_by_Christopher_Plummer']" +215,"In the George Eliot novel described by Virginia Woolf as ""one of the few English novels written for grown-up people"", one of the main characters shares a first name with a former prime minister of Burkino Faso who went on the join the board of a Canadian mining company. What forename do they share?",Tertius,https://en.wikipedia.org/wiki/George_Eliot,https://en.wikipedia.org/wiki/Middlemarch,https://en.wikipedia.org/wiki/Tertius_Zongo,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/George_Eliot', 'https://en.wikipedia.org/wiki/Middlemarch', 'https://en.wikipedia.org/wiki/Tertius_Zongo']" +216,What is the population of the town (as of December 2020) that holds the Mountain Cheese Olympics?,5793,https://en.wikipedia.org/wiki/Mountain_Cheese_Olympics,https://en.wikipedia.org/wiki/Appenzell_District,,,,,,,,,,Numerical reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Mountain_Cheese_Olympics', 'https://en.wikipedia.org/wiki/Appenzell_District']" +217,How old would James Logan have been when the estate of his descendent donated Neshaminy State Park land to the Commonwealth of Pennsylvania?,282 years,https://en.wikipedia.org/wiki/Neshaminy_State_Park,https://en.wikipedia.org/wiki/James_Logan_(statesman),,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/Neshaminy_State_Park', 'https://en.wikipedia.org/wiki/James_Logan_(statesman)']" +218,Is the time between Oasis's first album and 2024 shorter or longer than between Oasis's first album and The Beatles' last album?,Longer.,https://en.wikipedia.org/wiki/Oasis_(band),https://en.wikipedia.org/wiki/The_Beatles,,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/Oasis_(band)', 'https://en.wikipedia.org/wiki/The_Beatles']" +219,Ysaires Restituyo was a volleyball player known for playing in the NORCECA Beach Volleyball Circuit. How much older was her partner in 2007 than her partner in 2009?,2 years,https://en.wikipedia.org/wiki/Ysaires_Restituyo,https://en.wikipedia.org/wiki/Cinthia_Pi%C3%B1eiro#,https://en.wikipedia.org/wiki/Ana_Ligia_Fabian,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/Ysaires_Restituyo', 'https://en.wikipedia.org/wiki/Cinthia_Pi%C3%B1eiro#', 'https://en.wikipedia.org/wiki/Ana_Ligia_Fabian']" +220,"Demi Moore shares a birthday with which costar from the movies ""Margin Call"" and ""Deconstructing Harry""?",Stanley Tucci,https://en.wikipedia.org/wiki/Deconstructing_Harry,https://en.wikipedia.org/wiki/Margin_Call,https://en.wikipedia.org/wiki/Stanley_Tucci,https://en.wikipedia.org/wiki/Demi_Moore,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Deconstructing_Harry', 'https://en.wikipedia.org/wiki/Margin_Call', 'https://en.wikipedia.org/wiki/Stanley_Tucci', 'https://en.wikipedia.org/wiki/Demi_Moore']" +221,If the Great North Run took place on York's Medieval Walls how many circuits of them would the athletes run? Round to the nearest tenth of a circuit.,6.2 circuits.,https://en.wikipedia.org/wiki/York_city_walls,https://en.wikipedia.org/wiki/Great_North_Run,https://en.wikipedia.org/wiki/Half_marathon,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/York_city_walls', 'https://en.wikipedia.org/wiki/Great_North_Run', 'https://en.wikipedia.org/wiki/Half_marathon']" +222,What was the 2021 population of the birthplace of the leader of the party that won the 1869 Newfoundland general election?,"9,162 was the population of Shaftesbury (the birthplace of Charles Fox Bennett) in the 2021 census.",https://en.wikipedia.org/wiki/1869_Newfoundland_general_election,https://en.wikipedia.org/wiki/Charles_Fox_Bennett,https://en.wikipedia.org/wiki/Shaftesbury,,,,,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/1869_Newfoundland_general_election', 'https://en.wikipedia.org/wiki/Charles_Fox_Bennett', 'https://en.wikipedia.org/wiki/Shaftesbury']" +223,"After Meat Loaf legally changed his name due to a commercial, what was the first new brand launched by the company to whom the commercial belonged?",Dockers,https://en.wikipedia.org/wiki/Meat_Loaf#Personal_life,https://en.wikipedia.org/wiki/Levi_Strauss_%26_Co.#Blue_jeans_era_(1960s%E2%80%931980s),,,,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/Meat_Loaf#Personal_life', 'https://en.wikipedia.org/wiki/Levi_Strauss_%26_Co.#Blue_jeans_era_(1960s%E2%80%931980s)']" +224,"I'm a concert venue in Washington, D.C. Blink-182 played here on their tour the same year Brazil won their fifth FIFA World Cup. What was my name in 2010?",Verizon Center,https://en.wikipedia.org/wiki/Brazil_national_football_team#FIFA_World_Cup,https://en.wikipedia.org/wiki/Blink-182#Tours,https://en.wikipedia.org/wiki/Pop_Disaster_Tour,https://en.wikipedia.org/wiki/Capital_One_Arena,,,,,,,,Tabular reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/Brazil_national_football_team#FIFA_World_Cup', 'https://en.wikipedia.org/wiki/Blink-182#Tours', 'https://en.wikipedia.org/wiki/Pop_Disaster_Tour', 'https://en.wikipedia.org/wiki/Capital_One_Arena']" +225,Which prime minister has been in office between 2017 and 2022 and also shares their middle name with a former monarch of Scotland.,Theresa (Mary) May,https://en.wikipedia.org/wiki/List_of_prime_ministers_of_the_United_Kingdom,https://en.wikipedia.org/wiki/Theresa_May,https://en.wikipedia.org/wiki/Boris_Johnson,https://en.wikipedia.org/wiki/Liz_Truss,https://en.wikipedia.org/wiki/Rishi_Sunak,https://en.wikipedia.org/wiki/List_of_Scottish_monarchs,,,,,,Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/List_of_prime_ministers_of_the_United_Kingdom', 'https://en.wikipedia.org/wiki/Theresa_May', 'https://en.wikipedia.org/wiki/Boris_Johnson', 'https://en.wikipedia.org/wiki/Liz_Truss', 'https://en.wikipedia.org/wiki/Rishi_Sunak', 'https://en.wikipedia.org/wiki/List_of_Scottish_monarchs']" +226,"Who were the first two women who won the Nobel Prize, in any category, who were also mothers?",Marie Curie and Grazia Deledda,https://en.wikipedia.org/wiki/Marie_Curie,en.wikipedia.org/wiki/Grazia_Deledda,en.wikipedia.org/wiki/List_of_female_Nobel_laureates,https://en.wikipedia.org/wiki/Irène_Joliot-Curie,,,,,,,,Numerical reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Marie_Curie', 'en.wikipedia.org/wiki/Grazia_Deledda', 'en.wikipedia.org/wiki/List_of_female_Nobel_laureates', 'https://en.wikipedia.org/wiki/Irène_Joliot-Curie']" +227,"Of the non-Americans who have won the Phoenix Open as of 2024, who was the youngest at the time of his win?",Hideki Matsuyama,https://en.wikipedia.org/wiki/Phoenix_Open,https://en.wikipedia.org/wiki/Nick_Taylor_(golfer),https://en.wikipedia.org/wiki/Hideki_Matsuyama,https://en.wikipedia.org/wiki/Aaron_Baddeley,https://en.wikipedia.org/wiki/Vijay_Singh,https://en.wikipedia.org/wiki/Jesper_Parnevik,https://en.wikipedia.org/wiki/Sandy_Lyle,https://en.wikipedia.org/wiki/Bruce_Crampton,https://en.wikipedia.org/wiki/George_Knudson,https://en.wikipedia.org/wiki/Bobby_Locke,,Numerical reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/Phoenix_Open', 'https://en.wikipedia.org/wiki/Nick_Taylor_(golfer)', 'https://en.wikipedia.org/wiki/Hideki_Matsuyama', 'https://en.wikipedia.org/wiki/Aaron_Baddeley', 'https://en.wikipedia.org/wiki/Vijay_Singh', 'https://en.wikipedia.org/wiki/Jesper_Parnevik', 'https://en.wikipedia.org/wiki/Sandy_Lyle', 'https://en.wikipedia.org/wiki/Bruce_Crampton', 'https://en.wikipedia.org/wiki/George_Knudson', 'https://en.wikipedia.org/wiki/Bobby_Locke']" +228,"The US Naval ship that sunk in Havana Harbor on February 15, 1898, is named for a state that was admitted to the Union while what woman was serving as First Lady?",Elizabeth Monroe,https://en.wikipedia.org/wiki/USS_Maine_(1889),https://en.wikipedia.org/wiki/Maine,https://en.wikipedia.org/wiki/List_of_first_ladies_of_the_United_States,,,,,,,,,Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/USS_Maine_(1889)', 'https://en.wikipedia.org/wiki/Maine', 'https://en.wikipedia.org/wiki/List_of_first_ladies_of_the_United_States']" +229,"As of the 2023 Major League Baseball season, who is the only player in the top 15 lists of career home runs, career runs batted in, and career hits, who was also named (in 2007) the all-time Gold Glove team?",Willie Mays,https://en.wikipedia.org/wiki/List_of_Major_League_Baseball_career_home_run_leaders,https://en.wikipedia.org/wiki/List_of_Major_League_Baseball_career_runs_batted_in_leaders,https://en.wikipedia.org/wiki/List_of_Major_League_Baseball_career_hits_leaders,https://en.wikipedia.org/wiki/Gold_Glove_Award,,,,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/List_of_Major_League_Baseball_career_home_run_leaders', 'https://en.wikipedia.org/wiki/List_of_Major_League_Baseball_career_runs_batted_in_leaders', 'https://en.wikipedia.org/wiki/List_of_Major_League_Baseball_career_hits_leaders', 'https://en.wikipedia.org/wiki/Gold_Glove_Award']" +230,Who had the best career batting average out of every player to hit a home run in the 2002 World Series matchup between the Anaheim Angeles and San Francisco Giants?,Barry Bonds with a .298 lifetime batting average.,https://en.wikipedia.org/wiki/2002_World_Series#Composite_box,https://en.wikipedia.org/wiki/Barry_Bonds,https://en.wikipedia.org/wiki/Darin_Erstad,https://en.wikipedia.org/wiki/David_Bell_(baseball),https://en.wikipedia.org/wiki/Jeff_Kent,https://en.wikipedia.org/wiki/J._T._Snow,https://en.wikipedia.org/wiki/Reggie_Sanders,https://en.wikipedia.org/wiki/Rich_Aurilia,https://en.wikipedia.org/wiki/Scott_Spiezio,https://en.wikipedia.org/wiki/Shawon_Dunston,"https://en.wikipedia.org/wiki/Tim_Salmon, https://en.wikipedia.org/wiki/Troy_Glaus",Tabular reasoning,"['https://en.wikipedia.org/wiki/2002_World_Series#Composite_box', 'https://en.wikipedia.org/wiki/Barry_Bonds', 'https://en.wikipedia.org/wiki/Darin_Erstad', 'https://en.wikipedia.org/wiki/David_Bell_(baseball)', 'https://en.wikipedia.org/wiki/Jeff_Kent', 'https://en.wikipedia.org/wiki/J._T._Snow', 'https://en.wikipedia.org/wiki/Reggie_Sanders', 'https://en.wikipedia.org/wiki/Rich_Aurilia', 'https://en.wikipedia.org/wiki/Scott_Spiezio', 'https://en.wikipedia.org/wiki/Shawon_Dunston', 'https://en.wikipedia.org/wiki/Tim_Salmon, https://en.wikipedia.org/wiki/Troy_Glaus']" +231,"Out of all of the feature-length theatrical films that John Carpenter directed before 2015, which has the longest running time?",Starman (1984),https://en.wikipedia.org/wiki/John_Carpenter_filmography,https://en.wikipedia.org/wiki/Dark_Star_(film),https://en.wikipedia.org/wiki/Assault_on_Precinct_13_(1976_film),https://en.wikipedia.org/wiki/Halloween_(1978_film),https://en.wikipedia.org/wiki/The_Fog,https://en.wikipedia.org/wiki/Escape_from_New_York,https://en.wikipedia.org/wiki/The_Thing_(1982_film),https://en.wikipedia.org/wiki/Christine_(1983_film),https://en.wikipedia.org/wiki/Starman_(film),https://en.wikipedia.org/wiki/Big_Trouble_in_Little_China,"https://en.wikipedia.org/wiki/Prince_of_Darkness_(film), https://en.wikipedia.org/wiki/They_Live, https://en.wikipedia.org/wiki/Memoirs_of_an_Invisible_Man_(film), https://en.wikipedia.org/wiki/In_the_Mouth_of_Madness, https://en.wikipedia.org/wiki/Village_of_the_Damned_(1995_film), https://en.wikipedia.org/wiki/Escape_from_L.A., https://en.wikipedia.org/wiki/Vampires_(1998_film), https://en.wikipedia.org/wiki/Ghosts_of_Mars, https://en.wikipedia.org/wiki/The_Ward_(film)",Tabular reasoning,"['https://en.wikipedia.org/wiki/John_Carpenter_filmography', 'https://en.wikipedia.org/wiki/Dark_Star_(film)', 'https://en.wikipedia.org/wiki/Assault_on_Precinct_13_(1976_film)', 'https://en.wikipedia.org/wiki/Halloween_(1978_film)', 'https://en.wikipedia.org/wiki/The_Fog', 'https://en.wikipedia.org/wiki/Escape_from_New_York', 'https://en.wikipedia.org/wiki/The_Thing_(1982_film)', 'https://en.wikipedia.org/wiki/Christine_(1983_film)', 'https://en.wikipedia.org/wiki/Starman_(film)', 'https://en.wikipedia.org/wiki/Big_Trouble_in_Little_China', 'https://en.wikipedia.org/wiki/Prince_of_Darkness_(film), https://en.wikipedia.org/wiki/They_Live, https://en.wikipedia.org/wiki/Memoirs_of_an_Invisible_Man_(film), https://en.wikipedia.org/wiki/In_the_Mouth_of_Madness, https://en.wikipedia.org/wiki/Village_of_the_Damned_(1995_film), https://en.wikipedia.org/wiki/Escape_from_L.A., https://en.wikipedia.org/wiki/Vampires_(1998_film), https://en.wikipedia.org/wiki/Ghosts_of_Mars, https://en.wikipedia.org/wiki/The_Ward_(film)']" +232,How much taller (in centimetres) is Mark O'Halloran (ex West Tigers rugby league player) than the London Broncos' player with the heritage number 341?,11cm,https://en.wikipedia.org/wiki/Mark_O%27Halloran_(rugby_league),https://en.wikipedia.org/wiki/Robert_Fico,https://en.wikipedia.org/wiki/London_Broncos#1994%E2%80%932005:_Broncos_and_Super_League,https://en.wikipedia.org/wiki/Steele_Retchless,,,,,,,,Numerical reasoning | Multiple constraints | Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/Mark_O%27Halloran_(rugby_league)', 'https://en.wikipedia.org/wiki/Robert_Fico', 'https://en.wikipedia.org/wiki/London_Broncos#1994%E2%80%932005:_Broncos_and_Super_League', 'https://en.wikipedia.org/wiki/Steele_Retchless']" +233,"This founder of the Academy of Science, St. Louis became established as a botonist for a monograph he did in 1842. What is the scientic name for plant featured in that monograph?",Cuscuta,"https://en.wikipedia.org/wiki/Academy_of_Science,_St._Louis",https://en.wikipedia.org/wiki/George_Engelmann,https://en.wikipedia.org/wiki/Cuscuta,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Academy_of_Science,_St._Louis', 'https://en.wikipedia.org/wiki/George_Engelmann', 'https://en.wikipedia.org/wiki/Cuscuta']" +234,"Which American actress, born the same year Denzel Washington won his first Best Actor in a Leading Role Academy Award, had a supporting role in the second season of the Netflix series ""You""?",Jenna Ortega,https://en.wikipedia.org/wiki/List_of_awards_and_nominations_received_by_Denzel_Washington,https://en.wikipedia.org/wiki/Jenna_Ortega,https://en.wikipedia.org/wiki/Victoria_Pedretti,https://en.wikipedia.org/wiki/Carmela_Zumbado,https://en.wikipedia.org/wiki/Ambyr_Childers,https://en.wikipedia.org/wiki/Saffron_Burrows,https://en.wikipedia.org/wiki/Tati_Gabrielle,https://en.wikipedia.org/wiki/You_(TV_series)#Season_2_(2019),,,,Numerical reasoning | Multiple constraints | Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/List_of_awards_and_nominations_received_by_Denzel_Washington', 'https://en.wikipedia.org/wiki/Jenna_Ortega', 'https://en.wikipedia.org/wiki/Victoria_Pedretti', 'https://en.wikipedia.org/wiki/Carmela_Zumbado', 'https://en.wikipedia.org/wiki/Ambyr_Childers', 'https://en.wikipedia.org/wiki/Saffron_Burrows', 'https://en.wikipedia.org/wiki/Tati_Gabrielle', 'https://en.wikipedia.org/wiki/You_(TV_series)#Season_2_(2019)']" +235,"What is the birthday of the man who produced the pop song named after one of the largest bird in the Procellariiformes species, off the 2024 album The Tortured Poets Department?","Aaron Dessner was born April 23, 1976.",https://en.wikipedia.org/wiki/Procellariiformes,https://en.wikipedia.org/wiki/Aaron_Dessner,https://en.wikipedia.org/wiki/The_Tortured_Poets_Department,,,,,,,,,Tabular reasoning | Multiple constraints | Post processing,"['https://en.wikipedia.org/wiki/Procellariiformes', 'https://en.wikipedia.org/wiki/Aaron_Dessner', 'https://en.wikipedia.org/wiki/The_Tortured_Poets_Department']" +236,What Jeep model shares its name with the Secret Service codename for a 21st-century US president?,Renegade,https://en.wikipedia.org/wiki/Secret_Service_code_name,https://en.wikipedia.org/wiki/List_of_Jeep_vehicles,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Secret_Service_code_name', 'https://en.wikipedia.org/wiki/List_of_Jeep_vehicles']" +237,Who was the Prime Minister of Canada in the year that the 1965 winner of the Best New Artist Grammy Award made their first appearance on US television?,Lester Pearson,https://en.wikipedia.org/wiki/Grammy_Award_for_Best_New_Artist#1960s,https://en.wikipedia.org/wiki/The_Beatles,https://en.wikipedia.org/wiki/List_of_prime_ministers_of_Canada#Prime_ministers,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Grammy_Award_for_Best_New_Artist#1960s', 'https://en.wikipedia.org/wiki/The_Beatles', 'https://en.wikipedia.org/wiki/List_of_prime_ministers_of_Canada#Prime_ministers']" +238,"Part of the dynamic duo who broke a record set by Mickey Lolich and Bill Freehan in 1975 for most starts together, this MLB pitcher's battery mate ranked first all-time among catchers in putouts as of 2022. In what year did he (the pitcher) make his Grand Ole Opry debut?",2024,https://en.wikipedia.org/wiki/St._Louis_Cardinals,https://en.wikipedia.org/wiki/Yadier_Molina,https://en.wikipedia.org/wiki/Adam_Wainwright,,,,,,,,,Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/St._Louis_Cardinals', 'https://en.wikipedia.org/wiki/Yadier_Molina', 'https://en.wikipedia.org/wiki/Adam_Wainwright']" +239,How much time passed between the release of the blockchain platform founded by the individual who resides in the town of the same name as the James Buchanan House and the release of the prior blockchain platform co-founded by the same individual?,"Wheatland, also known as the James Buchanan House, is the same name as the town in Wyoming where Charles Hoskinson resides. Charles Hoskinson cofounded Ethereum and founded Cardano. Cardano was released in 2017; Ethereum was released in 2015. Two years elapsed between the two platforms' releases.",https://en.wikipedia.org/wiki/Wheatland_(James_Buchanan_House),"https://en.wikipedia.org/wiki/Wheatland,_Wyoming",https://en.wikipedia.org/wiki/Cardano_(blockchain_platform),https://en.wikipedia.org/wiki/Ethereum#Ether,,,,,,,,Numerical reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Wheatland_(James_Buchanan_House)', 'https://en.wikipedia.org/wiki/Wheatland,_Wyoming', 'https://en.wikipedia.org/wiki/Cardano_(blockchain_platform)', 'https://en.wikipedia.org/wiki/Ethereum#Ether']" +240,Who was the character in the roster of the Marvel vs. Capcom entry that was ported to the PlayStation 4 in 2016 that represented a video game franchise that has zombies as main antagonists and is the player character of a game released in 1999?,Jill Valentine,https://en.wikipedia.org/wiki/Marvel_vs._Capcom,https://en.wikipedia.org/wiki/Ultimate_Marvel_vs._Capcom_3,https://en.wikipedia.org/wiki/Jill_Valentine,https://en.wikipedia.org/wiki/Nemesis_(Resident_Evil),https://en.wikipedia.org/wiki/Albert_Wesker,https://en.wikipedia.org/wiki/Chris_Redfield,https://en.wikipedia.org/wiki/Frank_West_(Dead_Rising),https://en.wikipedia.org/wiki/Resident_Evil_3:_Nemesis,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Marvel_vs._Capcom', 'https://en.wikipedia.org/wiki/Ultimate_Marvel_vs._Capcom_3', 'https://en.wikipedia.org/wiki/Jill_Valentine', 'https://en.wikipedia.org/wiki/Nemesis_(Resident_Evil)', 'https://en.wikipedia.org/wiki/Albert_Wesker', 'https://en.wikipedia.org/wiki/Chris_Redfield', 'https://en.wikipedia.org/wiki/Frank_West_(Dead_Rising)', 'https://en.wikipedia.org/wiki/Resident_Evil_3:_Nemesis']" +241,"The screenwriter of the film, which received nominations for Best Screenplay and Best Actor in a Motion Picture - Drama at the 1995 Golden Globes, attended which Michigan university?",Grand Valley State University,https://en.wikipedia.org/wiki/Golden_Globe_Award_for_Best_Actor_in_a_Motion_Picture_%E2%80%93_Drama,https://en.wikipedia.org/wiki/Golden_Globe_Award_for_Best_Screenplay,https://en.wikipedia.org/wiki/Tim_Robbins,https://en.wikipedia.org/wiki/Patrick_Sheane_Duncan,,,,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Golden_Globe_Award_for_Best_Actor_in_a_Motion_Picture_%E2%80%93_Drama', 'https://en.wikipedia.org/wiki/Golden_Globe_Award_for_Best_Screenplay', 'https://en.wikipedia.org/wiki/Tim_Robbins', 'https://en.wikipedia.org/wiki/Patrick_Sheane_Duncan']" +242,NASA launched an Apollo mission a year after the Stonewall Riots. How many collective children did the astronauts onboard that mission have?,8,https://en.wikipedia.org/wiki/Stonewall_riots,https://en.wikipedia.org/wiki/List_of_Apollo_missions,https://en.wikipedia.org/wiki/Jim_Lovell,https://en.wikipedia.org/wiki/Jack_Swigert,https://en.wikipedia.org/wiki/Fred_Haise,,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints | Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/Stonewall_riots', 'https://en.wikipedia.org/wiki/List_of_Apollo_missions', 'https://en.wikipedia.org/wiki/Jim_Lovell', 'https://en.wikipedia.org/wiki/Jack_Swigert', 'https://en.wikipedia.org/wiki/Fred_Haise']" +243,A United States women's national soccer team player scored her first career international goal during the semi-final match of the 2015 FIFA Women's World Cup. This same player scored her second goal the next year. Tell me the difference in attendance between these two games.,"43,518",https://en.wikipedia.org/wiki/2015_FIFA_Women%27s_World_Cup_knockout_stage,https://en.wikipedia.org/wiki/Kelley_O%27Hara,https://en.wikipedia.org/wiki/Carli_Lloyd,https://en.wikipedia.org/wiki/2016_CONCACAF_Women%27s_Olympic_Qualifying_Championship#Group_A,,,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints | Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/2015_FIFA_Women%27s_World_Cup_knockout_stage', 'https://en.wikipedia.org/wiki/Kelley_O%27Hara', 'https://en.wikipedia.org/wiki/Carli_Lloyd', 'https://en.wikipedia.org/wiki/2016_CONCACAF_Women%27s_Olympic_Qualifying_Championship#Group_A']" +244,"Who is the Formula One driver who won their first Driver's Championship in the 46th season, what team did they race for that year, and how many years after that team's first race was it?","Nigel Mansell, Williams Grand Prix Engineering, 15 years",https://en.wikipedia.org/wiki/History_of_Formula_One,https://en.wikipedia.org/wiki/1992_Formula_One_World_Championship,https://en.m.wikipedia.org/wiki/Williams_Grand_Prix_Engineering,,,,,,,,,Numerical reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/History_of_Formula_One', 'https://en.wikipedia.org/wiki/1992_Formula_One_World_Championship', 'https://en.m.wikipedia.org/wiki/Williams_Grand_Prix_Engineering']" +245,"There is only one existing lighthouse with attached living quarters in the ninth-largest US state by area, as of August 1, 2024. This lighthouse is located on the north side of a bay named for which tribe?",The Yaquina Tribe,https://en.wikipedia.org/wiki/Yaquina_Bay_Light,https://en.wikipedia.org/wiki/Oregon,https://en.wikipedia.org/wiki/Yaquina_Bay,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Yaquina_Bay_Light', 'https://en.wikipedia.org/wiki/Oregon', 'https://en.wikipedia.org/wiki/Yaquina_Bay']" +246,"Consider the following three people: 1. Edmund, who turned 10 on the day of the Battle of Hastings 2. Edward, who turned 12 on the day that Guy Fawkes was executed 3. Eddie, who turned 14 on the day of the London 2012 Summer Olympics opening ceremony Who would be oldest: Edmund on the day King Henry I of England died, Edward on the day of the Battle of Naseby, or Eddie on the day Liz Truss announced her resignation as Conservative Party leader?",Edmund,https://en.wikipedia.org/wiki/Battle_of_Hastings,https://en.wikipedia.org/wiki/Guy_Fawkes,https://en.wikipedia.org/wiki/2012_Summer_Olympics_opening_ceremony,https://en.wikipedia.org/wiki/Henry_I_of_England,https://en.wikipedia.org/wiki/Battle_of_Naseby,https://en.wikipedia.org/wiki/Liz_Truss,,,,,,Numerical reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/Battle_of_Hastings', 'https://en.wikipedia.org/wiki/Guy_Fawkes', 'https://en.wikipedia.org/wiki/2012_Summer_Olympics_opening_ceremony', 'https://en.wikipedia.org/wiki/Henry_I_of_England', 'https://en.wikipedia.org/wiki/Battle_of_Naseby', 'https://en.wikipedia.org/wiki/Liz_Truss']" +247,What was the final league position of the football team found in the city where José Loiola won his first World Championship gold medal for the season which began in that same year when he won gold?,15th,https://en.wikipedia.org/wiki/Jos%C3%A9_Loiola,https://en.wikipedia.org/wiki/1999%E2%80%932000_Olympique_de_Marseille_season,,,,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/Jos%C3%A9_Loiola', 'https://en.wikipedia.org/wiki/1999%E2%80%932000_Olympique_de_Marseille_season']" +248,"Which is bigger based on their maximum average lengths multiplied by their number of legs: an elephant beetle, a brown rhinoceros beetle, or a bee hummingbird?",Elephant beetle,https://en.wikipedia.org/wiki/Elephant_beetle,https://en.wikipedia.org/wiki/Xylotrupes_gideon,https://en.wikipedia.org/wiki/Bee_hummingbird,https://en.wikipedia.org/wiki/Bird#Anatomy_and_physiology,https://en.wikipedia.org/wiki/Insect,,,,,,,Numerical reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Elephant_beetle', 'https://en.wikipedia.org/wiki/Xylotrupes_gideon', 'https://en.wikipedia.org/wiki/Bee_hummingbird', 'https://en.wikipedia.org/wiki/Bird#Anatomy_and_physiology', 'https://en.wikipedia.org/wiki/Insect']" +249,"What is the name of the Japanese man who protested the U.S.'s involvement in the Iraq War, and who has also been awarded multiple times the same award that ""Shrek"" won in 2002, beating ""Jimmy Neutron: Boy Genius""?",Hayao Miyazaki,https://en.wikipedia.org/wiki/Shrek#Accolades,https://en.wikipedia.org/wiki/74th_Academy_Awards#Winners_and_nominees,https://en.wikipedia.org/wiki/Academy_Award_for_Best_Animated_Feature#Multiple_wins_and_nominations,https://en.wikipedia.org/wiki/Hayao_Miyazaki#Views,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Shrek#Accolades', 'https://en.wikipedia.org/wiki/74th_Academy_Awards#Winners_and_nominees', 'https://en.wikipedia.org/wiki/Academy_Award_for_Best_Animated_Feature#Multiple_wins_and_nominations', 'https://en.wikipedia.org/wiki/Hayao_Miyazaki#Views']" +250,The Sikh empire's capital at the time of the Battle of Sobraon came under the rule of the British Crown in what year?,1858,https://en.wikipedia.org/wiki/Battle_of_Sobraon,https://en.wikipedia.org/wiki/Lahore,https://en.wikipedia.org/wiki/Punjab_Province_(British_India),,,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/Battle_of_Sobraon', 'https://en.wikipedia.org/wiki/Lahore', 'https://en.wikipedia.org/wiki/Punjab_Province_(British_India)']" +251,"I remember reading a book in elementary school that I LOVED and I want to read it to my daughter. The problem is I can't remember the title. I know that when I first read it, it had recently won a Caldecott Medal. I have another memory from around the same time period of watching the Sydney Summer Olympics. All I remember about the plot is that it was based on a true story and set in the winter time. Can you help me remember the title?",Snowflake Bentley,https://en.wikipedia.org/wiki/Caldecott_Medal#Recipients,https://en.wikipedia.org/wiki/2000_Summer_Olympics,https://en.wikipedia.org/wiki/Snowflake_Bentley_(book),,,,,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Caldecott_Medal#Recipients', 'https://en.wikipedia.org/wiki/2000_Summer_Olympics', 'https://en.wikipedia.org/wiki/Snowflake_Bentley_(book)']" +252,What is the name of the town or city of birth of the player who won the men's singles at the US Open on the year after Venus & Serena Williams played each other for the 8th time as pros?,"Omaha, Nebraska",https://en.wikipedia.org/wiki/Williams_sisters_rivalry,https://en.wikipedia.org/wiki/2003_US_Open_(tennis),https://en.wikipedia.org/wiki/Andy_Roddick,,,,,,,,,Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Williams_sisters_rivalry', 'https://en.wikipedia.org/wiki/2003_US_Open_(tennis)', 'https://en.wikipedia.org/wiki/Andy_Roddick']" +253,Which fast food chain did the sponsor of the Women's 2018 Volleyball Thailand League acquire rights to in 1987?,Kentucky Fried Chicken,https://en.wikipedia.org/wiki/2018%E2%80%9319_Women%27s_Volleyball_Thailand_League,https://en.wikipedia.org/wiki/Charoen_Pokphand,,,,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/2018%E2%80%9319_Women%27s_Volleyball_Thailand_League', 'https://en.wikipedia.org/wiki/Charoen_Pokphand']" +254,How many calories are in 7 oz. of the fruit given as a present in the musical that won Best Revival of a Musical at the 52nd Annual Tony Awards?,100 calories,https://en.wikipedia.org/wiki/52nd_Tony_Awards,https://en.wikipedia.org/wiki/Cabaret_(musical),https://en.wikipedia.org/wiki/Pineapple,,,,,,,,,Numerical reasoning | Tabular reasoning,"['https://en.wikipedia.org/wiki/52nd_Tony_Awards', 'https://en.wikipedia.org/wiki/Cabaret_(musical)', 'https://en.wikipedia.org/wiki/Pineapple']" +255,"In the second album of a synthpop-rock band from the county seat city of Utah County, which song has a length of under 3 minutes, not counting any bonus tracks?",Everybody Talks,"https://en.wikipedia.org/wiki/Utah_County,_Utah","https://en.wikipedia.org/wiki/Provo,_Utah","https://en.wikipedia.org/wiki/List_of_people_from_Provo,_Utah",https://en.wikipedia.org/wiki/Neon_Trees,https://en.wikipedia.org/wiki/Picture_Show_(album),,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Utah_County,_Utah', 'https://en.wikipedia.org/wiki/Provo,_Utah', 'https://en.wikipedia.org/wiki/List_of_people_from_Provo,_Utah', 'https://en.wikipedia.org/wiki/Neon_Trees', 'https://en.wikipedia.org/wiki/Picture_Show_(album)']" +256,"The name of which supernova remnant nebula spelled backwards (not including the word ""nubula"") is a homonym for a large sailing ship?","Crab Nebula (""barc"" is a homonym for ""barque"")",https://en.wikipedia.org/wiki/List_of_supernova_remnants,https://en.wikipedia.org/wiki/Barque,,,,,,,,,,Post processing,"['https://en.wikipedia.org/wiki/List_of_supernova_remnants', 'https://en.wikipedia.org/wiki/Barque']" +257,What award did the arachnologist who discovered several species of pseudoscorpions endemic to Australia receive in 2013? The species he discovered in 1987 belong to the Garypidae family and the Synsphyronus genus.,The Bonnet Award by the International Society of Arachnology,https://en.wikipedia.org/wiki/Pseudoscorpion#Classification,https://en.wikipedia.org/wiki/Garypidae#Genera,https://en.wikipedia.org/wiki/Synsphyronus#Species,"https://en.wikipedia.org/wiki/Mark_Harvey_(arachnologist)#Achievements,_awards_and_recognition",,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Pseudoscorpion#Classification', 'https://en.wikipedia.org/wiki/Garypidae#Genera', 'https://en.wikipedia.org/wiki/Synsphyronus#Species', 'https://en.wikipedia.org/wiki/Mark_Harvey_(arachnologist)#Achievements,_awards_and_recognition']" +258,"As of August 2, 2024, what is the title of the most viewed episode in the second most viewed season of the TV show that Zooey Deschanel stars in as a character named ""Jess Day""?","The most viewed episode of the second season (second most viewed) is its first episode, ""Re-Launch"". ",https://en.wikipedia.org/wiki/Zooey_Deschanel,https://en.wikipedia.org/wiki/New_Girl#Episodes,https://en.wikipedia.org/wiki/List_of_New_Girl_episodes#Season_2_(2012%E2%80%9313),,,,,,,,,Numerical reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Zooey_Deschanel', 'https://en.wikipedia.org/wiki/New_Girl#Episodes', 'https://en.wikipedia.org/wiki/List_of_New_Girl_episodes#Season_2_(2012%E2%80%9313)']" +259,American author Joan Didion's second fiction novel has a film adaptation. The director of this film is the uncle of a famous pop singer This pop singer once famously sang a song with lyrics describing a combustible-containing device. This song received nominations for Best Pop Solo Performance and Record of the Year at the 54th Annual Grammy Awards. This song was inspired by another book. What is the name of the song and the book that inspired it?,"Firework by Katy Perry was inspired by ""On the Road"" by Jack Kerouac ",https://en.wikipedia.org/wiki/Joan_Didion,https://en.wikipedia.org/wiki/Play_It_as_It_Lays,https://en.wikipedia.org/wiki/Play_It_as_It_Lays_(film),https://en.wikipedia.org/wiki/Frank_Perry,https://en.wikipedia.org/wiki/Katy_Perry,https://en.wikipedia.org/wiki/List_of_awards_and_nominations_received_by_Katy_Perry#Awards_and_nominations,,,,,,Multiple constraints | Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/Joan_Didion', 'https://en.wikipedia.org/wiki/Play_It_as_It_Lays', 'https://en.wikipedia.org/wiki/Play_It_as_It_Lays_(film)', 'https://en.wikipedia.org/wiki/Frank_Perry', 'https://en.wikipedia.org/wiki/Katy_Perry', 'https://en.wikipedia.org/wiki/List_of_awards_and_nominations_received_by_Katy_Perry#Awards_and_nominations']" +260,"Canadian politician Keir Clark attended a now defunct university college, which shut its doors in 1969. Who served as the first President of its successor institution?",Ronald James Baker,https://en.wikipedia.org/wiki/Keir_Clark,https://en.wikipedia.org/wiki/Prince_of_Wales_College,https://en.wikipedia.org/wiki/University_of_Prince_Edward_Island,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Keir_Clark', 'https://en.wikipedia.org/wiki/Prince_of_Wales_College', 'https://en.wikipedia.org/wiki/University_of_Prince_Edward_Island']" +261,"As of August 2024, who was president of the United States the last time The Beach Boys topped the chart on the Billboard Hot 100?",Ronald Reagan,https://en.wikipedia.org/wiki/The_Beach_Boys#History,https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_number_ones_of_1988,https://en.wikipedia.org/wiki/President_of_the_United_States#History_and_development,https://en.wikipedia.org/wiki/Ronald_Reagan,,,,,,,,Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/The_Beach_Boys#History', 'https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_number_ones_of_1988', 'https://en.wikipedia.org/wiki/President_of_the_United_States#History_and_development', 'https://en.wikipedia.org/wiki/Ronald_Reagan']" +262,"As of August 1, 2024, which NFL players were both league MVP and Super Bowl MVP in the same season?","Bart Starr (66), Terry Bradshaw (78), Joe Montana (89), Emmit Smith (93), Steve Young (94), Kurt Warner (99), and Patrick Mahomes (22).",https://en.wikipedia.org/wiki/Associated_Press_NFL_Most_Valuable_Player_Award,https://en.wikipedia.org/wiki/Super_Bowl_Most_Valuable_Player_Award,,,,,,,,,,Tabular reasoning,"['https://en.wikipedia.org/wiki/Associated_Press_NFL_Most_Valuable_Player_Award', 'https://en.wikipedia.org/wiki/Super_Bowl_Most_Valuable_Player_Award']" +263,Who was the wife of the founder of the city where the 2023 Tour De France started?,Violant of Castile,https://en.wikipedia.org/wiki/2023_Tour_de_France,https://en.wikipedia.org/wiki/Bilbao,https://en.wikipedia.org/wiki/Diego_L%C3%B3pez_V_de_Haro,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/2023_Tour_de_France', 'https://en.wikipedia.org/wiki/Bilbao', 'https://en.wikipedia.org/wiki/Diego_L%C3%B3pez_V_de_Haro']" +264,What was the military rank of the employer of the astronomer who discovered the globular cluster of stars called NGC 6441?,Major General was the rank of Thomas Brisbane.,https://en.wikipedia.org/wiki/NGC_6441,https://en.wikipedia.org/wiki/James_Dunlop,https://en.wikipedia.org/wiki/Thomas_Brisbane,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/NGC_6441', 'https://en.wikipedia.org/wiki/James_Dunlop', 'https://en.wikipedia.org/wiki/Thomas_Brisbane']" +265,"What is the average height of Mount Everest, Mount Thor, Mount Denali and The Matterhorn?",17382 ft.,https://en.wikipedia.org/wiki/Denali,https://en.wikipedia.org/wiki/Matterhorn,https://en.wikipedia.org/wiki/Mount_Thor,https://en.wikipedia.org/wiki/Mount_Everest,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/Denali', 'https://en.wikipedia.org/wiki/Matterhorn', 'https://en.wikipedia.org/wiki/Mount_Thor', 'https://en.wikipedia.org/wiki/Mount_Everest']" +266,"How old was the famous composer who taught at the Ospedale della Pietà and was known as ""The Red Priest"" when his first opera premiered? ","Antonio Vivaldi was 35 when his first opera, Ottone in villa, premiered.",https://en.wikipedia.org/wiki/Ospedale_della_Piet%C3%A0,https://en.wikipedia.org/wiki/Antonio_Vivaldi,https://en.wikipedia.org/wiki/List_of_operas_by_Antonio_Vivaldi,https://en.wikipedia.org/wiki/Ottone_in_villa,,,,,,,,Numerical reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Ospedale_della_Piet%C3%A0', 'https://en.wikipedia.org/wiki/Antonio_Vivaldi', 'https://en.wikipedia.org/wiki/List_of_operas_by_Antonio_Vivaldi', 'https://en.wikipedia.org/wiki/Ottone_in_villa']" +267,"Twenty-three years after the deadliest battle in the US Civil War, who served as governor in the state in which this battle was fought?",Robert E Pattison,https://en.wikipedia.org/wiki/List_of_costliest_American_Civil_War_land_battles,https://en.wikipedia.org/wiki/Battle_of_Gettysburg,https://en.wikipedia.org/wiki/List_of_governors_of_Pennsylvania,,,,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/List_of_costliest_American_Civil_War_land_battles', 'https://en.wikipedia.org/wiki/Battle_of_Gettysburg', 'https://en.wikipedia.org/wiki/List_of_governors_of_Pennsylvania']" +268,Which president featured on a U.S. coin in 1972 served the longest as President of the United States of America?,Franklin Delano Roosevelt (12 years).,https://en.wikipedia.org/wiki/Penny_(United_States_coin),https://en.wikipedia.org/wiki/Nickel_(United_States_coin),https://en.wikipedia.org/wiki/Dime_(United_States_coin),https://en.wikipedia.org/wiki/Quarter_(United_States_coin),https://en.wikipedia.org/wiki/Half_dollar_(United_States_coin),https://en.wikipedia.org/wiki/Dollar_coin_(United_States),https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Penny_(United_States_coin)', 'https://en.wikipedia.org/wiki/Nickel_(United_States_coin)', 'https://en.wikipedia.org/wiki/Dime_(United_States_coin)', 'https://en.wikipedia.org/wiki/Quarter_(United_States_coin)', 'https://en.wikipedia.org/wiki/Half_dollar_(United_States_coin)', 'https://en.wikipedia.org/wiki/Dollar_coin_(United_States)', 'https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States']" +269,"Tiny Tina's Wonderlands' developer released a game 10 years prior to Tiny Tina's Wonderlands' release, this game was released on PC and Consoles. What is the name of the central antagonist of that game?",Handsome Jack.,https://en.wikipedia.org/wiki/Tiny_Tina%27s_Wonderlands,https://en.wikipedia.org/wiki/Gearbox_Software,https://en.wikipedia.org/wiki/Borderlands_2,,,,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Tiny_Tina%27s_Wonderlands', 'https://en.wikipedia.org/wiki/Gearbox_Software', 'https://en.wikipedia.org/wiki/Borderlands_2']" +270,"In 2023, Beavers are known to be the second-largest living rodents. The largest extant rodent species are natively most prevalent in a single continent. What is the largest country in this continent?",Brazil.,https://en.wikipedia.org/wiki/Beaver,https://en.wikipedia.org/wiki/Hydrochoerus,https://en.wikipedia.org/wiki/Capybara,https://en.wikipedia.org/wiki/South_America,,,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Beaver', 'https://en.wikipedia.org/wiki/Hydrochoerus', 'https://en.wikipedia.org/wiki/Capybara', 'https://en.wikipedia.org/wiki/South_America']" +271,Which family duo both made an appearance at the 2017 Billboard Music Awards and starred together on a Disney Channel Original Series?,Miley Cyrus and Billy Ray Cyrus,https://en.wikipedia.org/wiki/2017_Billboard_Music_Awards,https://en.wikipedia.org/wiki/List_of_programs_broadcast_by_Disney_Channel,https://en.wikipedia.org/wiki/Hannah_Montana,,,,,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/2017_Billboard_Music_Awards', 'https://en.wikipedia.org/wiki/List_of_programs_broadcast_by_Disney_Channel', 'https://en.wikipedia.org/wiki/Hannah_Montana']" +272,"Two seasons after Demar Derozan was traded to the San Antonio Spurs, who was the leading scorer for the fifth place team in the Western Conference?",Luka Dončić,https://en.wikipedia.org/wiki/DeMar_DeRozan,https://en.wikipedia.org/wiki/2020%E2%80%9321_NBA_season,https://en.wikipedia.org/wiki/2020%E2%80%9321_Dallas_Mavericks_season,,,,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/DeMar_DeRozan', 'https://en.wikipedia.org/wiki/2020%E2%80%9321_NBA_season', 'https://en.wikipedia.org/wiki/2020%E2%80%9321_Dallas_Mavericks_season']" +273,What William Wyler movie debuted the same year that the chairman for the 1982–83 Wolverhampton Wanderers season was born?,Jezebel,https://en.wikipedia.org/wiki/1982%E2%80%9383_Wolverhampton_Wanderers_F.C._season,https://en.wikipedia.org/wiki/Derek_Dougan,https://en.wikipedia.org/wiki/William_Wyler,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/1982%E2%80%9383_Wolverhampton_Wanderers_F.C._season', 'https://en.wikipedia.org/wiki/Derek_Dougan', 'https://en.wikipedia.org/wiki/William_Wyler']" +274,"How many more letters does the name of the city that the director of ""Whiplash"" (2014) was born in have compared the name of the city in which the film first premiered?",2,https://en.wikipedia.org/wiki/Whiplash_(2014_film)#,https://en.wikipedia.org/wiki/Damien_Chazelle,https://en.wikipedia.org/wiki/2014_Sundance_Film_Festival,,,,,,,,,Post processing,"['https://en.wikipedia.org/wiki/Whiplash_(2014_film)#', 'https://en.wikipedia.org/wiki/Damien_Chazelle', 'https://en.wikipedia.org/wiki/2014_Sundance_Film_Festival']" +275,The Brihadeeswarar Temple was built by an Indian emperor. The emperor’s only sister‘s husband is a king of another Dynasty. Name the Dynasty and state how many known kings ruled within that Dynasty.,Bana Kingdom. 10 Kings.,https://en.wikipedia.org/wiki/Brihadisvara_Temple,https://en.wikipedia.org/wiki/Rajaraja_I,https://en.wikipedia.org/wiki/Kundavai_Pir%C4%81ttiy%C4%81r,https://en.wikipedia.org/wiki/Bana_Kingdom,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Brihadisvara_Temple', 'https://en.wikipedia.org/wiki/Rajaraja_I', 'https://en.wikipedia.org/wiki/Kundavai_Pir%C4%81ttiy%C4%81r', 'https://en.wikipedia.org/wiki/Bana_Kingdom']" +276,"Of the 'Big Four' of Thrash Metal, whose debut full-length was released first?",Metallica,https://en.wikipedia.org/wiki/Thrash_metal,https://en.wikipedia.org/wiki/Metallica,https://en.wikipedia.org/wiki/Slayer,https://en.wikipedia.org/wiki/Megadeth,https://en.wikipedia.org/wiki/Anthrax_(American_band),https://en.wikipedia.org/wiki/Kill_%27Em_All,https://en.wikipedia.org/wiki/Show_No_Mercy,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Thrash_metal', 'https://en.wikipedia.org/wiki/Metallica', 'https://en.wikipedia.org/wiki/Slayer', 'https://en.wikipedia.org/wiki/Megadeth', 'https://en.wikipedia.org/wiki/Anthrax_(American_band)', 'https://en.wikipedia.org/wiki/Kill_%27Em_All', 'https://en.wikipedia.org/wiki/Show_No_Mercy']" +277,What is the date of the movie directed by Gordon Douglas that featured an American decathlete who was a part of the 1984 Summer Olympic Torch Relay and first African American to light the cauldron?,"April 2, 1961",https://en.wikipedia.org/wiki/1984_Summer_Olympics,https://en.wikipedia.org/wiki/Rafer_Johnson,https://en.wikipedia.org/wiki/The_Sins_of_Rachel_Cade,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/1984_Summer_Olympics', 'https://en.wikipedia.org/wiki/Rafer_Johnson', 'https://en.wikipedia.org/wiki/The_Sins_of_Rachel_Cade']" +278,A 1986 song by Peter Gabriel shares the name with a tool. What is the name of the tool and how does it look?,"Sledgehammer: A tool with a large, flat, often metal head, attached to a long handle.",https://en.wikipedia.org/wiki/Peter_Gabriel,https://en.wikipedia.org/wiki/So_(album),https://en.wikipedia.org/wiki/Sledgehammer,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Peter_Gabriel', 'https://en.wikipedia.org/wiki/So_(album)', 'https://en.wikipedia.org/wiki/Sledgehammer']" +279,How old was the author of Dragon Ball when the manga was first released?,29,https://en.wikipedia.org/wiki/Dragon_Ball,https://en.wikipedia.org/wiki/Akira_Toriyama,,,,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/Dragon_Ball', 'https://en.wikipedia.org/wiki/Akira_Toriyama']" +280,I'm thinking of a dam. Here are some clues: -It had two official names in its history. -Construction was ordered to begin by the husband of the President of The Girl Scouts of the USA in 1936.,Hoover Dam (briefly known as Boulder Dam),https://en.wikipedia.org/wiki/Girl_Scouts_of_the_USA#Presidents,https://en.wikipedia.org/wiki/Lou_Henry_Hoover,https://en.wikipedia.org/wiki/Herbert_Hoover,https://en.wikipedia.org/wiki/Hoover_Dam,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Girl_Scouts_of_the_USA#Presidents', 'https://en.wikipedia.org/wiki/Lou_Henry_Hoover', 'https://en.wikipedia.org/wiki/Herbert_Hoover', 'https://en.wikipedia.org/wiki/Hoover_Dam']" +281,I am the narrator character in the final novel written by the recipient of the 1963 Hugo Award for Best Novel. Who am I?,Angel Archer.,https://en.wikipedia.org/wiki/Hugo_Award_for_Best_Novel,https://en.wikipedia.org/wiki/Philip_K._Dick#Career,https://en.wikipedia.org/wiki/The_Transmigration_of_Timothy_Archer,,,,,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Hugo_Award_for_Best_Novel', 'https://en.wikipedia.org/wiki/Philip_K._Dick#Career', 'https://en.wikipedia.org/wiki/The_Transmigration_of_Timothy_Archer']" +282,Who developed the first effective vaccine against the disease that killed the father of a famous Hungarian composer born in 1811?,Almroth Edward Wright,https://en.wikipedia.org/wiki/List_of_Hungarian_composers,https://en.wikipedia.org/wiki/Franz_Liszt,https://en.wikipedia.org/wiki/Typhoid_fever,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/List_of_Hungarian_composers', 'https://en.wikipedia.org/wiki/Franz_Liszt', 'https://en.wikipedia.org/wiki/Typhoid_fever']" +283,"What was the last prose book written by the poet who referred to Wyndham Lewis as ""that lonely old volcano of the Right.""?",Forewords and Afterwords (1973),https://en.wikipedia.org/wiki/Wyndham_Lewis,https://en.wikipedia.org/wiki/W._H._Auden,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Wyndham_Lewis', 'https://en.wikipedia.org/wiki/W._H._Auden']" +284,What was the age difference between the inventor of Barbie and the inventor of Hot Wheels?,6 months and 26 days.,https://en.wikipedia.org/wiki/Barbie,https://en.wikipedia.org/wiki/Hot_Wheels,https://en.wikipedia.org/wiki/Ruth_Handler,https://en.wikipedia.org/wiki/Elliot_Handler,,,,,,,,Numerical reasoning | Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/Barbie', 'https://en.wikipedia.org/wiki/Hot_Wheels', 'https://en.wikipedia.org/wiki/Ruth_Handler', 'https://en.wikipedia.org/wiki/Elliot_Handler']" +285,This deceased American singer and songwriter who's song about censorship was nominated for a Grammy Award (for Best Hard Rock Performance) in the same year that was designated as International Space Year by the United Nations. How many letters are in his name?,11 (Layne Staley),https://en.wikipedia.org/wiki/Layne_Staley,https://en.wikipedia.org/wiki/Man_in_the_Box,https://en.wikipedia.org/wiki/1992,https://en.wikipedia.org/wiki/Grammy_Award_for_Best_Hard_Rock_Performance,,,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Layne_Staley', 'https://en.wikipedia.org/wiki/Man_in_the_Box', 'https://en.wikipedia.org/wiki/1992', 'https://en.wikipedia.org/wiki/Grammy_Award_for_Best_Hard_Rock_Performance']" +286,"In 2020, the Italian curling team won the Moscow Classic annual bonspiel. One of the members of that four-person team was not born in Italy. As of August 1, 2024, what are the colors of the national flag of their country of origin?","Red and White (Switzerland, Joël Retornaz)",https://en.wikipedia.org/wiki/Red_Square_Classic,https://en.wikipedia.org/wiki/Jo%C3%ABl_Retornaz,https://en.wikipedia.org/wiki/Amos_Mosaner,https://en.wikipedia.org/wiki/Sebastiano_Arman,https://en.wikipedia.org/wiki/Simone_Gonin,https://en.wikipedia.org/wiki/Flag_of_Switzerland,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Red_Square_Classic', 'https://en.wikipedia.org/wiki/Jo%C3%ABl_Retornaz', 'https://en.wikipedia.org/wiki/Amos_Mosaner', 'https://en.wikipedia.org/wiki/Sebastiano_Arman', 'https://en.wikipedia.org/wiki/Simone_Gonin', 'https://en.wikipedia.org/wiki/Flag_of_Switzerland']" +287,When was the song Cold Blow and the Rainy Night featured on a daily list of folk songs recorded by Jon Boden?,"February 5, 2011",https://en.wikipedia.org/wiki/Cold_Blow_and_the_Rainy_Night_(song),https://en.wikipedia.org/wiki/A_Folk_Song_A_Day,,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/Cold_Blow_and_the_Rainy_Night_(song)', 'https://en.wikipedia.org/wiki/A_Folk_Song_A_Day']" +288,What album by the band named 'The Band' was released on the same day as the Nuclear Non-Proliferation Treaty was opened for signature?,Music from Big Pink,https://en.wikipedia.org/wiki/Treaty_on_the_Non-Proliferation_of_Nuclear_Weapons,https://en.wikipedia.org/wiki/The_Band#Discography,https://en.wikipedia.org/wiki/Music_from_Big_Pink,,,,,,,,,Tabular reasoning,"['https://en.wikipedia.org/wiki/Treaty_on_the_Non-Proliferation_of_Nuclear_Weapons', 'https://en.wikipedia.org/wiki/The_Band#Discography', 'https://en.wikipedia.org/wiki/Music_from_Big_Pink']" +289,"In the year the first UK winner of Big Brother was born, who was the Prime Minister?",Sir Edward Heath,https://en.wikipedia.org/wiki/Big_Brother_(franchise),https://en.wikipedia.org/wiki/Craig_Phillips,https://en.wikipedia.org/wiki/Edward_Heath,,,,,,,,,Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/Big_Brother_(franchise)', 'https://en.wikipedia.org/wiki/Craig_Phillips', 'https://en.wikipedia.org/wiki/Edward_Heath']" +290,"The lead actress of the television show The Good Place who played protagonist Eleanor Shellstop, is married to a man who has been in many TV series and Films in his career. What was the name of the character he provided the voice for in a 2021 animated movie?",Ruben,https://en.wikipedia.org/wiki/The_Good_Place,https://en.wikipedia.org/wiki/Kristen_Bell,https://en.wikipedia.org/wiki/Dax_Shepard#Film,https://en.wikipedia.org/wiki/PAW_Patrol:_The_Movie,,,,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/The_Good_Place', 'https://en.wikipedia.org/wiki/Kristen_Bell', 'https://en.wikipedia.org/wiki/Dax_Shepard#Film', 'https://en.wikipedia.org/wiki/PAW_Patrol:_The_Movie']" +291,Andy Warhol made the painting Marilyn Diptych from a publicity photo of Marilyn Monroe for a film. What was the name of Monroe's character in that film?,Rose Loomis,https://en.wikipedia.org/wiki/Marilyn_Diptych,https://en.wikipedia.org/wiki/Niagara_(1953_film),,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Marilyn_Diptych', 'https://en.wikipedia.org/wiki/Niagara_(1953_film)']" +292,"Out of every team that has won at least one official global Valorant tournament, which team has also won at least one League of Legends World Championship and at least one CS:GO Major Championship as of 2024?",Fnatic,https://en.wikipedia.org/wiki/Valorant_Champions_Tour,https://en.wikipedia.org/wiki/League_of_Legends_World_Championship,https://en.wikipedia.org/wiki/Counter-Strike_Major_Championships,,,,,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Valorant_Champions_Tour', 'https://en.wikipedia.org/wiki/League_of_Legends_World_Championship', 'https://en.wikipedia.org/wiki/Counter-Strike_Major_Championships']" +293,How old was the New-York born comic book writer who created the character Catwoman when he died?,83,https://en.wikipedia.org/wiki/Catwoman,https://en.wikipedia.org/wiki/Bill_Finger,https://en.wikipedia.org/wiki/Bob_Kane,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Catwoman', 'https://en.wikipedia.org/wiki/Bill_Finger', 'https://en.wikipedia.org/wiki/Bob_Kane']" +294,"How long did Steve Jobs live, percentage wise, compared to the average lifespan of a person in the United States in 1984? Round each number to the nearest whole integer before calculating your answer. Then round your answer the nearest hundredth.",75.68%,https://en.wikipedia.org/wiki/List_of_countries_by_past_life_expectancy,https://en.wikipedia.org/wiki/Steve_Jobs,,,,,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints | Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/List_of_countries_by_past_life_expectancy', 'https://en.wikipedia.org/wiki/Steve_Jobs']" +295,What Indian Empire was overrun by Huns 24 years after conventionally accepted date of the fall of the Western Roman Empire?,Gupta Empire,https://en.wikipedia.org/wiki/Roman_Empire,https://en.wikipedia.org/wiki/List_of_ancient_great_powers,https://en.wikipedia.org/wiki/Gupta_Empire,,,,,,,,,Numerical reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Roman_Empire', 'https://en.wikipedia.org/wiki/List_of_ancient_great_powers', 'https://en.wikipedia.org/wiki/Gupta_Empire']" +296,Which album by the band Paramore came out after the death of Michael Jackson and before the death of Amy Winehouse?,Brand New Eyes,https://en.wikipedia.org/wiki/Michael_Jackson,https://en.wikipedia.org/wiki/Amy_Winehouse,https://en.wikipedia.org/wiki/Paramore,https://en.wikipedia.org/wiki/Brand_New_Eyes,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Michael_Jackson', 'https://en.wikipedia.org/wiki/Amy_Winehouse', 'https://en.wikipedia.org/wiki/Paramore', 'https://en.wikipedia.org/wiki/Brand_New_Eyes']" +297,What song topped Billboard magazine's Top 30 chart in the same year that the first documented case of a person being hit and surviving a meteorite occurred?,Little Things Mean a Lot by Kitty Kallen,https://en.wikipedia.org/wiki/Ann_Elizabeth_Fowler_Hodges,https://en.wikipedia.org/wiki/Billboard_year-end_top_30_singles_of_1954,,,,,,,,,,Tabular reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Ann_Elizabeth_Fowler_Hodges', 'https://en.wikipedia.org/wiki/Billboard_year-end_top_30_singles_of_1954']" +298,How much bigger was Ford Motor Company's market share of US Sales the year Matthew McConaughey won his first MTV Movie Award than the year Zac Efron won his?,10.30%,https://en.wikipedia.org/wiki/Ford_Motor_Company#Sales_numbers,https://en.wikipedia.org/wiki/List_of_awards_and_nominations_received_by_Matthew_McConaughey,https://en.wikipedia.org/wiki/Zac_Efron#Awards_and_nominations,,,,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Ford_Motor_Company#Sales_numbers', 'https://en.wikipedia.org/wiki/List_of_awards_and_nominations_received_by_Matthew_McConaughey', 'https://en.wikipedia.org/wiki/Zac_Efron#Awards_and_nominations']" +299,"At the time of their publication, who was the editor of the magazine that published the two short stories that first developed the World Urslula LeGuinn would use for the first of the two books Harold Bloom would later call her masterpieces?","Cele Goldsmith (or Cele Goldsmith Lalli, once she married)",https://en.wikipedia.org/wiki/Ursula_K._Le_Guin,https://en.wikipedia.org/wiki/A_Wizard_of_Earthsea,https://en.wikipedia.org/wiki/The_Rule_of_Names,https://en.wikipedia.org/wiki/Fantastic_(magazine),,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Ursula_K._Le_Guin', 'https://en.wikipedia.org/wiki/A_Wizard_of_Earthsea', 'https://en.wikipedia.org/wiki/The_Rule_of_Names', 'https://en.wikipedia.org/wiki/Fantastic_(magazine)']" +300,Who lit the Olympic cauldron in the Summer Games immediately following the release of Van Halen's sixth studio album?,"Rafer Johnson lit the Olympic cauldron for the 1984 Summer Olympics, which began July 28, 1984, six months after the release of 1984 (stylized in Roman numerals as MCMLXXXIV), Van Halen's sixth studio album.",https://en.wikipedia.org/wiki/1984_(Van_Halen_album),https://en.wikipedia.org/wiki/1984_Summer_Olympics,https://en.wikipedia.org/wiki/Van_Halen_discography,,,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/1984_(Van_Halen_album)', 'https://en.wikipedia.org/wiki/1984_Summer_Olympics', 'https://en.wikipedia.org/wiki/Van_Halen_discography']" +301,"By mass, what is the largest species out of the closest living relatives to manatees and dugongs?",African bush elephant (Loxodonta africana),https://en.wikipedia.org/wiki/Manatee,https://en.wikipedia.org/wiki/Sirenia,https://en.wikipedia.org/wiki/Tethytheria,https://en.wikipedia.org/wiki/Proboscidea,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Manatee', 'https://en.wikipedia.org/wiki/Sirenia', 'https://en.wikipedia.org/wiki/Tethytheria', 'https://en.wikipedia.org/wiki/Proboscidea']" +302,"At the time of the END of the 2016 French Open, what was the head-to-head between the Men's World No. 1 at that time, and the runner-up of that year's Australian Open in the Men's Singles category?",Novak Djokovic 24-10 Andy Murray,https://en.wikipedia.org/wiki/2016_French_Open,https://en.wikipedia.org/wiki/List_of_ATP_number_1_ranked_singles_tennis_players,https://en.wikipedia.org/wiki/2016_Australian_Open_%E2%80%93_Men%27s_singles,https://en.wikipedia.org/wiki/Djokovic%E2%80%93Murray_rivalry,,,,,,,,Tabular reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/2016_French_Open', 'https://en.wikipedia.org/wiki/List_of_ATP_number_1_ranked_singles_tennis_players', 'https://en.wikipedia.org/wiki/2016_Australian_Open_%E2%80%93_Men%27s_singles', 'https://en.wikipedia.org/wiki/Djokovic%E2%80%93Murray_rivalry']" +303,"How many more full seasons already existed of the longest running cartoon television series ever in the United States of all time than seasons of ""X-Men: The Animated Series"" when Barack Obama was elected president of the United States. Show me your reasoning using a mathematical equation. Write out your answer in words, but give me the mathematical equation in numerical form. ",Fourteen 19 - 5 = 14,https://en.wikipedia.org/wiki/Barack_Obama,https://en.wikipedia.org/wiki/X-Men:_The_Animated_Series,https://en.wikipedia.org/wiki/The_Simpsons,https://en.wikipedia.org/wiki/List_of_animated_television_series_by_episode_count,,,,,,,,Numerical reasoning | Multiple constraints | Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/Barack_Obama', 'https://en.wikipedia.org/wiki/X-Men:_The_Animated_Series', 'https://en.wikipedia.org/wiki/The_Simpsons', 'https://en.wikipedia.org/wiki/List_of_animated_television_series_by_episode_count']" +304,What year was the person born who was Prime Minister of The United Kingdom during the year that the first African American ran for president of the United States?,1792,"https://en.wikipedia.org/wiki/List_of_African-American_United_States_presidential_and_vice_presidential_candidates#:~:text=In%201848%2C%20Frederick%20Douglass%20became,major%20party%2C%20namely%20the%20Democrats.",https://en.wikipedia.org/wiki/List_of_prime_ministers_of_the_United_Kingdom,"https://en.wikipedia.org/wiki/John_Russell,_1st_Earl_Russell",,,,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/List_of_African-American_United_States_presidential_and_vice_presidential_candidates#:~:text=In%201848%2C%20Frederick%20Douglass%20became,major%20party%2C%20namely%20the%20Democrats.', 'https://en.wikipedia.org/wiki/List_of_prime_ministers_of_the_United_Kingdom', 'https://en.wikipedia.org/wiki/John_Russell,_1st_Earl_Russell']" +305,What is the shortest possible abbreviation in the United States for the last to be discovered of the three antileprosy drugs on the World Health Organization's List of Essential Medicines?,R,https://en.wikipedia.org/wiki/WHO_Model_List_of_Essential_Medicines,https://en.wikipedia.org/wiki/Clofazimine,https://en.wikipedia.org/wiki/Dapsone,https://en.wikipedia.org/wiki/Rifampicin,,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/WHO_Model_List_of_Essential_Medicines', 'https://en.wikipedia.org/wiki/Clofazimine', 'https://en.wikipedia.org/wiki/Dapsone', 'https://en.wikipedia.org/wiki/Rifampicin']" +306,What is the 7th track on the 3rd album released by the band formed in 1981 and fronted by Jordan Luck?,As I Love You,https://en.wikipedia.org/wiki/Jordan_Luck,https://en.wikipedia.org/wiki/The_Exponents,https://en.wikipedia.org/wiki/Amplifier_(Dance_Exponents_album)#Track_listing,,,,,,,,,Tabular reasoning,"['https://en.wikipedia.org/wiki/Jordan_Luck', 'https://en.wikipedia.org/wiki/The_Exponents', 'https://en.wikipedia.org/wiki/Amplifier_(Dance_Exponents_album)#Track_listing']" +307,This athlete was the first man to run the 100 metres in under 10 seconds at an Olympic Games. Which NFL team was he drafted by?,Miami Dolphins (Jim Hines),https://en.wikipedia.org/wiki/100_metres,https://en.wikipedia.org/wiki/Jim_Hines,,,,,,,,,,Post processing,"['https://en.wikipedia.org/wiki/100_metres', 'https://en.wikipedia.org/wiki/Jim_Hines']" +308,Which member of the Wu-Tang Clan was born on Staten Island?,Ghostface Killah,https://en.wikipedia.org/wiki/Wu-Tang_Clan,https://en.wikipedia.org/wiki/RZA,https://en.wikipedia.org/wiki/GZA,https://en.wikipedia.org/wiki/Method_Man,https://en.wikipedia.org/wiki/Raekwon,https://en.wikipedia.org/wiki/Ghostface_Killah,https://en.wikipedia.org/wiki/Inspectah_Deck,https://en.wikipedia.org/wiki/U-God,https://en.wikipedia.org/wiki/Masta_Killa,https://en.wikipedia.org/wiki/Ol%27_Dirty_Bastard,https://en.wikipedia.org/wiki/Cappadonna,Multiple constraints,"['https://en.wikipedia.org/wiki/Wu-Tang_Clan', 'https://en.wikipedia.org/wiki/RZA', 'https://en.wikipedia.org/wiki/GZA', 'https://en.wikipedia.org/wiki/Method_Man', 'https://en.wikipedia.org/wiki/Raekwon', 'https://en.wikipedia.org/wiki/Ghostface_Killah', 'https://en.wikipedia.org/wiki/Inspectah_Deck', 'https://en.wikipedia.org/wiki/U-God', 'https://en.wikipedia.org/wiki/Masta_Killa', 'https://en.wikipedia.org/wiki/Ol%27_Dirty_Bastard', 'https://en.wikipedia.org/wiki/Cappadonna']" +309,"The New Zealand author of the children's book ""Bobby the Littlest War Hero"" won a Queen's Service Medal how many years before it was renamed the King's Service Medal?",Glyn Harper won the medal 12 years before it was renamed.,https://en.wikipedia.org/wiki/Glyn_Harper,https://en.wikipedia.org/wiki/King%27s_Service_Medal,,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/Glyn_Harper', 'https://en.wikipedia.org/wiki/King%27s_Service_Medal']" +310,"I'm thinking of a sport that a University in the midwest USA won the championship for in 1974. The University suffered a defeat in football in 2005, losing to the University where the 18th honourary citizen of Beijing obtained his Bachelor's degree. What is the sport I'm thinking of?",Cross country,https://en.wikipedia.org/wiki/List_of_honorary_citizens_of_Beijing,https://en.wikipedia.org/wiki/George_E._Killian,https://en.wikipedia.org/wiki/Ohio_Northern_University,https://en.wikipedia.org/wiki/University_of_Mount_Union,https://en.wikipedia.org/wiki/NCAA_Division_III_men%27s_cross_country_championships,,,,,,,Tabular reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/List_of_honorary_citizens_of_Beijing', 'https://en.wikipedia.org/wiki/George_E._Killian', 'https://en.wikipedia.org/wiki/Ohio_Northern_University', 'https://en.wikipedia.org/wiki/University_of_Mount_Union', 'https://en.wikipedia.org/wiki/NCAA_Division_III_men%27s_cross_country_championships']" +311,"Who was elected the United States President in the same year that a ship, which was named after the snake that some argue killed Cleopatra, wrecked after the United Kingdom captured it from France?",Andrew Jackson,https://en.wikipedia.org/wiki/Cleopatra,https://en.wikipedia.org/wiki/HMS_Asp,https://en.wikipedia.org/wiki/French_brig_Serpent_(1807),https://en.wikipedia.org/wiki/1828_United_States_presidential_election,,,,,,,,Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Cleopatra', 'https://en.wikipedia.org/wiki/HMS_Asp', 'https://en.wikipedia.org/wiki/French_brig_Serpent_(1807)', 'https://en.wikipedia.org/wiki/1828_United_States_presidential_election']" +312,What major historical event began 171 years before the first European Nations' Cup in the country that hosted the tournament?,The French Revolution,https://en.wikipedia.org/wiki/UEFA_European_Championship,https://en.wikipedia.org/wiki/1960_European_Nations%27_Cup,https://en.wikipedia.org/wiki/France,,,,,,,,,Numerical reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/UEFA_European_Championship', 'https://en.wikipedia.org/wiki/1960_European_Nations%27_Cup', 'https://en.wikipedia.org/wiki/France']" +313,"For how long was the ""Father of the National Parks"", the first president to declare a national park, and the president who declared the most national parks all alive at the same time?","26 years, 8 months, and 26 days.",https://en.wikipedia.org/wiki/List_of_national_parks_of_the_United_States,https://en.wikipedia.org/wiki/John_Muir,https://en.wikipedia.org/wiki/Ulysses_S._Grant,https://en.wikipedia.org/wiki/Theodore_Roosevelt,,,,,,,,Numerical reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/List_of_national_parks_of_the_United_States', 'https://en.wikipedia.org/wiki/John_Muir', 'https://en.wikipedia.org/wiki/Ulysses_S._Grant', 'https://en.wikipedia.org/wiki/Theodore_Roosevelt']" +314,"Using data from the year 2020, if you were to combine the permanent human populations of Aukland Island, Rose Island, and Budelli Island, how many total people would you have?",1,https://en.wikipedia.org/wiki/Auckland_Island#Human_presence_on_the_island,https://en.wikipedia.org/wiki/Rose_Island_(New_Zealand),https://en.wikipedia.org/wiki/Budelli,,,,,,,,,Numerical reasoning | Multiple constraints | Post processing,"['https://en.wikipedia.org/wiki/Auckland_Island#Human_presence_on_the_island', 'https://en.wikipedia.org/wiki/Rose_Island_(New_Zealand)', 'https://en.wikipedia.org/wiki/Budelli']" +315,"I'm thinking of a famous zoologist who is alumni of Upsala University and is credited with formally describing the African Gray parrot. He published his first edition of a certain book with a Latin title in the Netherlands, in 1735, while attending the university. This book contained his system of classifying animals, and it was key to the book's title. What is the modern-day term for the classification system he famously wrote about?",Binomial nomenclature,https://en.wikipedia.org/wiki/Grey_parrot,https://en.wikipedia.org/wiki/Carl_Linnaeus,https://en.wikipedia.org/wiki/Systema_Naturae,,,,,,,,,Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/Grey_parrot', 'https://en.wikipedia.org/wiki/Carl_Linnaeus', 'https://en.wikipedia.org/wiki/Systema_Naturae']" +316,"How many copies of Coit Tower would have to be stacked on top of the Willis Tower in order to exceed the height of the Chicago Spire, had it been completed? Give your answer as the lowest possible whole number of Coit Towers.",3,https://en.wikipedia.org/wiki/Willis_Tower,https://en.wikipedia.org/wiki/Chicago_Spire,https://en.wikipedia.org/wiki/Coit_Tower,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/Willis_Tower', 'https://en.wikipedia.org/wiki/Chicago_Spire', 'https://en.wikipedia.org/wiki/Coit_Tower']" +317,"The man who owned The Washington Post in 1932 broke the trust established by his father to gain control of the paper, only to run into the ground, lose it, and get committed to a psychiatric hospital where he died. How many days after his death did the man who purchased it in 1933 die?",6563 days,https://en.wikipedia.org/wiki/The_Washington_Post,https://en.wikipedia.org/wiki/Edward_Beale_McLean,https://en.wikipedia.org/wiki/Eugene_Meyer_(financier),,,,,,,,,Numerical reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/The_Washington_Post', 'https://en.wikipedia.org/wiki/Edward_Beale_McLean', 'https://en.wikipedia.org/wiki/Eugene_Meyer_(financier)']" +318,How tall does the flower get on the very first orchid David L. Jones (botanist) described and published in his career?,20–30 millimetres (0.8–1 in),https://en.wikipedia.org/wiki/David_L._Jones_(botanist),https://en.wikipedia.org/wiki/Pterostylis_aestiva#Description,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/David_L._Jones_(botanist)', 'https://en.wikipedia.org/wiki/Pterostylis_aestiva#Description']" +319,"The founder of the new religious movement Thelema once attempted to summit the third highest mountain in the world, but failed. How many years did it take before this feat was successfully accomplished? ",50,https://en.wikipedia.org/wiki/Thelema,https://en.wikipedia.org/wiki/Aleister_Crowley,https://en.wikipedia.org/wiki/List_of_highest_mountains_on_Earth,https://en.wikipedia.org/wiki/1905_Kanchenjunga_expedition,,,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints | Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/Thelema', 'https://en.wikipedia.org/wiki/Aleister_Crowley', 'https://en.wikipedia.org/wiki/List_of_highest_mountains_on_Earth', 'https://en.wikipedia.org/wiki/1905_Kanchenjunga_expedition']" +320,Human activity by employees of a certain company caused Lake Peigneur in Louisiana to become completely drained of water. How many years passed between the founding of that company and Lake Peigneur's collapse? Convert this number to Roman numerals.,LXXVIII,https://en.wikipedia.org/wiki/Lake_Peigneur,https://en.wikipedia.org/wiki/Texaco,,,,,,,,,,Numerical reasoning | Multiple constraints | Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/Lake_Peigneur', 'https://en.wikipedia.org/wiki/Texaco']" +321,"Emory Kristof, the photographer who helped find the Titanic's wreckage, shares an alma mater with a co-creator of Seinfeld. How many years separate their births?",5 years,https://en.wikipedia.org/wiki/Emory_Kristof,"https://en.wikipedia.org/wiki/University_of_Maryland,_College_Park#Notable_alumni",https://en.wikipedia.org/wiki/Larry_David,,,,,,,,,Numerical reasoning | Tabular reasoning | Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/Emory_Kristof', 'https://en.wikipedia.org/wiki/University_of_Maryland,_College_Park#Notable_alumni', 'https://en.wikipedia.org/wiki/Larry_David']" +322,Who manufactured the kits Tomáš Pekhart wore in 2008 in his senior career?,Puma and Umbro,https://en.wikipedia.org/wiki/Tom%C3%A1%C5%A1_Pekhart,https://en.wikipedia.org/wiki/Tottenham_Hotspur_F.C.,https://en.wikipedia.org/wiki/Southampton_F.C.,,,,,,,,,Tabular reasoning,"['https://en.wikipedia.org/wiki/Tom%C3%A1%C5%A1_Pekhart', 'https://en.wikipedia.org/wiki/Tottenham_Hotspur_F.C.', 'https://en.wikipedia.org/wiki/Southampton_F.C.']" +323,"How many years before Starbucks introduced juice-blended Frappuccinos, was the company that is accredited for inventing the Frappuccino, founded?",31 years,https://en.wikipedia.org/wiki/George_Howell_(entrepreneur)#The_Coffee_Connection,https://en.wikipedia.org/wiki/Frappuccino,,,,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/George_Howell_(entrepreneur)#The_Coffee_Connection', 'https://en.wikipedia.org/wiki/Frappuccino']" +324,"Which Nobel laureates in Physics between 1901 and 1920 made contributions to quantum theory, and what were their specific contributions? List the laureates and their respective contributions.","Several Nobel laureates in Physics between 1901 and 1920 made significant contributions to quantum theory. Here are the laureates and their specific contributions: 1. **Max Planck (1918)** - Planck is considered the father of quantum theory. He introduced the idea of energy quanta and derived Planck's radiation law, which describes the electromagnetic radiation emitted by a black body in thermal equilibrium. 2. **Albert Einstein (1921)** - Although awarded the Nobel Prize in 1921, Einstein made crucial contributions to quantum theory with his explanation of the photoelectric effect, which demonstrated the particle nature of light. 3. **Niels Bohr (1922)** - Bohr received his Nobel Prize for his contributions to our understanding of atomic structure and radiation. His Bohr model of the atom incorporated quantum theory and explained how electrons could have stable orbits. 4. **Johannes Stark (1919)** - Stark was awarded the Nobel Prize for his discovery of the Doppler effect in canal rays and the splitting of spectral lines in electric fields (the Stark effect), both phenomena that provided insights into atomic structure and quantum mechanics. 5. **James Franck and Gustav Hertz (1925)** - Franck and Hertz's Nobel Prize was awarded for their experiments on electron impacts in atoms, which confirmed quantum energy levels within atoms.",https://en.wikipedia.org/wiki/Max_Planck,https://en.wikipedia.org/wiki/Albert_Einstein,https://en.wikipedia.org/wiki/Niels_Bohr,https://en.wikipedia.org/wiki/Johannes_Stark,https://en.wikipedia.org/wiki/James_Franck,https://en.wikipedia.org/wiki/Gustav_Hertz,https://en.wikipedia.org/wiki/List_of_Nobel_laureates_in_Physics,https://en.wikipedia.org/wiki/Quantum_mechanics,https://en.wikipedia.org/wiki/Photoelectric_effect,https://en.wikipedia.org/wiki/Bohr_model,,Numerical reasoning | Multiple constraints | Post processing,"['https://en.wikipedia.org/wiki/Max_Planck', 'https://en.wikipedia.org/wiki/Albert_Einstein', 'https://en.wikipedia.org/wiki/Niels_Bohr', 'https://en.wikipedia.org/wiki/Johannes_Stark', 'https://en.wikipedia.org/wiki/James_Franck', 'https://en.wikipedia.org/wiki/Gustav_Hertz', 'https://en.wikipedia.org/wiki/List_of_Nobel_laureates_in_Physics', 'https://en.wikipedia.org/wiki/Quantum_mechanics', 'https://en.wikipedia.org/wiki/Photoelectric_effect', 'https://en.wikipedia.org/wiki/Bohr_model']" +325,Which Labour MP served for Bury North while Chukka Umunna was the MP for Streatham?,James Frith,https://en.wikipedia.org/wiki/Bury_North_(UK_Parliament_constituency),https://en.wikipedia.org/wiki/Streatham_(UK_Parliament_constituency),,,,,,,,,,Tabular reasoning,"['https://en.wikipedia.org/wiki/Bury_North_(UK_Parliament_constituency)', 'https://en.wikipedia.org/wiki/Streatham_(UK_Parliament_constituency)']" +326,How much older was the songwriter of 'The Twist' when he wrote it than Chubby Checker was when he recorded the song?,12 years,https://en.wikipedia.org/wiki/The_Twist_(song),https://en.wikipedia.org/wiki/Hank_Ballard,https://en.wikipedia.org/wiki/Chubby_Checker,,,,,,,,,Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/The_Twist_(song)', 'https://en.wikipedia.org/wiki/Hank_Ballard', 'https://en.wikipedia.org/wiki/Chubby_Checker']" +327,"As of August 4, 2024, what other idol groups, that are not NCT-related, is the only Canadian NCT member connected to?",SuperM,https://en.wikipedia.org/wiki/NCT_(group)#Members,https://en.wikipedia.org/wiki/Mark_Lee_(singer),,,,,,,,,,Tabular reasoning,"['https://en.wikipedia.org/wiki/NCT_(group)#Members', 'https://en.wikipedia.org/wiki/Mark_Lee_(singer)']" +328,"Of the top two most watched television season finales (as of June 2024), which finale ran the longest in length and by how much?",The MASH finale ran for 52 minutes longer than the Cheers finale.,https://en.wikipedia.org/wiki/Series_finale,"https://en.wikipedia.org/wiki/Goodbye,_Farewell_and_Amen",https://en.wikipedia.org/wiki/One_for_the_Road_(Cheers),,,,,,,,,Numerical reasoning | Tabular reasoning,"['https://en.wikipedia.org/wiki/Series_finale', 'https://en.wikipedia.org/wiki/Goodbye,_Farewell_and_Amen', 'https://en.wikipedia.org/wiki/One_for_the_Road_(Cheers)']" +329,What compass direction (of the 4 cardinal and 4 ordinal directions) is the capital city of the state which houses the Jackson Hole Mountain Resort in relation to the centre of the state?,Southeast,https://en.wikipedia.org/wiki/Jackson_Hole_Mountain_Resort,https://en.wikipedia.org/wiki/Wyoming,"https://en.wikipedia.org/wiki/Cheyenne,_Wyoming",,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Jackson_Hole_Mountain_Resort', 'https://en.wikipedia.org/wiki/Wyoming', 'https://en.wikipedia.org/wiki/Cheyenne,_Wyoming']" +330,"How many years older is the police force that covers the City of London, than the political party that came 4th in the 2019 European Parliament election in Lombardy? Write the number of years in binary",10101110,https://en.wikipedia.org/wiki/City_of_London,https://en.wikipedia.org/wiki/City_of_London_Police,https://en.wikipedia.org/wiki/2019_European_Parliament_election_in_Lombardy,https://en.wikipedia.org/wiki/Forza_Italia_(2013),,,,,,,,Numerical reasoning | Multiple constraints | Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/City_of_London', 'https://en.wikipedia.org/wiki/City_of_London_Police', 'https://en.wikipedia.org/wiki/2019_European_Parliament_election_in_Lombardy', 'https://en.wikipedia.org/wiki/Forza_Italia_(2013)']" +331,"Lauryn Hill has one older brother named Malaney. The year he was born, a famous baseball player who played his final season with the Philadelphia Athletics died in March. That player was elected to the Baseball Hall of Fame. 6 years later after his election, the only person to be inducted to the Baseball Hall of Fame had how many total career wins?",365,https://en.wikipedia.org/wiki/Lauryn_Hill#Early_life,https://en.wikipedia.org/wiki/1972#,https://en.wikipedia.org/wiki/Zack_Wheat#,https://en.wikipedia.org/wiki/List_of_members_of_the_Baseball_Hall_of_Fame,https://en.wikipedia.org/wiki/Pud_Galvin,,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints | Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/Lauryn_Hill#Early_life', 'https://en.wikipedia.org/wiki/1972#', 'https://en.wikipedia.org/wiki/Zack_Wheat#', 'https://en.wikipedia.org/wiki/List_of_members_of_the_Baseball_Hall_of_Fame', 'https://en.wikipedia.org/wiki/Pud_Galvin']" +332,What is an animal that is distantly related to the capybara and has the densest fur of all land-dwelling mammals?,Chinchilla,https://en.wikipedia.org/wiki/Capybara#,https://en.wikipedia.org/wiki/Chinchilla,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Capybara#', 'https://en.wikipedia.org/wiki/Chinchilla']" +333,Who was the United States President when Chile won their first Copa America?,Barack Obama,https://en.wikipedia.org/wiki/List_of_Copa_Am%C3%A9rica_finals#Finals,https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States,,,,,,,,,,Tabular reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/List_of_Copa_Am%C3%A9rica_finals#Finals', 'https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States']" +334,"Which female athlete achieved a ""world's best"" and a ""world record"" for the marathon during Tony Blair's second term as British Prime Minister.",Paula Radcliffe,https://en.wikipedia.org/wiki/Tony_Blair,https://en.wikipedia.org/wiki/Marathon_world_record_progression,,,,,,,,,,Tabular reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/Tony_Blair', 'https://en.wikipedia.org/wiki/Marathon_world_record_progression']" +335,"Who were the key members of the original lineup of the rock band Queen, and what were their primary roles in the band? Which original member of Queen was also in a band named Smile and what year did that band begin and end?","The original lineup of Queen consisted of Freddie Mercury (lead vocals, piano), Brian May (guitar, vocals), Roger Taylor (drums, vocals), and John Deacon (bass guitar). Smile was an English rock band formed in London in 1968 with Brian May and Tim Staffell. It ended in 1970 when Staffell left to join another band, Humpy Bong.",https://en.wikipedia.org/wiki/Queen_(band),https://en.wikipedia.org/wiki/Smile_(band),,,,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/Queen_(band)', 'https://en.wikipedia.org/wiki/Smile_(band)']" +336,"In the town where the most successful American Ace of World War 2 to survive that war was born, there is a college. What was the name and percentage of the largest demographic of full and part time students attending that college in 2020?","Black/African American, 80%.",https://en.wikipedia.org/wiki/List_of_World_War_II_aces_from_the_United_States,https://en.wikipedia.org/wiki/David_McCampbell,"https://en.wikipedia.org/wiki/Bessemer,_Alabama",https://en.wikipedia.org/wiki/Lawson_State_Community_College,,,,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/List_of_World_War_II_aces_from_the_United_States', 'https://en.wikipedia.org/wiki/David_McCampbell', 'https://en.wikipedia.org/wiki/Bessemer,_Alabama', 'https://en.wikipedia.org/wiki/Lawson_State_Community_College']" +337,What geographical commonality do the person who first described Apriona brunneomarginata and painter Wolfgang Hutter share?,They were both born in Vienna.,https://en.wikipedia.org/wiki/Apriona_brunneomarginata,https://en.wikipedia.org/wiki/Stephan_von_Breuning_(entomologist),https://en.wikipedia.org/wiki/Wolfgang_Hutter,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Apriona_brunneomarginata', 'https://en.wikipedia.org/wiki/Stephan_von_Breuning_(entomologist)', 'https://en.wikipedia.org/wiki/Wolfgang_Hutter']" +338,What is population in 1968 of the city that hosted the olympic games in which Lucy Steele competed?,"15,739",https://en.wikipedia.org/wiki/Lucy_Steele,https://en.wikipedia.org/wiki/1992_Winter_Olympics,https://en.wikipedia.org/wiki/Albertville,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Lucy_Steele', 'https://en.wikipedia.org/wiki/1992_Winter_Olympics', 'https://en.wikipedia.org/wiki/Albertville']" +339,There's a comedy club on Sunset in West Hollywood that opened in 1972. One of its founders opened for one of Elvis' comeback performances. Where was this founder's first wife born?,"Marinette, Wisconsin",https://en.wikipedia.org/wiki/The_Comedy_Store,https://en.wikipedia.org/wiki/Sammy_Shore,https://en.wikipedia.org/wiki/Mitzi_Shore,,,,,,,,,Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/The_Comedy_Store', 'https://en.wikipedia.org/wiki/Sammy_Shore', 'https://en.wikipedia.org/wiki/Mitzi_Shore']" +340,"As of August 3, 2024, which band was nominated three times for the Grammy Award for Best Metal Performance and also headlined the Opus Stage at Download Festival 2023?",Ghost,https://en.wikipedia.org/wiki/Grammy_Award_for_Best_Metal_Performance,https://en.wikipedia.org/wiki/Download_Festival,,,,,,,,,,Tabular reasoning,"['https://en.wikipedia.org/wiki/Grammy_Award_for_Best_Metal_Performance', 'https://en.wikipedia.org/wiki/Download_Festival']" +341,"Spryo the Dragon released a sequel just a year after it came out. In the same year Spryo's sequel came out, what popular skateboarding based game hit shelves? ",Tony Hawk's Pro Skater,https://en.wikipedia.org/wiki/Spyro_the_Dragon,https://en.wikipedia.org/wiki/Spyro_2:_Ripto%27s_Rage!,https://en.wikipedia.org/wiki/Tony_Hawk%27s_Pro_Skater,,,,,,,,,Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Spyro_the_Dragon', 'https://en.wikipedia.org/wiki/Spyro_2:_Ripto%27s_Rage!', 'https://en.wikipedia.org/wiki/Tony_Hawk%27s_Pro_Skater']" +342,Who was the president of Kenya when Uduak Amimo took a break from her talk show and the media to pursue her interest in coaching and social impact projects?,Uhuru Kenyatta,https://en.wikipedia.org/wiki/Uduak_Amimo,https://en.wikipedia.org/wiki/President_of_Kenya,,,,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/Uduak_Amimo', 'https://en.wikipedia.org/wiki/President_of_Kenya']" +343,Which month had the third-lowest mean daily minimum temperature (recorded from 1876–1905) in the Japanese city which in 1720 was estimated to be the largest in the world?,December.,https://en.wikipedia.org/wiki/List_of_largest_cities_throughout_history,https://en.wikipedia.org/wiki/Edo,https://en.wikipedia.org/wiki/Tokyo,,,,,,,,,Numerical reasoning | Tabular reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/List_of_largest_cities_throughout_history', 'https://en.wikipedia.org/wiki/Edo', 'https://en.wikipedia.org/wiki/Tokyo']" +344,Which political leader of the countries where the Potsdam agreement and the Wellington Convention were signed was older on the 1st of July 1905?,Richard Seddon,https://en.wikipedia.org/wiki/Potsdam_Agreement,https://en.wikipedia.org/wiki/Wellington_Convention,https://en.wikipedia.org/wiki/List_of_prime_ministers_of_New_Zealand,https://en.wikipedia.org/wiki/List_of_chancellors_of_Germany,https://en.wikipedia.org/wiki/Richard_Seddon,https://en.wikipedia.org/wiki/Bernhard_von_B%C3%BClow,,,,,,Numerical reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Potsdam_Agreement', 'https://en.wikipedia.org/wiki/Wellington_Convention', 'https://en.wikipedia.org/wiki/List_of_prime_ministers_of_New_Zealand', 'https://en.wikipedia.org/wiki/List_of_chancellors_of_Germany', 'https://en.wikipedia.org/wiki/Richard_Seddon', 'https://en.wikipedia.org/wiki/Bernhard_von_B%C3%BClow']" +345,"In the year that the film 'Moana' was released, who was the hitting coach of the team that lost the MLB World Series? ",Ty Van Burkleo,https://en.wikipedia.org/wiki/Moana_(2016_film),https://en.wikipedia.org/wiki/2016_World_Series,https://en.wikipedia.org/wiki/2016_Cleveland_Indians_season,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Moana_(2016_film)', 'https://en.wikipedia.org/wiki/2016_World_Series', 'https://en.wikipedia.org/wiki/2016_Cleveland_Indians_season']" +346,What sea borders the Irish County where the author of the short story Foster was born?,The Irish Sea,https://en.wikipedia.org/wiki/Foster_(short_story),https://en.wikipedia.org/wiki/Claire_Keegan,https://en.wikipedia.org/wiki/County_Wicklow,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Foster_(short_story)', 'https://en.wikipedia.org/wiki/Claire_Keegan', 'https://en.wikipedia.org/wiki/County_Wicklow']" +347,How many times in total did the Argentina women's field hockey team and the Uruguay women's field hockey team enter the Pan American Games and the Olympic Games from 2000 to 2020?,15,https://en.wikipedia.org/wiki/Field_hockey_at_the_Pan_American_Games,https://en.wikipedia.org/wiki/Field_hockey_at_the_Summer_Olympics,,,,,,,,,,Numerical reasoning | Tabular reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/Field_hockey_at_the_Pan_American_Games', 'https://en.wikipedia.org/wiki/Field_hockey_at_the_Summer_Olympics']" +348,"If you add up the birth years of Emma Watson, Daniel Radcliffe and Rupert Grint, what is the sum?",5967,https://en.wikipedia.org/wiki/Daniel_Radcliffe,https://en.wikipedia.org/wiki/Emma_Watson,https://en.wikipedia.org/wiki/Rupert_Grint,,,,,,,,,Numerical reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Daniel_Radcliffe', 'https://en.wikipedia.org/wiki/Emma_Watson', 'https://en.wikipedia.org/wiki/Rupert_Grint']" +349,The Eastern Shawnee Tribe of Oklahoma owns and operates the Indigo Sky Casino. Their tribal headquarters are located in a suburb of the metropolitan area of a historic 2011 tornado that killed 158 people. What is the name of the metropolitan area?,"Joplin, Missouri metropolitan area",https://en.wikipedia.org/wiki/Eastern_Shawnee_Tribe_of_Oklahoma,"https://en.wikipedia.org/wiki/Wyandotte,_Oklahoma",https://en.wikipedia.org/wiki/2011_Joplin_tornado,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Eastern_Shawnee_Tribe_of_Oklahoma', 'https://en.wikipedia.org/wiki/Wyandotte,_Oklahoma', 'https://en.wikipedia.org/wiki/2011_Joplin_tornado']" +350,How many years apart was the founding of Snell & Wilmer from the birth of the man who got 3rd place at the 1954 Masters golf tournament?,16 years,https://en.wikipedia.org/wiki/Snell_&_Wilmer,https://en.wikipedia.org/wiki/1954_Masters_Tournament,https://en.wikipedia.org/wiki/Billy_Joe_Patton,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/Snell_&_Wilmer', 'https://en.wikipedia.org/wiki/1954_Masters_Tournament', 'https://en.wikipedia.org/wiki/Billy_Joe_Patton']" +351,Which other astronaut flew the same type of aircraft during the Korean War as the first man to step on the moon?,John Glenn,https://en.wikipedia.org/wiki/Moon_landing,https://en.wikipedia.org/wiki/Neil_Armstrong,https://en.wikipedia.org/wiki/Grumman_F9F_Panther,https://en.wikipedia.org/wiki/John_Glenn,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Moon_landing', 'https://en.wikipedia.org/wiki/Neil_Armstrong', 'https://en.wikipedia.org/wiki/Grumman_F9F_Panther', 'https://en.wikipedia.org/wiki/John_Glenn']" +352,How many days did it take for World War 2 to end after the death of Alois Burgstaller?,136 days,https://en.wikipedia.org/wiki/Alois_Burgstaller,https://en.wikipedia.org/wiki/World_War_II,,,,,,,,,,Numerical reasoning | Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/Alois_Burgstaller', 'https://en.wikipedia.org/wiki/World_War_II']" +353,Adolf Hitler was born exactly 110 years before which US mass shooting?,Columbine,https://en.wikipedia.org/wiki/Adolf_Hitler,https://en.wikipedia.org/wiki/Mass_shootings_in_the_United_States,https://en.wikipedia.org/wiki/Columbine_High_School_massacre,,,,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Adolf_Hitler', 'https://en.wikipedia.org/wiki/Mass_shootings_in_the_United_States', 'https://en.wikipedia.org/wiki/Columbine_High_School_massacre']" +354,How old was the 31st President of the United States when the second nuclear weapon ever used in warfare was dropped?,70 years old,https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States,https://en.wikipedia.org/wiki/Herbert_Hoover,https://en.wikipedia.org/wiki/Nuclear_weapon,,,,,,,,,Numerical reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States', 'https://en.wikipedia.org/wiki/Herbert_Hoover', 'https://en.wikipedia.org/wiki/Nuclear_weapon']" +355,What is the total number of pages in the first two first-edition books in the Emperyan Series by Rebecca Yarros?,1135,https://en.wikipedia.org/wiki/Fourth_Wing,https://en.wikipedia.org/wiki/Iron_Flame,,,,,,,,,,Numerical reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Fourth_Wing', 'https://en.wikipedia.org/wiki/Iron_Flame']" +356,"As of August 3, 2024, excluding white, what colour features on the flag of the city that houses the racecourse where Danon The Kid made his racing debut?",Red,https://en.wikipedia.org/wiki/Danon_The_Kid,https://en.wikipedia.org/wiki/Hanshin_Racecourse,"https://en.wikipedia.org/wiki/Takarazuka,_Hy%C5%8Dgo",,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Danon_The_Kid', 'https://en.wikipedia.org/wiki/Hanshin_Racecourse', 'https://en.wikipedia.org/wiki/Takarazuka,_Hy%C5%8Dgo']" +357,What was the first skyscraper built in this player's hometown before he became the 160th pick of the 1980's NFL Draft?,The Life & Casualty Tower,https://en.wikipedia.org/wiki/1980_NFL_draft,https://en.wikipedia.org/wiki/Preston_Brown_(wide_receiver),"https://en.wikipedia.org/wiki/Nashville,_Tennessee",https://en.wikipedia.org/wiki/Life_%26_Casualty_Tower,,,,,,,,Tabular reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/1980_NFL_draft', 'https://en.wikipedia.org/wiki/Preston_Brown_(wide_receiver)', 'https://en.wikipedia.org/wiki/Nashville,_Tennessee', 'https://en.wikipedia.org/wiki/Life_%26_Casualty_Tower']" +358,"Regarding the award that the man who created the initial sketch for the Eiffel Tower received, how many of the award models have hanging devices that are in the shape of a crown?",Eight,https://en.wikipedia.org/wiki/Eiffel_Tower,https://en.wikipedia.org/wiki/Maurice_Koechlin,https://en.wikipedia.org/wiki/Legion_of_Honour#Legal_status_and_leadership,,,,,,,,,Numerical reasoning | Tabular reasoning,"['https://en.wikipedia.org/wiki/Eiffel_Tower', 'https://en.wikipedia.org/wiki/Maurice_Koechlin', 'https://en.wikipedia.org/wiki/Legion_of_Honour#Legal_status_and_leadership']" +359,"Excluding blue, what other colour appears on the 2004 flag of the region in which Alastair Galbraith's home city is situated?",Yellow,https://en.wikipedia.org/wiki/Alastair_Galbraith,https://en.wikipedia.org/wiki/Dunedin,https://en.wikipedia.org/wiki/Otago,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Alastair_Galbraith', 'https://en.wikipedia.org/wiki/Dunedin', 'https://en.wikipedia.org/wiki/Otago']" +360,"When the San Francisco 49ers and San Diego Chargers met in Super Bowl XXIX, what was the #1 film at the box office in the U.S.?",Legends of the Fall,"https://en.wikipedia.org/wiki/Super_Bowl_XXIX#:~:text=The%2049ers%20defeated%20the%20Chargers,ravaged%20the%20city%20in%201992.",https://en.wikipedia.org/wiki/List_of_1995_box_office_number-one_films_in_the_United_States,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Super_Bowl_XXIX#:~:text=The%2049ers%20defeated%20the%20Chargers,ravaged%20the%20city%20in%201992.', 'https://en.wikipedia.org/wiki/List_of_1995_box_office_number-one_films_in_the_United_States']" +361,What was the difference in original sale price between Tom Thomson’s “Northern River” and “Northern Lake”,The difference in price was $250.,https://en.wikipedia.org/wiki/Tom_Thomson,https://en.wikipedia.org/wiki/Northern_River_(painting),,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/Tom_Thomson', 'https://en.wikipedia.org/wiki/Northern_River_(painting)']" +362,What NBA team was founded three years before the Apollo 11 crew landed on the moon?,The Chicago Bulls,https://en.wikipedia.org/wiki/Moon_landing,https://en.wikipedia.org/wiki/National_Basketball_Association,,,,,,,,,,Numerical reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/Moon_landing', 'https://en.wikipedia.org/wiki/National_Basketball_Association']" +363,How many more career home runs did the MLB player who had the highest slugging percentage in 1954 have than the player who was the the first African American to play in Major League Baseball?,519,https://en.wikipedia.org/wiki/List_of_Major_League_Baseball_titles_leaders,https://en.wikipedia.org/wiki/Willie_Mays#,https://en.wikipedia.org/wiki/Jackie_Robinson#,,,,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/List_of_Major_League_Baseball_titles_leaders', 'https://en.wikipedia.org/wiki/Willie_Mays#', 'https://en.wikipedia.org/wiki/Jackie_Robinson#']" +364,What are the coordinates for the beach outside the UK that shares its name to a Welsh town that is home to the Great Orme?,34°0′37″S 18°20′34″E,https://en.wikipedia.org/wiki/Great_Orme,https://en.wikipedia.org/wiki/Llandudno,"https://en.wikipedia.org/wiki/Llandudno,_Western_Cape",,,,,,,,,Tabular reasoning,"['https://en.wikipedia.org/wiki/Great_Orme', 'https://en.wikipedia.org/wiki/Llandudno', 'https://en.wikipedia.org/wiki/Llandudno,_Western_Cape']" +365,"I'm thinking of the Weird Al Yankovic parody of American Pie, what Weird Al album did it first appear on?",Running with Scissors,https://en.wikipedia.org/wiki/American_Pie_(song),https://en.wikipedia.org/wiki/The_Saga_Begins,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/American_Pie_(song)', 'https://en.wikipedia.org/wiki/The_Saga_Begins']" +366,What year was the band leader of the group who originally performed the song sampled in Kayne West's song Power born?,1946,https://en.wikipedia.org/wiki/My_Beautiful_Dark_Twisted_Fantasy,https://en.wikipedia.org/wiki/King_Crimson,https://en.wikipedia.org/wiki/Robert_Fripp,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/My_Beautiful_Dark_Twisted_Fantasy', 'https://en.wikipedia.org/wiki/King_Crimson', 'https://en.wikipedia.org/wiki/Robert_Fripp']" +367,Frank Jevons was the 9th Master of Hatfield College. His death came how many years after the death of his predecessor in the same role?,5 years,"https://en.wikipedia.org/wiki/Hatfield_College,_Durham",https://en.wikipedia.org/wiki/Frank_Jevons,https://en.wikipedia.org/wiki/Archibald_Robertson_(bishop),,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/Hatfield_College,_Durham', 'https://en.wikipedia.org/wiki/Frank_Jevons', 'https://en.wikipedia.org/wiki/Archibald_Robertson_(bishop)']" +368,"Renowned Abstract Expressionist painter Clyfford Still graduated from college in 1933. As of August 1, 2024, what other alumnus from that university was nominated for a Pulitzer Prize?",Don Magnuson,https://en.wikipedia.org/wiki/Clyfford_Still,https://en.wikipedia.org/wiki/Spokane_University#Notable_alumni,https://en.wikipedia.org/wiki/Don_Magnuson,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Clyfford_Still', 'https://en.wikipedia.org/wiki/Spokane_University#Notable_alumni', 'https://en.wikipedia.org/wiki/Don_Magnuson']" +369,"What is the the sum of the ages of the men who were executed by firing squad in Kilmainham Gaol, on the 12th of May, during the same year as the Battle of the Somme, when the died?",80,https://en.wikipedia.org/wiki/Battle_of_the_Somme,https://en.wikipedia.org/wiki/1916,https://en.wikipedia.org/wiki/James_Connolly,https://en.wikipedia.org/wiki/Se%C3%A1n_Mac_Diarmada,,,,,,,,Numerical reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Battle_of_the_Somme', 'https://en.wikipedia.org/wiki/1916', 'https://en.wikipedia.org/wiki/James_Connolly', 'https://en.wikipedia.org/wiki/Se%C3%A1n_Mac_Diarmada']" +370,"When the maker of the third-party console title ""ActRaiser"", merged with the makers of the console title ""Chrono Trigger"", what percentage of the company did the makers of ""ActRaiser"" make up?",20% of the company.,https://en.wikipedia.org/wiki/Square_Enix,https://en.wikipedia.org/wiki/Enix,https://en.wikipedia.org/wiki/Chrono_Trigger,,,,,,,,,Numerical reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Square_Enix', 'https://en.wikipedia.org/wiki/Enix', 'https://en.wikipedia.org/wiki/Chrono_Trigger']" +371,"How many years apart were the start of the Haitian and French revolutions , how many years longer did the Haitian Revolution last than the French Revolution, and how old were their respective leaders when each revolution began?",The French Revolution began 2 years before the Haitian Revolution. The Haitian Revolution lasted 2 years longer. Louis XVI was 34 years old when the French Revolution began and 36 when the Haitian Revolution began.,https://en.wikipedia.org/wiki/French_Revolution,https://en.wikipedia.org/wiki/Haitian_Revolution,https://en.wikipedia.org/wiki/Louis_XVI#,,,,,,,,,Numerical reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/French_Revolution', 'https://en.wikipedia.org/wiki/Haitian_Revolution', 'https://en.wikipedia.org/wiki/Louis_XVI#']" +372,Are mulberries more related to raspberries or cannabis?,Cannabis,https://en.wikipedia.org/wiki/Morus_(plant),https://en.wikipedia.org/wiki/Rosales,https://en.wikipedia.org/wiki/Cannabaceae,https://en.wikipedia.org/wiki/Rosaceae,,,,,,,,Tabular reasoning,"['https://en.wikipedia.org/wiki/Morus_(plant)', 'https://en.wikipedia.org/wiki/Rosales', 'https://en.wikipedia.org/wiki/Cannabaceae', 'https://en.wikipedia.org/wiki/Rosaceae']" +373,"There is a speech-sound disorder, not caused by a structural abnormality, for which one symptom is rearranging sounds of a word. The term for the disorder was first defined in 1908 by a German neuroscientist with the first name Hugo. Why was the Nobel Peace Prize given to an American the same year Hugo died?",For his crucial role in bringing about the Dawes Plan.,https://en.wikipedia.org/wiki/Speech_disorder,https://en.wikipedia.org/wiki/Apraxia_of_speech,https://en.wikipedia.org/wiki/Hugo_Liepmann,https://en.wikipedia.org/wiki/List_of_Nobel_Peace_Prize_laureates,,,,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Speech_disorder', 'https://en.wikipedia.org/wiki/Apraxia_of_speech', 'https://en.wikipedia.org/wiki/Hugo_Liepmann', 'https://en.wikipedia.org/wiki/List_of_Nobel_Peace_Prize_laureates']" +374,"How old was Russian vodka tycoon Yuri Shefler when Serene, the yacht he commissioned, was delivered to him?",43,https://en.wikipedia.org/wiki/Serene_(yacht),https://en.wikipedia.org/wiki/Yuri_Shefler,,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/Serene_(yacht)', 'https://en.wikipedia.org/wiki/Yuri_Shefler']" +375,Who won the World Series the same year that Andy Roddick won his only Grand Slam title in 2003?,The Florida Marlins,https://en.wikipedia.org/wiki/Andy_Roddick_career_statistics#Singles:_5_finals_(1–4),https://en.wikipedia.org/wiki/2003_World_Series,,,,,,,,,,Tabular reasoning,"['https://en.wikipedia.org/wiki/Andy_Roddick_career_statistics#Singles:_5_finals_(1–4)', 'https://en.wikipedia.org/wiki/2003_World_Series']" +376,What was the name of the work written by Louis Pierre Vieillot published two years after he described the great blue turao as Musophaga cristata?,Ornithologie,https://en.wikipedia.org/wiki/Great_blue_turaco,https://en.wikipedia.org/wiki/Louis_Pierre_Vieillot,,,,,,,,,,Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Great_blue_turaco', 'https://en.wikipedia.org/wiki/Louis_Pierre_Vieillot']" +377,What military awards were received by the General originally scheduled to lead Operation Torch before Lieutenant General Dwight D. Eisenhower was given command of the operation?,"Joseph Stilwell was originally scheduled to lead Operation Torch before Lieutenant General Dwight D. Eisenhower was given command of the operation. Stillwell received a Distinguished Service Cross, two Army Distinguished Service Medals, a Legion of Merit award, and a Bronze Star during his military career.",https://en.wikipedia.org/wiki/Operation_Torch,https://en.wikipedia.org/wiki/Joseph_Stilwell,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Operation_Torch', 'https://en.wikipedia.org/wiki/Joseph_Stilwell']" +378,"From United States Former President Bill Clinton through Former President Donald Trump, which president in this time period has published the greatest number of books as of August 1, 2024? Please exclude from that count any books you find that are authored by someone else about a particular former president.",Former President Bill Clinton has published seven books through 2024.,https://en.wikipedia.org/wiki/Bill_Clinton,https://en.wikipedia.org/wiki/George_W._Bush,https://en.wikipedia.org/wiki/Barack_Obama,https://en.wikipedia.org/wiki/Donald_Trump,,,,,,,,Numerical reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Bill_Clinton', 'https://en.wikipedia.org/wiki/George_W._Bush', 'https://en.wikipedia.org/wiki/Barack_Obama', 'https://en.wikipedia.org/wiki/Donald_Trump']" +379,What US president was born in the same year that the Treaty of Resht was signed?,George Washington,https://en.wikipedia.org/wiki/Treaty_of_Resht,https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States,,,,,,,,,,Tabular reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/Treaty_of_Resht', 'https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States']" +380,Which horse won the Kentucky Derby during the same calendar year in which John Hinckley Jr. attempted to assassinate U.S. President Ronald Reagan?,Pleasant Colony,https://en.wikipedia.org/wiki/Attempted_assassination_of_Ronald_Reagan,https://en.wikipedia.org/wiki/Kentucky_Derby,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Attempted_assassination_of_Ronald_Reagan', 'https://en.wikipedia.org/wiki/Kentucky_Derby']" +381,Milton Friedman won the Nobel Prize for Economics in 1976. What was the name of the Nobel Peace Prize winning wife of the economist who won the Nobel Prize for Economics two years before Friedman did?,Alva Myrdal,https://en.wikipedia.org/wiki/Milton_Friedman,https://en.wikipedia.org/wiki/List_of_Nobel_Memorial_Prize_laureates_in_Economic_Sciences,https://en.wikipedia.org/wiki/Gunnar_Myrdal,,,,,,,,,Tabular reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/Milton_Friedman', 'https://en.wikipedia.org/wiki/List_of_Nobel_Memorial_Prize_laureates_in_Economic_Sciences', 'https://en.wikipedia.org/wiki/Gunnar_Myrdal']" +382,"When Justin Trudeau was elected as Prime Minister of Canada, who was the current Prime Minister of France?",Manuel Valls,https://en.wikipedia.org/wiki/Justin_Trudeau,https://en.wikipedia.org/wiki/List_of_prime_ministers_of_France,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Justin_Trudeau', 'https://en.wikipedia.org/wiki/List_of_prime_ministers_of_France']" +383,This town was the original location of the tallest Christmas Tree displayed at the Rockefeller Center. This was also home to the author of a famous children's book series. What is the name of this series?,Doctor Dolittle,https://en.wikipedia.org/wiki/Rockefeller_Center_Christmas_Tree,"https://en.wikipedia.org/wiki/Killingworth,_Connecticut",https://en.wikipedia.org/wiki/Hugh_Lofting,https://en.wikipedia.org/wiki/Doctor_Dolittle,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Rockefeller_Center_Christmas_Tree', 'https://en.wikipedia.org/wiki/Killingworth,_Connecticut', 'https://en.wikipedia.org/wiki/Hugh_Lofting', 'https://en.wikipedia.org/wiki/Doctor_Dolittle']" +384,"Houston, Texas had dozens of airports as of January 1, 2024. Find the three-letter IATA code of the airport in which the longest runway was exactly 7000 feet long, and rearrange those letters to match that of another airport at that time. Here are your hints: The last letter of the re-arranged code is ""J"". The new airport was located in China. With this information, what was the IATA code of this airport?",LNJ,https://en.wikipedia.org/wiki/List_of_airports_in_the_Greater_Houston_Area,https://en.wikipedia.org/wiki/Lists_of_airports#By_code,https://en.wikipedia.org/wiki/List_of_airports_by_IATA_airport_code:_L,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/List_of_airports_in_the_Greater_Houston_Area', 'https://en.wikipedia.org/wiki/Lists_of_airports#By_code', 'https://en.wikipedia.org/wiki/List_of_airports_by_IATA_airport_code:_L']" +385,How many years earlier did Wimbledon start compared to the birthdate of the winner of the 2019 tournament.,110 years,https://en.m.wikipedia.org/wiki/Wimbledon_Championships,https://en.m.wikipedia.org/wiki/List_of_Wimbledon_gentlemen%27s_singles_champions,https://en.m.wikipedia.org/wiki/Novak_Djokovic,,,,,,,,,Numerical reasoning,"['https://en.m.wikipedia.org/wiki/Wimbledon_Championships', 'https://en.m.wikipedia.org/wiki/List_of_Wimbledon_gentlemen%27s_singles_champions', 'https://en.m.wikipedia.org/wiki/Novak_Djokovic']" +386,The starship USS Enterprise has a registry number containing the founding year of what Ivy League university?,Yale University,https://en.wikipedia.org/wiki/USS_Enterprise_(NCC-1701),https://en.wikipedia.org/wiki/Ivy_League,,,,,,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/USS_Enterprise_(NCC-1701)', 'https://en.wikipedia.org/wiki/Ivy_League']" +387,What two cities hosted the Summer Olympic Games between when the television shows featuring characters Olivia Benson and Meredith Grey started airing?,"Sydney, Australia, and Athens, Greece",https://en.wikipedia.org/wiki/Olivia_Benson,https://en.wikipedia.org/wiki/Meredith_Grey,https://en.wikipedia.org/wiki/List_of_Olympic_Games_host_cities,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Olivia_Benson', 'https://en.wikipedia.org/wiki/Meredith_Grey', 'https://en.wikipedia.org/wiki/List_of_Olympic_Games_host_cities']" +388,Who had their twenty-first number one hit on the US Billboard Hot Country Songs chart the same week Carly Rae Jepsen hit #39 on the Australia ARIA Top 50 Singles chart?,Kenny Chesney,https://en.wikipedia.org/wiki/Call_Me_Maybe#Year-end_charts,https://en.wikipedia.org/wiki/List_of_Billboard_number-one_country_songs_of_2012,https://en.wikipedia.org/wiki/Reality_(Kenny_Chesney_song),,,,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Call_Me_Maybe#Year-end_charts', 'https://en.wikipedia.org/wiki/List_of_Billboard_number-one_country_songs_of_2012', 'https://en.wikipedia.org/wiki/Reality_(Kenny_Chesney_song)']" +389,What was the word that featured the least in the longest recorded quotation by the chimpanzee named after the recipient of the 2011 Sydney Peace Prize?,You,https://en.wikipedia.org/wiki/Sydney_Peace_Prize,https://en.wikipedia.org/wiki/Noam_Chomsky#In_academia,https://en.wikipedia.org/wiki/Nim_Chimpsky#Quotations,,,,,,,,,Numerical reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Sydney_Peace_Prize', 'https://en.wikipedia.org/wiki/Noam_Chomsky#In_academia', 'https://en.wikipedia.org/wiki/Nim_Chimpsky#Quotations']" +390,What song was #1 on Billboard's Hot 100 for the most days during the Cuban Missile Crisis?,Monster Mash' by Bobby 'Boris' Pickett & the Crypt-Kickers,https://en.wikipedia.org/wiki/Cuban_Missile_Crisis,https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_number_ones_of_1962,,,,,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Cuban_Missile_Crisis', 'https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_number_ones_of_1962']" +391,Which ancient archaeoastronomical site in the U.K. is also associated with the summer solstice during which time light illuminates a quartz-rich stone in the chamber.,Bryn Celli Ddu,https://en.wikipedia.org/wiki/Summer_solstice,https://en.wikipedia.org/wiki/List_of_archaeoastronomical_sites_by_country,https://en.wikipedia.org/wiki/Bryn_Celli_Ddu,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Summer_solstice', 'https://en.wikipedia.org/wiki/List_of_archaeoastronomical_sites_by_country', 'https://en.wikipedia.org/wiki/Bryn_Celli_Ddu']" +392,What is the price difference of an iPhone (8GB) from when it was first released compared to the price of the iPhone X when it was released?,$400.00,https://en.wikipedia.org/wiki/IPhone_(1st_generation),"https://en.wikipedia.org/wiki/IPhone_X#:~:text=The%20iPhone%20X%20(Roman%20numeral,released%20on%20November%203%2C%202017.",,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/IPhone_(1st_generation)', 'https://en.wikipedia.org/wiki/IPhone_X#:~:text=The%20iPhone%20X%20(Roman%20numeral,released%20on%20November%203%2C%202017.']" +393,What's the star sign of the author of A Court of Thorns and Roses?,Pisces,https://en.wikipedia.org/wiki/A_Court_of_Thorns_and_Roses,https://en.wikipedia.org/wiki/Sarah_J._Maas,https://en.wikipedia.org/wiki/Astrological_sign,,,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/A_Court_of_Thorns_and_Roses', 'https://en.wikipedia.org/wiki/Sarah_J._Maas', 'https://en.wikipedia.org/wiki/Astrological_sign']" +394,"What was the average launch mass of Apollo 11, Apollo 12, and Apollo 13 in kilograms, rounded to the nearest integer?","The average launch mass in kilograms rounded to the nearest integer of Apollo 11, Apollo 12, and Apollo 13 is 47,906 kilograms.",https://en.wikipedia.org/wiki/Apollo_11,https://en.wikipedia.org/wiki/Apollo_12,https://en.wikipedia.org/wiki/Apollo_13,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/Apollo_11', 'https://en.wikipedia.org/wiki/Apollo_12', 'https://en.wikipedia.org/wiki/Apollo_13']" +395,What was the second starring role of the actress who won an Oscar for portraying union activist Cyrstal Lee Sutton?,The Flying Nun,https://en.wikipedia.org/wiki/Crystal_Lee_Sutton,https://en.wikipedia.org/wiki/Sally_Field,,,,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/Crystal_Lee_Sutton', 'https://en.wikipedia.org/wiki/Sally_Field']" +396,"What is the scientific name of an amphibian that is listed as endangered by the Canadian government (as of 2024), and its only population in Canada occurs on an island which is the southernmost inhabited part of Canada?",Ambystoma texanum,https://en.wikipedia.org/wiki/List_of_Wildlife_Species_at_Risk_(Canada)#Amphibians,https://en.wikipedia.org/wiki/Small-mouth_salamander,"https://en.wikipedia.org/wiki/Pelee,_Ontario",,,,,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/List_of_Wildlife_Species_at_Risk_(Canada)#Amphibians', 'https://en.wikipedia.org/wiki/Small-mouth_salamander', 'https://en.wikipedia.org/wiki/Pelee,_Ontario']" +397,What were the names of the parents of the first overall pick in the 2007 NHL entry draft?,Donna and Patrick were the names of Patrick Kane's parents.,https://en.wikipedia.org/wiki/2007_NHL_entry_draft#Round_one,https://en.wikipedia.org/wiki/Patrick_Kane#Early_life,,,,,,,,,,Tabular reasoning,"['https://en.wikipedia.org/wiki/2007_NHL_entry_draft#Round_one', 'https://en.wikipedia.org/wiki/Patrick_Kane#Early_life']" +398,"Of the counties that Wisconsin Highway 34 runs through, what is the seat of the most populous county based on 2020 census data?",Wausau,https://en.wikipedia.org/wiki/Wisconsin_Highway_34,"https://en.wikipedia.org/wiki/Wood_County,_Wisconsin","https://en.wikipedia.org/wiki/Portage_County,_Wisconsin","https://en.wikipedia.org/wiki/Marathon_County,_Wisconsin",,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Wisconsin_Highway_34', 'https://en.wikipedia.org/wiki/Wood_County,_Wisconsin', 'https://en.wikipedia.org/wiki/Portage_County,_Wisconsin', 'https://en.wikipedia.org/wiki/Marathon_County,_Wisconsin']" +399,Emma Lazarus's most famous poem inspired the founding of an order of nursing nuns. What disease does this order specialize in treating?,Cancer,https://en.wikipedia.org/wiki/Emma_Lazarus,https://en.wikipedia.org/wiki/Dominican_Sisters_of_Hawthorne,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Emma_Lazarus', 'https://en.wikipedia.org/wiki/Dominican_Sisters_of_Hawthorne']" +400,How many times did the victor of the Immortal Game of 1851 lose a chess match to an American-born opponent?,2,https://en.wikipedia.org/wiki/Immortal_Game,https://en.wikipedia.org/wiki/Adolf_Anderssen,https://en.wikipedia.org/wiki/Paul_Morphy,,,,,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Immortal_Game', 'https://en.wikipedia.org/wiki/Adolf_Anderssen', 'https://en.wikipedia.org/wiki/Paul_Morphy']" +401,"How many years were between the publication of a book considered 'one of the seminal works of fiction of the 20th century', and the Japanese release of the retail version of the game ""Resident Evil: Revelations 2"" for the PS3, of which the author of the 20th century book was a great inspiration for the plot?",100,https://en.wikipedia.org/wiki/Resident_Evil:_Revelations_2#,https://en.wikipedia.org/wiki/Franz_Kafka#Stories,,,,,,,,,,Numerical reasoning | Post processing,"['https://en.wikipedia.org/wiki/Resident_Evil:_Revelations_2#', 'https://en.wikipedia.org/wiki/Franz_Kafka#Stories']" +402,The actress who played Aunt Rose in A Brooklyn State of Mind (1997) also starred in a mafia movie where she sang a popular Sicilian song. How many years later did her version of the song occur after the earliest recording?,45 years,https://en.wikipedia.org/wiki/A_Brooklyn_State_of_Mind#Cast,https://en.wikipedia.org/wiki/Morgana_King#Film_debut,https://en.wikipedia.org/wiki/C%27%C3%A8_la_luna_mezzo_mare#Notable_recordings,https://en.wikipedia.org/wiki/The_Godfather,,,,,,,,Numerical reasoning | Multiple constraints | Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/A_Brooklyn_State_of_Mind#Cast', 'https://en.wikipedia.org/wiki/Morgana_King#Film_debut', 'https://en.wikipedia.org/wiki/C%27%C3%A8_la_luna_mezzo_mare#Notable_recordings', 'https://en.wikipedia.org/wiki/The_Godfather']" +403,"Put in chronological order the Major League Baseball seasons in which Barry Bonds, Tony Gwynn, and Benny Kauff hit for a .370 batting average.","1914, 1987, 2002",https://en.wikipedia.org/wiki/Barry_Bonds,https://en.wikipedia.org/wiki/Tony_Gwynn,https://en.wikipedia.org/wiki/Benny_Kauff,,,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/Barry_Bonds', 'https://en.wikipedia.org/wiki/Tony_Gwynn', 'https://en.wikipedia.org/wiki/Benny_Kauff']" +404,What famous film maker once provided editorial assistance for a 90s documentary on Mongolian-Tuvan throat singing before directing a series of superhero movies?,Christopher Nolan,https://en.wikipedia.org/wiki/Tuvan_throat_singing,https://en.wikipedia.org/wiki/Genghis_Blues,https://en.wikipedia.org/wiki/Christopher_Nolan,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Tuvan_throat_singing', 'https://en.wikipedia.org/wiki/Genghis_Blues', 'https://en.wikipedia.org/wiki/Christopher_Nolan']" +405,"How many of the first 8 Harry Potter films based on the original 7 books were released in years when a United States presidential election took place, and what movies were they?","One of the original 8 Harry Potter films based on the original 7 books coincided with a U.S. presidential election, and the film was Harry Potter and the Prisoner of Azkaban.",https://en.wikipedia.org/wiki/Harry_Potter_(film_series),https://en.wikipedia.org/wiki/United_States_presidential_election,,,,,,,,,,Tabular reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/Harry_Potter_(film_series)', 'https://en.wikipedia.org/wiki/United_States_presidential_election']" +406,In which year did the 4th Sheriff of Yorkshire to be part of the House of Plantagenet die?,1190,https://en.wikipedia.org/wiki/Sheriff_of_Yorkshire,https://en.wikipedia.org/wiki/Ranulf_de_Glanvill,,,,,,,,,,Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Sheriff_of_Yorkshire', 'https://en.wikipedia.org/wiki/Ranulf_de_Glanvill']" +407,How many more medals did France win in the 2008 Summer Olympics than in the 2004 Summer Olympics?,10,https://en.wikipedia.org/wiki/2004_Summer_Olympics,https://en.wikipedia.org/wiki/2008_Summer_Olympics,,,,,,,,,,Numerical reasoning | Tabular reasoning,"['https://en.wikipedia.org/wiki/2004_Summer_Olympics', 'https://en.wikipedia.org/wiki/2008_Summer_Olympics']" +408,"How many colours are on the flag of the country whose capital is the southernmost by latitude out of all landlocked countries, as of 2024? What are the names of these colours?","4 - blue, white, green, black",https://en.wikipedia.org/wiki/List_of_national_capitals_by_latitude,https://en.wikipedia.org/wiki/Landlocked_country,https://en.wikipedia.org/wiki/Flag_of_Lesotho,,,,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/List_of_national_capitals_by_latitude', 'https://en.wikipedia.org/wiki/Landlocked_country', 'https://en.wikipedia.org/wiki/Flag_of_Lesotho']" +409,"This 90s rock musical depiction of La bohème, had this music, lyrics and story written by a talent who passed the day before their Off-Broadway preview performance. Aside from Broadway, what was the name of his Netflix success?","Tick, Tick... Boom!",https://en.wikipedia.org/wiki/La_boh%C3%A8me,https://en.wikipedia.org/wiki/Rent_(musical),https://en.wikipedia.org/wiki/Jonathan_Larson,"https://en.wikipedia.org/wiki/Tick,_Tick..._Boom!",,,,,,,,Multiple constraints | Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/La_boh%C3%A8me', 'https://en.wikipedia.org/wiki/Rent_(musical)', 'https://en.wikipedia.org/wiki/Jonathan_Larson', 'https://en.wikipedia.org/wiki/Tick,_Tick..._Boom!']" +410,"What German-born Author had books published in 1995, 1999, & 2005 detailing their studies and activism with what Tang's animal mascot of the time?",Birutė Galdikas,https://en.wikipedia.org/wiki/Tang_(drink_mix),https://en.wikipedia.org/wiki/Orangutan,https://en.wikipedia.org/wiki/Birut%C4%97_Galdikas,,,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/Tang_(drink_mix)', 'https://en.wikipedia.org/wiki/Orangutan', 'https://en.wikipedia.org/wiki/Birut%C4%97_Galdikas']" +411,This author won the Popular Fiction Book of the Year award in 2009 at the Irish Book Awards. What is their astrological sign?,Virgo,https://en.wikipedia.org/wiki/Irish_Book_Awards,https://en.wikipedia.org/wiki/Marian_Keyes,https://en.wikipedia.org/wiki/Astrological_sign,,,,,,,,,Tabular reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Irish_Book_Awards', 'https://en.wikipedia.org/wiki/Marian_Keyes', 'https://en.wikipedia.org/wiki/Astrological_sign']" +412,What is the name of the 2nd track on the album by Corinne Bailey Rae that came out 10 years before her album The Heart Speaks in Whispers?,Enchantment,https://en.wikipedia.org/wiki/Corinne_Bailey_Rae_discography,https://en.wikipedia.org/wiki/Corinne_Bailey_Rae_(album),,,,,,,,,,Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Corinne_Bailey_Rae_discography', 'https://en.wikipedia.org/wiki/Corinne_Bailey_Rae_(album)']" +413,Which of the following albums came out on a date closest to the date that Nancy Kerrigan was assaulted? What about which album was closest to the date of Harding's plea deal? Awake by Dream Theater Inside Out by Fates Warning Promised Land by Queensryche Dreamspace by Stratovarius,Dreamspace' by Stratovarius.,https://en.wikipedia.org/wiki/Assault_of_Nancy_Kerrigan,https://en.wikipedia.org/wiki/Awake_(Dream_Theater_album),https://en.wikipedia.org/wiki/Inside_Out_(Fates_Warning_album),https://en.wikipedia.org/wiki/Promised_Land_(Queensrÿche_album),https://en.wikipedia.org/wiki/Dreamspace,,,,,,,Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/Assault_of_Nancy_Kerrigan', 'https://en.wikipedia.org/wiki/Awake_(Dream_Theater_album)', 'https://en.wikipedia.org/wiki/Inside_Out_(Fates_Warning_album)', 'https://en.wikipedia.org/wiki/Promised_Land_(Queensrÿche_album)', 'https://en.wikipedia.org/wiki/Dreamspace']" +414,Which U.S. National Park was the first to be established after the Portland Trail Blazers won their only playoff series with LaMarcus Aldridge on the team?,Gateway Arch National Park,https://en.wikipedia.org/wiki/LaMarcus_Aldridge#2013%E2%80%9314_season,https://en.wikipedia.org/wiki/2013%E2%80%9314_Portland_Trail_Blazers_season#Playoffs,https://en.wikipedia.org/wiki/List_of_national_parks_of_the_United_States,,,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/LaMarcus_Aldridge#2013%E2%80%9314_season', 'https://en.wikipedia.org/wiki/2013%E2%80%9314_Portland_Trail_Blazers_season#Playoffs', 'https://en.wikipedia.org/wiki/List_of_national_parks_of_the_United_States']" +415,"As of August 3rd, 2024, which Moose Jaw Warrior with a retired number was born on May 29, 1967?",Mike Keane,https://en.wikipedia.org/wiki/Moose_Jaw_Warriors,https://en.wikipedia.org/wiki/Mike_Keane,https://en.wikipedia.org/wiki/Theoren_Fleury,https://en.wikipedia.org/wiki/Kelly_Buchberger,https://en.wikipedia.org/wiki/Ryan_Smyth,,,,,,,Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/Moose_Jaw_Warriors', 'https://en.wikipedia.org/wiki/Mike_Keane', 'https://en.wikipedia.org/wiki/Theoren_Fleury', 'https://en.wikipedia.org/wiki/Kelly_Buchberger', 'https://en.wikipedia.org/wiki/Ryan_Smyth']" +416,"As of July 2024, which protagonist of a 'shonen jump' series shares a name with a station on a West Japan Railway Company regional line?",Light Yagami,https://en.wikipedia.org/wiki/West_Japan_Railway_Company,https://en.wikipedia.org/wiki/Geibi_Line,https://en.wikipedia.org/wiki/Weekly_Sh%C5%8Dnen_Jump,https://en.wikipedia.org/wiki/Death_Note,,,,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/West_Japan_Railway_Company', 'https://en.wikipedia.org/wiki/Geibi_Line', 'https://en.wikipedia.org/wiki/Weekly_Sh%C5%8Dnen_Jump', 'https://en.wikipedia.org/wiki/Death_Note']" +417,"What's the fifth song on the fifth album of the pop singer who was parodied in the fifth song on ""Weird Al"" Yankovic's fifth album?",Keep Walking,https://en.wikipedia.org/wiki/%22Weird_Al%22_Yankovic,https://en.wikipedia.org/wiki/Even_Worse,https://en.wikipedia.org/wiki/Tiffany_Darwish,https://en.wikipedia.org/wiki/The_Color_of_Silence,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/%22Weird_Al%22_Yankovic', 'https://en.wikipedia.org/wiki/Even_Worse', 'https://en.wikipedia.org/wiki/Tiffany_Darwish', 'https://en.wikipedia.org/wiki/The_Color_of_Silence']" +418,I'm thinking of a painting. It was done by the same man who painted The Anti-Slavery Society Convention in the 1840's. The painting is about an election. Can you tell me the name of it?,Mock Election,https://en.wikipedia.org/wiki/World_Anti-Slavery_Convention,https://en.wikipedia.org/wiki/Benjamin_Haydon,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/World_Anti-Slavery_Convention', 'https://en.wikipedia.org/wiki/Benjamin_Haydon']" +419,In what year was the former South Korean prime minister who is from the same clan as the oldest member of the band BTS born?,1948,https://en.wikipedia.org/wiki/BTS,https://en.wikipedia.org/wiki/Jin_(singer),https://en.wikipedia.org/wiki/Gwangsan_Kim_clan,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/BTS', 'https://en.wikipedia.org/wiki/Jin_(singer)', 'https://en.wikipedia.org/wiki/Gwangsan_Kim_clan']" +420,What is the birth year of the American President who once pet the cat who was buried at the Hagia Sofia in 2020?,1961,https://en.wikipedia.org/wiki/Hagia_Sophia,https://en.wikipedia.org/wiki/Gli,https://en.wikipedia.org/wiki/Barack_Obama,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Hagia_Sophia', 'https://en.wikipedia.org/wiki/Gli', 'https://en.wikipedia.org/wiki/Barack_Obama']" +421,"There was a popular movie that came out in 2016 starring Emma Stone and Ryan Gosling, tell me where the director of this movie was born.","Providence, Rhode Island",https://en.wikipedia.org/wiki/La_La_Land,https://en.wikipedia.org/wiki/Damien_Chazelle,https://en.wikipedia.org/wiki/Ryan_Gosling,https://en.wikipedia.org/wiki/Emma_Stone,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/La_La_Land', 'https://en.wikipedia.org/wiki/Damien_Chazelle', 'https://en.wikipedia.org/wiki/Ryan_Gosling', 'https://en.wikipedia.org/wiki/Emma_Stone']" +422,By what amount was the budget of Peter Jackson's King Kong higher than the budget of John Guillermin's version of King Kong?,$183 million,https://en.wikipedia.org/wiki/King_Kong_(franchise),https://en.wikipedia.org/wiki/King_Kong_(2005_film),https://en.wikipedia.org/wiki/King_Kong_(1976_film),,,,,,,,,Numerical reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/King_Kong_(franchise)', 'https://en.wikipedia.org/wiki/King_Kong_(2005_film)', 'https://en.wikipedia.org/wiki/King_Kong_(1976_film)']" +423,"What was the age difference at their deaths (in years) between Edgar Allan Poe and his rival, Rufus Wilmot Griswold, multiplied by 100?",200,https://en.wikipedia.org/wiki/Rufus_Wilmot_Griswold,https://en.wikipedia.org/wiki/Edgar_Allan_Poe,,,,,,,,,,Numerical reasoning | Post processing,"['https://en.wikipedia.org/wiki/Rufus_Wilmot_Griswold', 'https://en.wikipedia.org/wiki/Edgar_Allan_Poe']" +424,How old was the vice president to the fifth US president when he died?,50,https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States,https://en.wikipedia.org/wiki/Daniel_D._Tompkins,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States', 'https://en.wikipedia.org/wiki/Daniel_D._Tompkins']" +425,Who was the manager for the MLB World Series winning team the year that another team broke the record for the longest consecutive winning streak in a regular season? Base your answer on the following: -- The team who broke the record did so in the 2010s,A. J. Hinch,https://en.wikipedia.org/wiki/List_of_Major_League_Baseball_longest_winning_streaks,https://en.wikipedia.org/wiki/2017_Cleveland_Indians_season,https://en.wikipedia.org/wiki/2017_World_Series,https://en.wikipedia.org/wiki/A._J._Hinch,,,,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/List_of_Major_League_Baseball_longest_winning_streaks', 'https://en.wikipedia.org/wiki/2017_Cleveland_Indians_season', 'https://en.wikipedia.org/wiki/2017_World_Series', 'https://en.wikipedia.org/wiki/A._J._Hinch']" +426,What award was won in 2003 by the Swiss architecture firm that designed Roche Tower?,The Stirling Prize.,https://en.wikipedia.org/wiki/Roche_Tower,https://en.wikipedia.org/wiki/Herzog_%26_de_Meuron,,,,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/Roche_Tower', 'https://en.wikipedia.org/wiki/Herzog_%26_de_Meuron']" +427,How many New Zealanders have won Australian Idol during seasons 1 to 8?,1,https://en.wikipedia.org/wiki/Australian_Idol,https://en.wikipedia.org/wiki/Guy_Sebastian,https://en.wikipedia.org/wiki/Casey_Donovan_(singer),https://en.wikipedia.org/wiki/Kate_DeAraugo,https://en.wikipedia.org/wiki/Damien_Leith,https://en.wikipedia.org/wiki/Natalie_Gauci,https://en.wikipedia.org/wiki/Wes_Carr,https://en.wikipedia.org/wiki/Stan_Walker,https://en.wikipedia.org/wiki/Royston_Sagigi-Baira,,,Numerical reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Australian_Idol', 'https://en.wikipedia.org/wiki/Guy_Sebastian', 'https://en.wikipedia.org/wiki/Casey_Donovan_(singer)', 'https://en.wikipedia.org/wiki/Kate_DeAraugo', 'https://en.wikipedia.org/wiki/Damien_Leith', 'https://en.wikipedia.org/wiki/Natalie_Gauci', 'https://en.wikipedia.org/wiki/Wes_Carr', 'https://en.wikipedia.org/wiki/Stan_Walker', 'https://en.wikipedia.org/wiki/Royston_Sagigi-Baira']" +428,Who was the mayor of France's 25th President's hometown when they were first elected President?,Brigitte Fouré,https://en.wikipedia.org/wiki/List_of_presidents_of_France#Presidents_2,https://en.wikipedia.org/wiki/Emmanuel_Macron,https://en.wikipedia.org/wiki/Amiens,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/List_of_presidents_of_France#Presidents_2', 'https://en.wikipedia.org/wiki/Emmanuel_Macron', 'https://en.wikipedia.org/wiki/Amiens']" +429,"Of the Jason Statham movies that came out the year Dennis Hopper died, which could he have lived to see the premiere of?",13,https://en.wikipedia.org/wiki/Dennis_Hopper,https://en.wikipedia.org/wiki/Jason_Statham,https://en.wikipedia.org/wiki/13_(2010_film),https://en.wikipedia.org/wiki/The_Expendables_(2010_film),,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/Dennis_Hopper', 'https://en.wikipedia.org/wiki/Jason_Statham', 'https://en.wikipedia.org/wiki/13_(2010_film)', 'https://en.wikipedia.org/wiki/The_Expendables_(2010_film)']" +430,Whose memoir was co-written with the author of Pill Head: The Secret Life of a Painkiller Addict and published post-humously 2 years after her death?,Edith Windsor,https://en.wikipedia.org/wiki/Edith_Windsor,https://en.wikipedia.org/wiki/Joshua_Lyon,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Edith_Windsor', 'https://en.wikipedia.org/wiki/Joshua_Lyon']" +431,"What day of the year do John of Lancaster (Duke of Bedford), Fritz Koenig (German Sculptor), Edith Windsor (LGBT Activist), and Ulf Merbold (German Physicist and Astronaut) all have in common? ",They are all born on June 20.,"https://en.wikipedia.org/wiki/John_of_Lancaster,_Duke_of_Bedford",https://en.wikipedia.org/wiki/Fritz_Koenig,https://en.wikipedia.org/wiki/Edith_Windsor,https://en.wikipedia.org/wiki/Ulf_Merbold,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/John_of_Lancaster,_Duke_of_Bedford', 'https://en.wikipedia.org/wiki/Fritz_Koenig', 'https://en.wikipedia.org/wiki/Edith_Windsor', 'https://en.wikipedia.org/wiki/Ulf_Merbold']" +432,"In 2024's version of the world, which country was the birthplace of the Emperor who reigned from the year 363 to 364 over the Empire that the Goths played a major part in collapsing?",Serbia,https://en.wikipedia.org/wiki/Goths,https://en.wikipedia.org/wiki/Western_Roman_Empire,https://en.wikipedia.org/wiki/Jovian_(emperor),,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Goths', 'https://en.wikipedia.org/wiki/Western_Roman_Empire', 'https://en.wikipedia.org/wiki/Jovian_(emperor)']" +433,"Who was Prime Minister in Australia at the same time that Norman Gunston released ""Salute to ABBA""?",Malcolm Fraser,https://en.wikipedia.org/wiki/Norman_Gunston,https://en.wikipedia.org/wiki/List_of_prime_ministers_of_Australia#,,,,,,,,,,Tabular reasoning,"['https://en.wikipedia.org/wiki/Norman_Gunston', 'https://en.wikipedia.org/wiki/List_of_prime_ministers_of_Australia#']" +434,Which one was longer and by how much? James Cameron's film Titanic (1997) or the actual sinking of the Titanic in 1912?,"James Cameron's film Titanic was longer than the actual sinking Titanic, with a running time of 195 minutes, 35 minutes longer than the actual sinking of the Titanic in 1912.",https://en.wikipedia.org/wiki/Titanic,https://en.wikipedia.org/wiki/Titanic_(1997_film),,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/Titanic', 'https://en.wikipedia.org/wiki/Titanic_(1997_film)']" +435,"I'm thinking of the screenplay, co-wrote by the same author as Lonesome Dove, that won an Oscar for Best Adapted Screenplay 20 years after Lonesome Dove won the Pulitzer Prize. What was the name of the screenplay?",Brokeback Mountain,https://en.wikipedia.org/wiki/Lonesome_Dove ,https://en.wikipedia.org/wiki/78th_Academy_Awards,,,,,,,,,,Numerical reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/Lonesome_Dove ', 'https://en.wikipedia.org/wiki/78th_Academy_Awards']" +436,What was the founding name of the company that ran the coal mining camp in the city where baseball great Willie Mays was born?,Sewanee Furnace Company,https://en.wikipedia.org/wiki/Willie_Mays,"https://en.wikipedia.org/wiki/Westfield,_Alabama","https://en.wikipedia.org/wiki/Tennessee_Coal,_Iron_and_Railroad_Company",,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Willie_Mays', 'https://en.wikipedia.org/wiki/Westfield,_Alabama', 'https://en.wikipedia.org/wiki/Tennessee_Coal,_Iron_and_Railroad_Company']" +437,"Which original Saturday Night Live cast member's daughter, tied with Hannah Waddingham for the Best Supporting Actress in a Streaming Series, Comedy award at the 1st Hollywood Critics Association TV Awards in 2021?",Laraine Newman,https://en.wikipedia.org/wiki/1st_Hollywood_Critics_Association_TV_Awards,https://en.wikipedia.org/wiki/Hannah_Einbinder,https://en.wikipedia.org/wiki/Hollywood_Creative_Alliance,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/1st_Hollywood_Critics_Association_TV_Awards', 'https://en.wikipedia.org/wiki/Hannah_Einbinder', 'https://en.wikipedia.org/wiki/Hollywood_Creative_Alliance']" +438,Who was the youngest climber to podium at the first year climbing was in the olympics?,Alberto Ginés López (18 at the time).,https://en.wikipedia.org/wiki/Sport_climbing_at_the_Summer_Olympics,https://en.wikipedia.org/wiki/Alberto_Gin%C3%A9s_L%C3%B3pez,https://en.wikipedia.org/wiki/Nathaniel_Coleman,https://en.wikipedia.org/wiki/Jakob_Schubert,https://en.wikipedia.org/wiki/Janja_Garnbret,https://en.wikipedia.org/wiki/Miho_Nonaka,https://en.wikipedia.org/wiki/Akiyo_Noguchi,,,,,Tabular reasoning,"['https://en.wikipedia.org/wiki/Sport_climbing_at_the_Summer_Olympics', 'https://en.wikipedia.org/wiki/Alberto_Gin%C3%A9s_L%C3%B3pez', 'https://en.wikipedia.org/wiki/Nathaniel_Coleman', 'https://en.wikipedia.org/wiki/Jakob_Schubert', 'https://en.wikipedia.org/wiki/Janja_Garnbret', 'https://en.wikipedia.org/wiki/Miho_Nonaka', 'https://en.wikipedia.org/wiki/Akiyo_Noguchi']" +439,"As of January 1st, 2024, how many buildings in New York City were 750ft or taller the last time Halley's Comet came close to Earth?",12,https://en.wikipedia.org/wiki/Halley%27s_Comet,https://en.wikipedia.org/wiki/List_of_tallest_buildings_in_New_York_City,,,,,,,,,,Tabular reasoning,"['https://en.wikipedia.org/wiki/Halley%27s_Comet', 'https://en.wikipedia.org/wiki/List_of_tallest_buildings_in_New_York_City']" +440,What novel by Ernest Hemingway won a Pulitzer prize for fiction that was subsequently overturned twelve years before the same author won again?,For Whom the Bell Tolls by Ernest Hemingway,https://en.wikipedia.org/wiki/Pulitzer_Prize_for_Fiction#Repeat_winners,https://en.wikipedia.org/wiki/For_Whom_the_Bell_Tolls#Pulitzer_Prize_snub,https://en.wikipedia.org/wiki/The_Old_Man_and_the_Sea#Reception_and_legacy,https://en.wikipedia.org/wiki/Ernest_Hemingway_bibliography,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Pulitzer_Prize_for_Fiction#Repeat_winners', 'https://en.wikipedia.org/wiki/For_Whom_the_Bell_Tolls#Pulitzer_Prize_snub', 'https://en.wikipedia.org/wiki/The_Old_Man_and_the_Sea#Reception_and_legacy', 'https://en.wikipedia.org/wiki/Ernest_Hemingway_bibliography']" +441,Who was the successor of the Egyptian Pharaoh that was in power when the Treasury of Atreus was completed?,Merneptah,https://en.wikipedia.org/wiki/Treasury_of_Atreus,https://en.wikipedia.org/wiki/Periodization_of_ancient_Egypt,https://en.wikipedia.org/wiki/Nineteenth_Dynasty_of_Egypt#Pharaohs_of_the_19th_Dynasty,https://en.wikipedia.org/wiki/Ramesses_II,https://en.wikipedia.org/wiki/Merneptah,,,,,,,Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Treasury_of_Atreus', 'https://en.wikipedia.org/wiki/Periodization_of_ancient_Egypt', 'https://en.wikipedia.org/wiki/Nineteenth_Dynasty_of_Egypt#Pharaohs_of_the_19th_Dynasty', 'https://en.wikipedia.org/wiki/Ramesses_II', 'https://en.wikipedia.org/wiki/Merneptah']" +442,"What is the age difference between the youngest and oldest person, in the 20th Century, to win two Nobel Prizes?",20 years.,https://en.wikipedia.org/wiki/Nobel_Prize#Multiple_laureates,https://en.wikipedia.org/wiki/Marie_Curie,https://en.wikipedia.org/wiki/Linus_Pauling,https://en.wikipedia.org/wiki/John_Bardeen,https://en.wikipedia.org/wiki/Frederick_Sanger,,,,,,,Numerical reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/Nobel_Prize#Multiple_laureates', 'https://en.wikipedia.org/wiki/Marie_Curie', 'https://en.wikipedia.org/wiki/Linus_Pauling', 'https://en.wikipedia.org/wiki/John_Bardeen', 'https://en.wikipedia.org/wiki/Frederick_Sanger']" +443,What season of The Challenge was airing when Bridgeton premiered?,"Season 36, Double Agents",https://en.wikipedia.org/wiki/The_Challenge_(TV_series),https://en.wikipedia.org/wiki/Bridgerton,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/The_Challenge_(TV_series)', 'https://en.wikipedia.org/wiki/Bridgerton']" +444,"How many of Mark Calaway's consecutive Wrestlemania wins occurred in matches that were longer than the final match result in the same year's competition, not including years where Calaway was part of the final result?",Five,https://en.wikipedia.org/wiki/The_Undertaker,https://en.wikipedia.org/wiki/The_Streak_(professional_wrestling),https://en.wikipedia.org/wiki/WrestleMania_VII#Results,https://en.wikipedia.org/wiki/WrestleMania_VIII#Results,https://en.wikipedia.org/wiki/WrestleMania_IX#Results,https://en.wikipedia.org/wiki/WrestleMania_XI#Results,https://en.wikipedia.org/wiki/WrestleMania_XII#Results,https://en.wikipedia.org/wiki/WrestleMania_13#Results,https://en.wikipedia.org/wiki/WrestleMania_XIV#Results,https://en.wikipedia.org/wiki/WrestleMania_XV#Results,"https://en.wikipedia.org/wiki/WrestleMania_X-Seven#Results, https://en.wikipedia.org/wiki/WrestleMania_X8#Results, https://en.wikipedia.org/wiki/WrestleMania_XIX#Results, https://en.wikipedia.org/wiki/WrestleMania_XX#Results, https://en.wikipedia.org/wiki/WrestleMania_21#Results, https://en.wikipedia.org/wiki/WrestleMania_22#Results, https://en.wikipedia.org/wiki/WrestleMania_23#Results, https://en.wikipedia.org/wiki/WrestleMania_XXIV#Results, https://en.wikipedia.org/wiki/WrestleMania_25#Results, https://en.wikipedia.org/wiki/WrestleMania_XXVI#Results, https://en.wikipedia.org/wiki/WrestleMania_XXVII#Results, https://en.wikipedia.org/wiki/WrestleMania_XXVIII#Results, https://en.wikipedia.org/wiki/WrestleMania_29#Results",Numerical reasoning | Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/The_Undertaker', 'https://en.wikipedia.org/wiki/The_Streak_(professional_wrestling)', 'https://en.wikipedia.org/wiki/WrestleMania_VII#Results', 'https://en.wikipedia.org/wiki/WrestleMania_VIII#Results', 'https://en.wikipedia.org/wiki/WrestleMania_IX#Results', 'https://en.wikipedia.org/wiki/WrestleMania_XI#Results', 'https://en.wikipedia.org/wiki/WrestleMania_XII#Results', 'https://en.wikipedia.org/wiki/WrestleMania_13#Results', 'https://en.wikipedia.org/wiki/WrestleMania_XIV#Results', 'https://en.wikipedia.org/wiki/WrestleMania_XV#Results', 'https://en.wikipedia.org/wiki/WrestleMania_X-Seven#Results, https://en.wikipedia.org/wiki/WrestleMania_X8#Results, https://en.wikipedia.org/wiki/WrestleMania_XIX#Results, https://en.wikipedia.org/wiki/WrestleMania_XX#Results, https://en.wikipedia.org/wiki/WrestleMania_21#Results, https://en.wikipedia.org/wiki/WrestleMania_22#Results, https://en.wikipedia.org/wiki/WrestleMania_23#Results, https://en.wikipedia.org/wiki/WrestleMania_XXIV#Results, https://en.wikipedia.org/wiki/WrestleMania_25#Results, https://en.wikipedia.org/wiki/WrestleMania_XXVI#Results, https://en.wikipedia.org/wiki/WrestleMania_XXVII#Results, https://en.wikipedia.org/wiki/WrestleMania_XXVIII#Results, https://en.wikipedia.org/wiki/WrestleMania_29#Results']" +445,What number do you get when you add up the numbers in the postcode of the hospital John Lennon was born in?,14,https://en.wikipedia.org/wiki/John_Lennon,https://en.wikipedia.org/wiki/Liverpool_Maternity_Hospital,,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/John_Lennon', 'https://en.wikipedia.org/wiki/Liverpool_Maternity_Hospital']" +446,Which set director born in 1936 won the Academy Award for Best Production Design during the 71st Academy Awards?,Jill Quertier,https://en.wikipedia.org/wiki/Academy_Award_for_Best_Production_Design,https://en.wikipedia.org/wiki/Martin_Childs,https://en.wikipedia.org/wiki/Jill_Quertier,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Academy_Award_for_Best_Production_Design', 'https://en.wikipedia.org/wiki/Martin_Childs', 'https://en.wikipedia.org/wiki/Jill_Quertier']" +447,"The star of the TV show ""The Bear"" had his breakout role on a Showtime show. The actress who played his older sister on that Showtime show released an album in 2007. The label who released her album released their first album 27 years previous. From their first album, what number did the title track reach on the Billboard Hot 100?",Number 3,https://en.wikipedia.org/wiki/The_Bear_(TV_series),https://en.wikipedia.org/wiki/Jeremy_Allen_White,https://en.wikipedia.org/wiki/Shameless_(American_TV_series),https://en.wikipedia.org/wiki/Emmy_Rossum#Awards_and_nominations,https://en.wikipedia.org/wiki/Geffen_Records#History,https://en.wikipedia.org/wiki/The_Wanderer_(Donna_Summer_album),,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/The_Bear_(TV_series)', 'https://en.wikipedia.org/wiki/Jeremy_Allen_White', 'https://en.wikipedia.org/wiki/Shameless_(American_TV_series)', 'https://en.wikipedia.org/wiki/Emmy_Rossum#Awards_and_nominations', 'https://en.wikipedia.org/wiki/Geffen_Records#History', 'https://en.wikipedia.org/wiki/The_Wanderer_(Donna_Summer_album)']" +448,"When Taylor Swift first released her studio album ""1989,"" how many studio albums had Katy Perry already released?",4,https://en.wikipedia.org/wiki/Taylor_Swift_albums_discography,https://en.wikipedia.org/wiki/Katy_Perry_discography,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Taylor_Swift_albums_discography', 'https://en.wikipedia.org/wiki/Katy_Perry_discography']" +449,Who lived longer one of the Bronte sisters or Jane Austen?,Jane Austen,https://en.wikipedia.org/wiki/Bront%C3%AB_family,https://en.wikipedia.org/wiki/Jane_Austen,,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/Bront%C3%AB_family', 'https://en.wikipedia.org/wiki/Jane_Austen']" +450,Who was the head coach of the team that won the Superbowl the year that the show Law & Order: SVU was released?,Mike Shanahan,https://en.wikipedia.org/wiki/Law_%26_Order:_Special_Victims_Unit,https://en.wikipedia.org/wiki/List_of_Super_Bowl_champions,https://en.wikipedia.org/wiki/1998_Denver_Broncos_season,,,,,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Law_%26_Order:_Special_Victims_Unit', 'https://en.wikipedia.org/wiki/List_of_Super_Bowl_champions', 'https://en.wikipedia.org/wiki/1998_Denver_Broncos_season']" +451,"How many times taller is the 7th highest mountain the world than Novake, Poljčane in Slovenia? Round your answer to one decimal place.",31.5,"https://en.wikipedia.org/wiki/Novake,_Polj%C4%8Dane",https://en.wikipedia.org/wiki/List_of_highest_mountains_on_Earth,,,,,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Novake,_Polj%C4%8Dane', 'https://en.wikipedia.org/wiki/List_of_highest_mountains_on_Earth']" +452,The Wisconsin Butter Fire led to dams being built to protect a lake that was the site of the plane crash death of which famous musician?,Otis Redding,https://en.wikipedia.org/wiki/Wisconsin_Butter_Fire,https://en.wikipedia.org/wiki/Lake_Monona,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Wisconsin_Butter_Fire', 'https://en.wikipedia.org/wiki/Lake_Monona']" +453,What is the English meaning of the name of the smaller of the two constellations which resides in the middle of the Summer Triangle?,Arrow.,https://en.wikipedia.org/wiki/Summer_Triangle,https://en.wikipedia.org/wiki/Sagitta,https://en.wikipedia.org/wiki/Vulpecula,,,,,,,,,Numerical reasoning | Tabular reasoning,"['https://en.wikipedia.org/wiki/Summer_Triangle', 'https://en.wikipedia.org/wiki/Sagitta', 'https://en.wikipedia.org/wiki/Vulpecula']" +454,"During the year that Serbia became an independent republic after separation from Montenegro, who won the Nobel Prize for Literature?",Orhan Pamuk,https://en.wikipedia.org/wiki/Socialist_Republic_of_Serbia,"https://en.wikipedia.org/wiki/2006_Nobel_Prize_in_Literature#:~:text=The%202006%20Nobel%20Prize%20in,clash%20and%20interlacing%20of%20cultures.%22",,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Socialist_Republic_of_Serbia', 'https://en.wikipedia.org/wiki/2006_Nobel_Prize_in_Literature#:~:text=The%202006%20Nobel%20Prize%20in,clash%20and%20interlacing%20of%20cultures.%22']" +455,How many years older is the the library where Galileo's middle finger was first exhibited than the first public library to be lit using electric lightbulbs?,222,https://en.wikipedia.org/wiki/Galileo%27s_middle_finger#Exhibition_history,https://en.wikipedia.org/wiki/Laurentian_Library#Architecture,https://en.wikipedia.org/wiki/Literary_and_Philosophical_Society_of_Newcastle_upon_Tyne#,,,,,,,,,Numerical reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/Galileo%27s_middle_finger#Exhibition_history', 'https://en.wikipedia.org/wiki/Laurentian_Library#Architecture', 'https://en.wikipedia.org/wiki/Literary_and_Philosophical_Society_of_Newcastle_upon_Tyne#']" +456,"If the person who played the joker in ""The Dark Knight"" (2008) had children, how old will they be on the films 50th anniversary?","Heath Ledger had one daughter named Matilda Ledger who was born on October 28, 2005. Since the film was released on the 18th of July in 2008, Matilda will be 52 on the films 50th anniversary.",https://en.wikipedia.org/wiki/Heath_Ledger,https://en.wikipedia.org/wiki/Michelle_Williams_(actress),https://en.wikipedia.org/wiki/The_Dark_Knight,,,,,,,,,Numerical reasoning | Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/Heath_Ledger', 'https://en.wikipedia.org/wiki/Michelle_Williams_(actress)', 'https://en.wikipedia.org/wiki/The_Dark_Knight']" +457,What was David Fincher's most recently released feature film when Britney Spears' first feature film was released in the U.S.?,Fight Club.,https://en.wikipedia.org/wiki/Crossroads_(2002_film)#Reception,https://en.wikipedia.org/wiki/David_Fincher_filmography,https://en.wikipedia.org/wiki/Panic_Room#Theatrical_run,,,,,,,,,Numerical reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Crossroads_(2002_film)#Reception', 'https://en.wikipedia.org/wiki/David_Fincher_filmography', 'https://en.wikipedia.org/wiki/Panic_Room#Theatrical_run']" +458,What was the last album the Grateful Dead released prior to the death of Doors vocalist Jim Morrison?,American Beauty,https://en.wikipedia.org/wiki/Jim_Morrison,https://en.wikipedia.org/wiki/Grateful_Dead_discography,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Jim_Morrison', 'https://en.wikipedia.org/wiki/Grateful_Dead_discography']" +459,David Fincher has two movies in the 90's that have numbers in the title. What is the number in the title of David Fincher's later-released movie multiplied by the ratio of the sum of all of the molars in an aardvark over the number of adult teeth in a dog?,13/3 or 4.333... (infinitely repeating decimal),https://en.wikipedia.org/wiki/David_Fincher,https://en.wikipedia.org/wiki/Aardvark,https://en.wikipedia.org/wiki/Puppy_teething,,,,,,,,,Numerical reasoning | Tabular reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/David_Fincher', 'https://en.wikipedia.org/wiki/Aardvark', 'https://en.wikipedia.org/wiki/Puppy_teething']" +460,The band Franz Ferdinand is named after Archduke Franz Ferdinand of Austria and a racehorse that the band watched win a race. How many years before the assassination of Archduke Franz Ferdinand was that race established?,The Northumberland Plate horse race was established 81 years before the assassination of Archduke Franz Ferdinand,https://en.wikipedia.org/wiki/Assassination_of_Archduke_Franz_Ferdinand,https://en.wikipedia.org/wiki/Franz_Ferdinand_(band),https://en.wikipedia.org/wiki/Northumberland_Plate,,,,,,,,,Numerical reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Assassination_of_Archduke_Franz_Ferdinand', 'https://en.wikipedia.org/wiki/Franz_Ferdinand_(band)', 'https://en.wikipedia.org/wiki/Northumberland_Plate']" +461,"Which university did the actor who has appeared in the most movies in the American pie film series (including spinoffs, as of 2020) deliver a commencement address at in 2012?","Dalhousie University, in Halifax, Nova Scotia",https://en.wikipedia.org/wiki/American_Pie_(film_series),https://en.wikipedia.org/wiki/Eugene_Levy,,,,,,,,,,Tabular reasoning,"['https://en.wikipedia.org/wiki/American_Pie_(film_series)', 'https://en.wikipedia.org/wiki/Eugene_Levy']" +462,"Since the turn of the 21st century, there has only been one year in which the Pulitzer Prize for Fiction has not been awarded. Among the finalists that year, one novel was published posthumously. The author of this posthumous publication is best known for a different novel published 15 years prior. The opening lines of this book later inspired a song by a band. The name of this band can be determined with the following calculation: find the age of the author on January 1 of the year the latter book was published, add four, and subtract the sum from the year this author was a Pulitzer Prize finalist. The name of this band was inspired by yet another book. What is the name of this book and its author?",On the Road by Jack Kerouac,https://en.wikipedia.org/wiki/Pulitzer_Prize_for_Fiction,https://en.wikipedia.org/wiki/David_Foster_Wallace,https://en.wikipedia.org/wiki/Infinite_Jest#Adaptations,https://en.wikipedia.org/wiki/The_1975,,,,,,,,Numerical reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Pulitzer_Prize_for_Fiction', 'https://en.wikipedia.org/wiki/David_Foster_Wallace', 'https://en.wikipedia.org/wiki/Infinite_Jest#Adaptations', 'https://en.wikipedia.org/wiki/The_1975']" +463,How old was Akira Toriyama when Pokemon Diamond and Pearl was released in America?,Akira Toriyama was 52.,https://en.wikipedia.org/wiki/Akira_Toriyama,https://en.wikipedia.org/wiki/Pok%C3%A9mon_Diamond_and_Pearl,,,,,,,,,,Numerical reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/Akira_Toriyama', 'https://en.wikipedia.org/wiki/Pok%C3%A9mon_Diamond_and_Pearl']" +464,"As of 2020, what was the population of the city where Dippin' Dots were founded?","27,137",https://en.wikipedia.org/wiki/Dippin%27_Dots,"https://en.wikipedia.org/wiki/Paducah,_Kentucky",,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Dippin%27_Dots', 'https://en.wikipedia.org/wiki/Paducah,_Kentucky']" +465,How old were the founders of the firm that designed the Empire State Building when construction on the building began?,Richmond Shreve was 52 years old and William Lamb was 46 years old.,https://en.wikipedia.org/wiki/Empire_State_Building,"https://en.wikipedia.org/wiki/Shreve,_Lamb_%26_Harmon",https://en.wikipedia.org/wiki/Richmond_Shreve,https://en.wikipedia.org/wiki/William_F._Lamb,,,,,,,,Numerical reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Empire_State_Building', 'https://en.wikipedia.org/wiki/Shreve,_Lamb_%26_Harmon', 'https://en.wikipedia.org/wiki/Richmond_Shreve', 'https://en.wikipedia.org/wiki/William_F._Lamb']" +466,"In 2015, Emmanuel Lubezki was cinematographer for a film that was directed by the man who also directed the 2014 film Birdman. What is that film?",The Revenant,https://en.wikipedia.org/wiki/Birdman_(film),https://en.wikipedia.org/wiki/Alejandro_González_Iñárritu,https://en.wikipedia.org/wiki/Emmanuel_Lubezki#Feature_film,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Birdman_(film)', 'https://en.wikipedia.org/wiki/Alejandro_González_Iñárritu', 'https://en.wikipedia.org/wiki/Emmanuel_Lubezki#Feature_film']" +467,A puzzle released in the 1970's gained popularity and inspired the establishment of an international speed competition. What was the average time of all winners of this competition between 2005 and 2015 rounded to the nearest 100th?,10.45,https://en.wikipedia.org/wiki/Rubik%27s_Cube,https://en.wikipedia.org/wiki/Speedcubing,,,,,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints | Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/Rubik%27s_Cube', 'https://en.wikipedia.org/wiki/Speedcubing']" +468,Who was the British Prime Minister in the year that the Glastonbury Festival was launched?,"Edward Heath, September 1970",https://en.wikipedia.org/wiki/Glastonbury_Festival,https://en.wikipedia.org/wiki/List_of_prime_ministers_of_the_United_Kingdom,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Glastonbury_Festival', 'https://en.wikipedia.org/wiki/List_of_prime_ministers_of_the_United_Kingdom']" +469,"Which South Korean Cross-country skier had the highest rank at the Olympics in the Men's 15 km, among South Korean skiers only, between 2002 and 2006?",Park Byeong-ju,https://en.wikipedia.org/wiki/South_Korea_at_the_2006_Winter_Olympics,https://en.wikipedia.org/wiki/South_Korea_at_the_2002_Winter_Olympics,,,,,,,,,,Numerical reasoning | Tabular reasoning,"['https://en.wikipedia.org/wiki/South_Korea_at_the_2006_Winter_Olympics', 'https://en.wikipedia.org/wiki/South_Korea_at_the_2002_Winter_Olympics']" +470,Which country did tennis' first Golden Slam winner represent when they achieved it?,West Germany,https://en.wikipedia.org/wiki/Grand_Slam_(tennis)#,https://en.wikipedia.org/wiki/Tennis_at_the_1988_Summer_Olympics_%E2%80%93_Women%27s_singles,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Grand_Slam_(tennis)#', 'https://en.wikipedia.org/wiki/Tennis_at_the_1988_Summer_Olympics_%E2%80%93_Women%27s_singles']" +471,Which two MLB teams played in the World Series just weeks after the O.J. Simpson murder trial had ended?,Atlanta Braves and Cleveland Indians,https://en.wikipedia.org/wiki/Murder_trial_of_O._J._Simpson,https://en.wikipedia.org/wiki/1995_World_Series,,,,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/Murder_trial_of_O._J._Simpson', 'https://en.wikipedia.org/wiki/1995_World_Series']" +472,Who was the Vice Admiral in charge of the Carrier Division the Japanese carrier Hiyō was in when she was sunk?,Vice Admiral Kakuji Kakuta,https://en.wikipedia.org/wiki/Japanese_aircraft_carrier_Hiyō,https://en.wikipedia.org/wiki/Battle_of_the_Philippine_Sea,https://en.wikipedia.org/wiki/Philippine_Sea_order_of_battle,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Japanese_aircraft_carrier_Hiyō', 'https://en.wikipedia.org/wiki/Battle_of_the_Philippine_Sea', 'https://en.wikipedia.org/wiki/Philippine_Sea_order_of_battle']" +473,"In January of 2024, how many colleges were in the district in which the William C. Davis Science building can be found?",Alamo Colleges District has 5 schools.,https://en.wikipedia.org/wiki/William_Conan_Davis,https://en.wikipedia.org/wiki/St._Philip%27s_College_(United_States),https://en.wikipedia.org/wiki/Alamo_Colleges_District,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/William_Conan_Davis', 'https://en.wikipedia.org/wiki/St._Philip%27s_College_(United_States)', 'https://en.wikipedia.org/wiki/Alamo_Colleges_District']" +474,Can you add one minute to the time of the title track from the album released by Ray Charles in the year before Phil Quartararo took over as president of its record label?,"5:08 (""Strong Love Affair"" has an individual track time of 4:08)",https://en.wikipedia.org/wiki/Phil_Quartararo#Warner_Bros._Records,https://en.wikipedia.org/wiki/Warner_Records#End_of_an_era:_Ostin_and_Waronker_depart,https://en.wikipedia.org/wiki/Ray_Charles_discography,https://en.wikipedia.org/wiki/Strong_Love_Affair#Track_listing,,,,,,,,Numerical reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Phil_Quartararo#Warner_Bros._Records', 'https://en.wikipedia.org/wiki/Warner_Records#End_of_an_era:_Ostin_and_Waronker_depart', 'https://en.wikipedia.org/wiki/Ray_Charles_discography', 'https://en.wikipedia.org/wiki/Strong_Love_Affair#Track_listing']" +475,"As of 2020, who is the longest-serving president of the university where Hannah Arendt is buried?",Leon Botstein,https://en.wikipedia.org/wiki/Hannah_Arendt,https://en.wikipedia.org/wiki/Bard_College,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Hannah_Arendt', 'https://en.wikipedia.org/wiki/Bard_College']" +476,What was state of the schools regarding integration in the hometown of Miller Williams when he began his master's degree in Zoology?,They were segregated until 1955. Miller Williams began working on his master's in 1952.,https://en.wikipedia.org/wiki/Miller_Williams,"https://en.wikipedia.org/wiki/Hoxie,_Arkansas",,,,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/Miller_Williams', 'https://en.wikipedia.org/wiki/Hoxie,_Arkansas']" +477,"Do zebra, giraffe or hippos wean the earliest?",Giraffe,https://en.wikipedia.org/wiki/Plains_zebra,https://en.wikipedia.org/wiki/Giraffe,https://en.wikipedia.org/wiki/Hippopotamus,,,,,,,,,Numerical reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/Plains_zebra', 'https://en.wikipedia.org/wiki/Giraffe', 'https://en.wikipedia.org/wiki/Hippopotamus']" +478,How many years were there between Heath Ledger's birth and the first successful use of a special human extraction technology that appeared in a Batman movie that Ledger starred in?,21 years.,https://en.wikipedia.org/wiki/Fulton_surface-to-air_recovery_system,https://en.wikipedia.org/wiki/The_Dark_Knight,https://en.wikipedia.org/wiki/Heath_Ledger,,,,,,,,,Numerical reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Fulton_surface-to-air_recovery_system', 'https://en.wikipedia.org/wiki/The_Dark_Knight', 'https://en.wikipedia.org/wiki/Heath_Ledger']" +479,How many feature films had James Cameron directed by the time Barack Obama was inaugurated as President of the United States?,Seven.,https://en.wikipedia.org/wiki/Barack_Obama,https://en.wikipedia.org/wiki/James_Cameron#Filmography,https://en.wikipedia.org/wiki/Avatar_(2009_film),,,,,,,,,Numerical reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Barack_Obama', 'https://en.wikipedia.org/wiki/James_Cameron#Filmography', 'https://en.wikipedia.org/wiki/Avatar_(2009_film)']" +480,Who was the president of the USA the year that the European Convention on Human Rights came into effect?,Dwight D Eisenhower,https://en.wikipedia.org/wiki/European_Convention_on_Human_Rights,https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States#Presidents,,,,,,,,,,Tabular reasoning,"['https://en.wikipedia.org/wiki/European_Convention_on_Human_Rights', 'https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States#Presidents']" +481,"As of 1st November 2023, Which two South African drivers took part in the Formula One the year in which a micro-nation claimed territory off the coast of Suffolk, England?",Dave Charlton and Luki Botha,https://en.wikipedia.org/wiki/List_of_micronations,https://en.wikipedia.org/wiki/1967_Formula_One_season#Teams_and_drivers,,,,,,,,,,Tabular reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/List_of_micronations', 'https://en.wikipedia.org/wiki/1967_Formula_One_season#Teams_and_drivers']" +482,How old was the first minister of the Ministry of Digital Affairs in Poland when Andrzej Halicki became minister?,"The first minister of the Ministry of Digital Affairs in Poland Anna Streżyńska was 47 years old when Andrzej Halicki became minister in 2014. Anna Streżyńska was born on May 11th, 1967.",https://en.wikipedia.org/wiki/Ministry_of_Digital_Affairs,https://en.wikipedia.org/wiki/Anna_Stre%C5%BCy%C5%84ska,https://en.wikipedia.org/wiki/Ministry_of_Administration_and_Digitization_(Poland),,,,,,,,,Numerical reasoning | Tabular reasoning,"['https://en.wikipedia.org/wiki/Ministry_of_Digital_Affairs', 'https://en.wikipedia.org/wiki/Anna_Stre%C5%BCy%C5%84ska', 'https://en.wikipedia.org/wiki/Ministry_of_Administration_and_Digitization_(Poland)']" +483,"Looking at the Best Actor and Best Actress categories for the 2023 Academy Awards, how many children did all of the winners and nominees have combined as of August 1, 2024?",13,https://en.wikipedia.org/wiki/95th_Academy_Awards#Awards,https://en.wikipedia.org/wiki/Brendan_Fraser,https://en.wikipedia.org/wiki/Austin_Butler,https://en.wikipedia.org/wiki/Colin_Farrell,https://en.wikipedia.org/wiki/Paul_Mescal,https://en.wikipedia.org/wiki/Bill_Nighy,https://en.wikipedia.org/wiki/Michelle_Yeoh,https://en.wikipedia.org/wiki/Cate_Blanchett,https://en.wikipedia.org/wiki/Ana_de_Armas,https://en.wikipedia.org/wiki/Andrea_Riseborough,https://en.wikipedia.org/wiki/Michelle_Williams_(actress),Numerical reasoning,"['https://en.wikipedia.org/wiki/95th_Academy_Awards#Awards', 'https://en.wikipedia.org/wiki/Brendan_Fraser', 'https://en.wikipedia.org/wiki/Austin_Butler', 'https://en.wikipedia.org/wiki/Colin_Farrell', 'https://en.wikipedia.org/wiki/Paul_Mescal', 'https://en.wikipedia.org/wiki/Bill_Nighy', 'https://en.wikipedia.org/wiki/Michelle_Yeoh', 'https://en.wikipedia.org/wiki/Cate_Blanchett', 'https://en.wikipedia.org/wiki/Ana_de_Armas', 'https://en.wikipedia.org/wiki/Andrea_Riseborough', 'https://en.wikipedia.org/wiki/Michelle_Williams_(actress)']" +484,Shakshouka and menemen are considered similar dishes. What two main ingredients do they have in common?,Egg and tomato.,https://en.wikipedia.org/wiki/Menemen_(food),https://en.wikipedia.org/wiki/Shakshouka,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Menemen_(food)', 'https://en.wikipedia.org/wiki/Shakshouka']" +485,What is the title of the song that had the second highest record sales recorded on the Discography of American Historical Recordings in the same year that Fred J. Rath was elected mayor of Utica?,Blue Yodel No. 1 (T for Texas),"https://en.m.wikipedia.org/wiki/List_of_mayors_of_Utica,_New_York",https://en.m.wikipedia.org/wiki/1928_in_music,,,,,,,,,,Tabular reasoning,"['https://en.m.wikipedia.org/wiki/List_of_mayors_of_Utica,_New_York', 'https://en.m.wikipedia.org/wiki/1928_in_music']" +486,"Give me the difference in time, measured in hours not days, between the first solo woman to thru-hike the Appalachian Trail and the fastest recorded solo woman to walk the Appalachian Trail before 2012.",2128 hours,https://en.wikipedia.org/wiki/Appalachian_Trail,https://en.wikipedia.org/wiki/Grandma_Gatewood,https://en.wikipedia.org/wiki/Jennifer_Pharr_Davis,,,,,,,,,Numerical reasoning | Multiple constraints | Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/Appalachian_Trail', 'https://en.wikipedia.org/wiki/Grandma_Gatewood', 'https://en.wikipedia.org/wiki/Jennifer_Pharr_Davis']" +487,"If Mr. Rogers were still alive, how old would he have been when the movie ""A Beautiful Day in the Neighborhood"", featuring Tom Hanks came out in the U.S.?","Fred McFeely Rogers would have been 91 years old when ""A Beautiful Day in the Neighborhood"" was released in the U.S.",https://en.wikipedia.org/wiki/Fred_Rogers,https://en.wikipedia.org/wiki/A_Beautiful_Day_in_the_Neighborhood,,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/Fred_Rogers', 'https://en.wikipedia.org/wiki/A_Beautiful_Day_in_the_Neighborhood']" +488,Who became the prime minister of Canada in the same year that Jonathan Tremblay's re-election bid as representative for the electoral district of Montmorency—Charlevoix—Haute-Côte-Nord ended in defeat?,Justin Trudeau,https://en.wikipedia.org/wiki/Jonathan_Tremblay,https://en.wikipedia.org/wiki/List_of_prime_ministers_of_Canada,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Jonathan_Tremblay', 'https://en.wikipedia.org/wiki/List_of_prime_ministers_of_Canada']" +489,How many days after Peter Tosh died did Bunny Wailer pass away?,"12,226 days",https://en.wikipedia.org/wiki/Peter_Tosh,https://en.wikipedia.org/wiki/Bunny_Wailer,,,,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/Peter_Tosh', 'https://en.wikipedia.org/wiki/Bunny_Wailer']" +490,How old was Lucy Lawless when season six of Xena: Warrior Princess first aired?,32,https://en.wikipedia.org/wiki/List_of_Xena:_Warrior_Princess_episodes,https://en.wikipedia.org/wiki/Lucy_Lawless,,,,,,,,,,Numerical reasoning | Tabular reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/List_of_Xena:_Warrior_Princess_episodes', 'https://en.wikipedia.org/wiki/Lucy_Lawless']" +491,"Taylor Swift's debut single is named after another famous country singer. As of August 1, 2024, when is his wife's birthday?","September 21, 1967",https://en.wikipedia.org/wiki/Taylor_Swift,https://en.wikipedia.org/wiki/Tim_McGraw,https://en.wikipedia.org/wiki/Faith_Hill,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Taylor_Swift', 'https://en.wikipedia.org/wiki/Tim_McGraw', 'https://en.wikipedia.org/wiki/Faith_Hill']" +492,What is the name of the rock climber/businessman who co-founded the environmental group that the co-founder of Netflix joined?,Yvon Chouinard,https://en.wikipedia.org/wiki/Netflix,https://en.wikipedia.org/wiki/Marc_Randolph,https://en.wikipedia.org/wiki/One_Percent_for_the_Planet,https://en.wikipedia.org/wiki/Yvon_Chouinard,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Netflix', 'https://en.wikipedia.org/wiki/Marc_Randolph', 'https://en.wikipedia.org/wiki/One_Percent_for_the_Planet', 'https://en.wikipedia.org/wiki/Yvon_Chouinard']" +493,"Meghan Markle's veil, worn at her 2018 wedding to Prince Harry, featured a flower for each Common Wealth country. What colour is the flower that was used to represent New Zealand?",Yellow,https://en.wikipedia.org/wiki/Wedding_dress_of_Meghan_Markle,https://en.wikipedia.org/wiki/Sophora_microphylla,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Wedding_dress_of_Meghan_Markle', 'https://en.wikipedia.org/wiki/Sophora_microphylla']" +494,How many years did it take after the FIFA ban was lifted for someone to wear a hijab in the World Cup?,11 years,https://en.wikipedia.org/wiki/Kit_(association_football),https://en.wikipedia.org/wiki/Nouhaila_Benzina,,,,,,,,,,Post processing,"['https://en.wikipedia.org/wiki/Kit_(association_football)', 'https://en.wikipedia.org/wiki/Nouhaila_Benzina']" +495,"What is the name of the lead role of the play that Harry C. Bradley's second wife was in, in 1906?",Lord Fancourt Babberly,https://w.wiki/ASFv,https://en.wikipedia.org/wiki/Lottie_Alter,https://en.wikipedia.org/wiki/Charley%27s_Aunt,,,,,,,,,Multiple constraints,"['https://w.wiki/ASFv', 'https://en.wikipedia.org/wiki/Lottie_Alter', 'https://en.wikipedia.org/wiki/Charley%27s_Aunt']" +496,"In the largest Northern City in California, the widow of a firearms magnate built a infamous mansion that became a tourist attraction only nine months after her death in the early 1900s. The mansion is not only a magnet for ghost hunters and horror story writers, but also architecture enthusiasts as the owner of the house spent 22 years constructing and adding on additional rooms and stairways and features to the already intricate mansion. The house at one point had 500 rooms, 10,000 windows, 47 fireplaces, and 6 kitchens. What exact date was this structure added to the U.S. National Register of Historic Places?","August 7, 1974",https://en.wikipedia.org/wiki/Northern_California#Cities,"https://en.wikipedia.org/wiki/San_Jose,_California",https://en.wikipedia.org/wiki/National_Register_of_Historic_Places,https://en.wikipedia.org/wiki/Winchester_Mystery_House,,,,,,,,Multiple constraints | Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/Northern_California#Cities', 'https://en.wikipedia.org/wiki/San_Jose,_California', 'https://en.wikipedia.org/wiki/National_Register_of_Historic_Places', 'https://en.wikipedia.org/wiki/Winchester_Mystery_House']" +497,"Fossils of the extinct sea snail, Alvania belgica, were originally found in a European country. What is the capital of that country?",City of Brussels,https://en.wikipedia.org/wiki/Alvania_belgica,https://en.wikipedia.org/wiki/City_of_Brussels,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Alvania_belgica', 'https://en.wikipedia.org/wiki/City_of_Brussels']" +498,How many more points did Michael Jordan average in his sophomore season (regular season) in the NBA than the first black NBA player averaged during his career (regular season)? Show me a math equation to justify your answer.,14.3 PPG 22.7 - 8.4 = 14.3,https://en.wikipedia.org/wiki/Michael_Jordan,https://en.wikipedia.org/wiki/Race_and_ethnicity_in_the_NBA,https://en.wikipedia.org/wiki/Earl_Lloyd,,,,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints | Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/Michael_Jordan', 'https://en.wikipedia.org/wiki/Race_and_ethnicity_in_the_NBA', 'https://en.wikipedia.org/wiki/Earl_Lloyd']" +499,"What is the title of the book, written by Stephanie Meyer, in her vampire book series that was published in the same year that the first Black president of the United States was elected?",Breaking Dawn,https://en.wikipedia.org/wiki/African-American_presidents_of_the_United_States_in_popular_culture,https://en.wikipedia.org/wiki/Barack_Obama,https://en.wikipedia.org/wiki/Stephenie_Meyer,https://en.wikipedia.org/wiki/Breaking_Dawn,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/African-American_presidents_of_the_United_States_in_popular_culture', 'https://en.wikipedia.org/wiki/Barack_Obama', 'https://en.wikipedia.org/wiki/Stephenie_Meyer', 'https://en.wikipedia.org/wiki/Breaking_Dawn']" +500,The US President who officially opened the Tennessee Centennial and International Exposition was married in what year?,1871,https://en.wikipedia.org/wiki/Tennessee_Centennial_and_International_Exposition,https://en.wikipedia.org/wiki/William_McKinley,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Tennessee_Centennial_and_International_Exposition', 'https://en.wikipedia.org/wiki/William_McKinley']" +501,How old was the U.S. President's wife when the Camp David Accords were signed?,She was 51 years old.,https://en.wikipedia.org/wiki/Camp_David_Accords,https://en.wikipedia.org/wiki/Jimmy_Carter,https://en.wikipedia.org/wiki/Rosalynn_Carter,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/Camp_David_Accords', 'https://en.wikipedia.org/wiki/Jimmy_Carter', 'https://en.wikipedia.org/wiki/Rosalynn_Carter']" +502,"Out of all of Steven Spielberg's Oscar winning movies up until 2020, which one has made the most money?",Saving Private Ryan,https://en.wikipedia.org/wiki/List_of_awards_and_nominations_received_by_Steven_Spielberg,https://en.wikipedia.org/wiki/Schindler%27s_List,https://en.wikipedia.org/wiki/Saving_Private_Ryan,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/List_of_awards_and_nominations_received_by_Steven_Spielberg', 'https://en.wikipedia.org/wiki/Schindler%27s_List', 'https://en.wikipedia.org/wiki/Saving_Private_Ryan']" +503,"The number I am thinking about is the atomic number of the heavy metal that shares a name with the tier of the awards programme founded and chaired by Prince Philip that takes , at most, the same number of months to complete as the duration of an average pregnancy. What is the sum of the digits in this number?",11,"https://en.wikipedia.org/wiki/Prince_Philip,_Duke_of_Edinburgh",https://en.wikipedia.org/wiki/The_Duke_of_Edinburgh%27s_Award,https://en.wikipedia.org/wiki/Pregnancy,https://en.wikipedia.org/wiki/Silver,,,,,,,,Numerical reasoning | Multiple constraints | Post processing,"['https://en.wikipedia.org/wiki/Prince_Philip,_Duke_of_Edinburgh', 'https://en.wikipedia.org/wiki/The_Duke_of_Edinburgh%27s_Award ', 'https://en.wikipedia.org/wiki/Pregnancy', 'https://en.wikipedia.org/wiki/Silver']" +504,"Concerning just the winners between 2019 and 2024, which Pulitzer Prize-winning author was born in Maryland?",Barbara Kingsolver,https://en.wikipedia.org/wiki/Pulitzer_Prize_for_Fiction,https://en.wikipedia.org/wiki/Richard_Powers,https://en.wikipedia.org/wiki/Jayne_Anne_Phillips,https://en.wikipedia.org/wiki/Barbara_Kingsolver,https://en.wikipedia.org/wiki/Hernan_Diaz_(writer),https://en.wikipedia.org/wiki/Joshua_Cohen_(writer),https://en.wikipedia.org/wiki/Louise_Erdrich,https://en.wikipedia.org/wiki/Colson_Whitehead,,,,Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Pulitzer_Prize_for_Fiction', 'https://en.wikipedia.org/wiki/Richard_Powers', 'https://en.wikipedia.org/wiki/Jayne_Anne_Phillips', 'https://en.wikipedia.org/wiki/Barbara_Kingsolver', 'https://en.wikipedia.org/wiki/Hernan_Diaz_(writer)', 'https://en.wikipedia.org/wiki/Joshua_Cohen_(writer)', 'https://en.wikipedia.org/wiki/Louise_Erdrich', 'https://en.wikipedia.org/wiki/Colson_Whitehead']" +505,"What rank did Hermann Goring hold in the Luftwaffe during World War II, before Robert Ritter von Greim, and how does this rank compare to the equivalent ranks in other branches of the German military?","Reichsmarschall, which was a rank above ""General de Luftwaffe."" This rank does not exist in other branches of the German military and was unique to Goring himself as Robert Ritter von Greim held the title of Generalfeldmarschall after Hermann. ",https://en.wikipedia.org/wiki/General_der_Luftwaffe,https://en.wikipedia.org/wiki/General_of_the_branch,https://en.wikipedia.org/wiki/Luftwaffe,https://en.wikipedia.org/wiki/Hermann_G%C3%B6ring,,,,,,,,Numerical reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/General_der_Luftwaffe', 'https://en.wikipedia.org/wiki/General_of_the_branch', 'https://en.wikipedia.org/wiki/Luftwaffe', 'https://en.wikipedia.org/wiki/Hermann_G%C3%B6ring']" +506,What is the full name of the district where the Memory of Mankind project is located?,Bezirk Gmunden,https://en.wikipedia.org/wiki/Memory_of_Mankind,https://en.wikipedia.org/wiki/Hallstatt,https://en.wikipedia.org/wiki/Gmunden_District,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Memory_of_Mankind', 'https://en.wikipedia.org/wiki/Hallstatt', 'https://en.wikipedia.org/wiki/Gmunden_District']" +507,How old was the journalist who reviewed the first iPad for the Wall St Journal when the first iPad came out?,Walt Mossberg was 63 when the first iPad was released.,https://en.wikipedia.org/wiki/IPad#,https://en.wikipedia.org/wiki/Walt_Mossberg,,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/IPad#', 'https://en.wikipedia.org/wiki/Walt_Mossberg']" +508,Mass Effect 2 once won the D.I.C.E. Award for Game of the Year. Who was the director for the game that won the same award twelve years earlier?,Martin Hollis,https://en.wikipedia.org/wiki/List_of_Game_of_the_Year_awards,https://en.wikipedia.org/wiki/GoldenEye_007_(1997_video_game),https://en.wikipedia.org/wiki/D.I.C.E._Award_for_Game_of_the_Year,,,,,,,,,Numerical reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/List_of_Game_of_the_Year_awards', 'https://en.wikipedia.org/wiki/GoldenEye_007_(1997_video_game)', 'https://en.wikipedia.org/wiki/D.I.C.E._Award_for_Game_of_the_Year']" +509,"As of August 3, 2024, what is the capital of the country with the largest energy complex in South Eastern Europe?","Sofia is the capital of Bulgaria, which is home to the largest energy complex in South Eastern Europe, the Maritsa Iztok Complex in Galabovo. ",https://en.wikipedia.org/wiki/Maritsa_Iztok_Complex,https://en.wikipedia.org/wiki/Bulgaria,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Maritsa_Iztok_Complex', 'https://en.wikipedia.org/wiki/Bulgaria']" +510,"As of August 2024, which cast member fired from Saturday Night Live appeared on the show Hot Ones?",Shane Gillis,https://en.wikipedia.org/wiki/Saturday_Night_Live#Controversies,https://en.wikipedia.org/wiki/List_of_Hot_Ones_episodes#Season_23_(2024),,,,,,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Saturday_Night_Live#Controversies', 'https://en.wikipedia.org/wiki/List_of_Hot_Ones_episodes#Season_23_(2024)']" +511,How many more solo studio albums did Taylor Swift release than Beyonce between the years 2003-2023 (not including re-recorded albums)?,3,https://en.wikipedia.org/wiki/Beyonc%C3%A9#,https://en.wikipedia.org/wiki/Taylor_Swift#,,,,,,,,,,Numerical reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Beyonc%C3%A9#', 'https://en.wikipedia.org/wiki/Taylor_Swift#']" +512,Andre the Giant's favorite acting role was for a 1987 film. Who was the director of that film?,Rob Reiner,https://en.wikipedia.org/wiki/Andr%C3%A9_the_Giant,https://en.wikipedia.org/wiki/The_Princess_Bride_(film),,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Andr%C3%A9_the_Giant', 'https://en.wikipedia.org/wiki/The_Princess_Bride_(film)']" +513,What lasted longer: the reign of Queen Elizabeth II or the life of Julius Caesar?,The reign of Queen Elizabeth II,https://en.wikipedia.org/wiki/Elizabeth_II,https://en.wikipedia.org/wiki/Julius_Caesar,,,,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/Elizabeth_II', 'https://en.wikipedia.org/wiki/Julius_Caesar']" +514,What is the birthday of the actor that portrayed the character David Cronenberg based on Marshall McLuhan?,"March 6, 1926",https://en.wikipedia.org/wiki/Marshall_McLuhan,https://en.wikipedia.org/wiki/Videodrome,https://en.wikipedia.org/wiki/Jack_Creley,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Marshall_McLuhan', 'https://en.wikipedia.org/wiki/Videodrome', 'https://en.wikipedia.org/wiki/Jack_Creley']" +515,"The first president of the International Olympic Committee was born on a Greek island, belonging to which island group?",Cyclades,https://en.m.wikipedia.org/wiki/President_of_the_International_Olympic_Committee,https://en.m.wikipedia.org/wiki/Demetrios_Vikelas,https://en.m.wikipedia.org/wiki/Ermoupoli,,,,,,,,,Multiple constraints,"['https://en.m.wikipedia.org/wiki/President_of_the_International_Olympic_Committee', 'https://en.m.wikipedia.org/wiki/Demetrios_Vikelas', 'https://en.m.wikipedia.org/wiki/Ermoupoli']" +516,Which film featuring a solar eclipse in its opening scene is adapted from the same source material as a David Lynch movie?,Dune: Part Two,https://en.wikipedia.org/wiki/List_of_films_featuring_eclipses,https://en.wikipedia.org/wiki/David_Lynch_filmography,https://en.wikipedia.org/wiki/Dune_(novel),,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/List_of_films_featuring_eclipses', 'https://en.wikipedia.org/wiki/David_Lynch_filmography', 'https://en.wikipedia.org/wiki/Dune_(novel)']" +517,What age was the Director of Inception (2010) when the film was released in the UK?,39,https://en.wikipedia.org/wiki/Inception,https://en.wikipedia.org/wiki/Christopher_Nolan,,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/Inception', 'https://en.wikipedia.org/wiki/Christopher_Nolan']" +518,"During the same year the Orlando Magic lost to the Los Angeles Lakers in their second NBA Finals appearance, what strain of flu spread into pandemic status throughout the world?",The Swine Flu.,https://en.wikipedia.org/wiki/Orlando_Magic,https://en.wikipedia.org/wiki/2009_swine_flu_pandemic,,,,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/Orlando_Magic', 'https://en.wikipedia.org/wiki/2009_swine_flu_pandemic']" +519,Who won the Academy Award for Best Actor the year that John Steinbeck was awarded the Nobel prize for Literature?,Gregory Peck,https://en.wikipedia.org/wiki/List_of_Nobel_laureates_in_Literature#1960,https://en.wikipedia.org/wiki/Academy_Award_for_Best_Actor#1960s,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/List_of_Nobel_laureates_in_Literature#1960', 'https://en.wikipedia.org/wiki/Academy_Award_for_Best_Actor#1960s']" +520,What was the population in 2020 of the city that is in the name of the football team that won the Super Bowl five years before Matthew McConaughey won best actor at the Academy Awards?,"302,971",https://en.wikipedia.org/wiki/List_of_awards_and_nominations_received_by_Matthew_McConaughey,https://en.wikipedia.org/wiki/Super_Bowl_XLIII,https://en.wikipedia.org/wiki/Pittsburgh,,,,,,,,,Numerical reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/List_of_awards_and_nominations_received_by_Matthew_McConaughey', 'https://en.wikipedia.org/wiki/Super_Bowl_XLIII', 'https://en.wikipedia.org/wiki/Pittsburgh']" +521,"How many England caps were won by university rugby teammates Will Carling, Chris Oti, and Andy Mullins? Round your answer to the nearest ten.",90,https://en.wikipedia.org/wiki/Will_Carling,https://en.wikipedia.org/wiki/Chris_Oti,https://en.wikipedia.org/wiki/Andy_Mullins_(rugby_union),,,,,,,,,Numerical reasoning | Tabular reasoning | Post processing,"['https://en.wikipedia.org/wiki/Will_Carling', 'https://en.wikipedia.org/wiki/Chris_Oti', 'https://en.wikipedia.org/wiki/Andy_Mullins_(rugby_union)']" +522,What two actors starred in both The Craft and Scream in the same year?,Neve Cambell and Skeet Ulrich,https://en.wikipedia.org/wiki/Scream_(1996_film),https://en.wikipedia.org/wiki/The_Craft_(film),,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Scream_(1996_film)', 'https://en.wikipedia.org/wiki/The_Craft_(film)']" +523,How many letters were in the name of the first single by the artist who played the first concert at Principality Stadium?,12,https://en.wikipedia.org/wiki/Millennium_Stadium,https://en.wikipedia.org/wiki/List_of_concerts_at_the_Millennium_Stadium,https://en.wikipedia.org/wiki/Manic_Street_Preachers,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/Millennium_Stadium', 'https://en.wikipedia.org/wiki/List_of_concerts_at_the_Millennium_Stadium', 'https://en.wikipedia.org/wiki/Manic_Street_Preachers']" +524,"How many more letters are in the first name of the eighth Director of Special Forces (United Kingdom) than the runner who won Silver in the 1985 UK Athletics Championship 10,000 meters event? Give the answer in morse code.",....-,https://en.wikipedia.org/wiki/Director_Special_Forces,https://en.wikipedia.org/wiki/1985_UK_Athletics_Championships,https://en.wikipedia.org/wiki/Morse_code,,,,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints | Post processing,"['https://en.wikipedia.org/wiki/Director_Special_Forces', 'https://en.wikipedia.org/wiki/1985_UK_Athletics_Championships', 'https://en.wikipedia.org/wiki/Morse_code']" +525,What popular ice cream dessert shares its birthplace with Fred Rogers?,The Banana Split,https://en.wikipedia.org/wiki/Fred_Rogers,"https://en.wikipedia.org/wiki/Latrobe,_Pennsylvania",,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Fred_Rogers', 'https://en.wikipedia.org/wiki/Latrobe,_Pennsylvania']" +526,"If the author of the philosophical magnum opus Being and Time were to adopt the last name of the winner of the Nobel Prize for literature in 1964 and the middle name of the person to first break the 4-minute mile as the middle name, what would the full name be?",Martin Gilbert Sartre,https://en.wikipedia.org/wiki/Being_and_Time,https://en.wikipedia.org/wiki/1964_Nobel_Prize_in_Literature,https://en.wikipedia.org/wiki/Four-minute_mile,https://en.wikipedia.org/wiki/Roger_Bannister,,,,,,,,Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Being_and_Time', 'https://en.wikipedia.org/wiki/1964_Nobel_Prize_in_Literature', 'https://en.wikipedia.org/wiki/Four-minute_mile', 'https://en.wikipedia.org/wiki/Roger_Bannister']" +527,What is the percentage increase of total deaths of Japanese during World War One versus World War Two? Use the largest number count found to calculate as these numbers are often ranges.,66409.3% increase,https://en.wikipedia.org/wiki/World_War_I_casualties,https://en.wikipedia.org/wiki/World_War_II_casualties,,,,,,,,,,Numerical reasoning | Tabular reasoning,"['https://en.wikipedia.org/wiki/World_War_I_casualties', 'https://en.wikipedia.org/wiki/World_War_II_casualties']" +528,"Baldur's Gate 3 was released in 2023 and the game and its staff have received countless awards. How old will Astarion's voice actor be on August 15, 2035?",57 years old,https://en.wikipedia.org/wiki/Baldur%27s_Gate_3#,https://en.wikipedia.org/wiki/Neil_Newbon,,,,,,,,,,Numerical reasoning | Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/Baldur%27s_Gate_3#', 'https://en.wikipedia.org/wiki/Neil_Newbon']" +529,How old would Olivia Newton-John have been at the release of Grease 2 in the United States?,33 years old.,https://en.wikipedia.org/wiki/Olivia_Newton-John,https://en.wikipedia.org/wiki/Grease_2,,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/Olivia_Newton-John', 'https://en.wikipedia.org/wiki/Grease_2']" +530,"As of 2024, what is the street address of the headquarters of the American online video sharing platform that was launched in the same year as the founding of the Vancouver Cherry Blossom festival?","901 Cherry Avenue, San Bruno, California, United States",https://en.wikipedia.org/wiki/Vancouver_Cherry_Blossom_Festival,https://en.wikipedia.org/wiki/YouTube,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Vancouver_Cherry_Blossom_Festival', 'https://en.wikipedia.org/wiki/YouTube']" +531,"Coached in the 2023/24 season by Dusty May, this college basketball player was the 2023/24 co-winner of the AAC Player of the Year Award. Who is that player?",Johnell Davis,https://en.wikipedia.org/wiki/Florida_Atlantic_Owls_men%27s_basketball,https://en.wikipedia.org/wiki/American_Athletic_Conference_Men%27s_Basketball_Player_of_the_Year,https://en.wikipedia.org/wiki/Johnell_Davis,,,,,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Florida_Atlantic_Owls_men%27s_basketball', 'https://en.wikipedia.org/wiki/American_Athletic_Conference_Men%27s_Basketball_Player_of_the_Year', 'https://en.wikipedia.org/wiki/Johnell_Davis']" +532,Who was number 3 on the 1976-77 NBA World Champions team?,Herm Gilliam,https://en.wikipedia.org/wiki/1977_NBA_Finals,https://en.wikipedia.org/wiki/1976%E2%80%9377_Portland_Trail_Blazers_season,,,,,,,,,,Multiple constraints | Post processing,"['https://en.wikipedia.org/wiki/1977_NBA_Finals', 'https://en.wikipedia.org/wiki/1976%E2%80%9377_Portland_Trail_Blazers_season']" +533,"As of August 3rd 2024, how many countries are full members of the council that oversees the sport that Henry Simonds plays?",12,https://en.wikipedia.org/wiki/Henry_Simonds,https://en.wikipedia.org/wiki/Cricket#Governance,,,,,,,,,,Numerical reasoning | Tabular reasoning,"['https://en.wikipedia.org/wiki/Henry_Simonds', 'https://en.wikipedia.org/wiki/Cricket#Governance']" +534,How many pages do the first edition harry potter series books have combined?,3407,https://en.wikipedia.org/wiki/Harry_Potter,https://en.wikipedia.org/wiki/Harry_Potter_and_the_Philosopher%27s_Stone,https://en.wikipedia.org/wiki/Harry_Potter_and_the_Chamber_of_Secrets,https://en.wikipedia.org/wiki/Harry_Potter_and_the_Prisoner_of_Azkaban,https://en.wikipedia.org/wiki/Harry_Potter_and_the_Goblet_of_Fire,https://en.wikipedia.org/wiki/Harry_Potter_and_the_Order_of_the_Phoenix,https://en.wikipedia.org/wiki/Harry_Potter_and_the_Half-Blood_Prince,https://en.wikipedia.org/wiki/Harry_Potter_and_the_Deathly_Hallows,,,,Numerical reasoning | Multiple constraints | Post processing,"['https://en.wikipedia.org/wiki/Harry_Potter', 'https://en.wikipedia.org/wiki/Harry_Potter_and_the_Philosopher%27s_Stone', 'https://en.wikipedia.org/wiki/Harry_Potter_and_the_Chamber_of_Secrets', 'https://en.wikipedia.org/wiki/Harry_Potter_and_the_Prisoner_of_Azkaban', 'https://en.wikipedia.org/wiki/Harry_Potter_and_the_Goblet_of_Fire', 'https://en.wikipedia.org/wiki/Harry_Potter_and_the_Order_of_the_Phoenix', 'https://en.wikipedia.org/wiki/Harry_Potter_and_the_Half-Blood_Prince', 'https://en.wikipedia.org/wiki/Harry_Potter_and_the_Deathly_Hallows']" +535,"I was one of Bad Religion's drummers between 1990 and 2023. I am from California, but not Los Angeles. Who am I?",Brooks Wackerman,https://en.wikipedia.org/wiki/List_of_Bad_Religion_members,https://en.wikipedia.org/wiki/Jamie_Miller_(drummer),https://en.wikipedia.org/wiki/Lucky_Lehrer,https://en.wikipedia.org/wiki/Bobby_Schayer,https://en.wikipedia.org/wiki/Brooks_Wackerman,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/List_of_Bad_Religion_members', 'https://en.wikipedia.org/wiki/Jamie_Miller_(drummer)', 'https://en.wikipedia.org/wiki/Lucky_Lehrer', 'https://en.wikipedia.org/wiki/Bobby_Schayer', 'https://en.wikipedia.org/wiki/Brooks_Wackerman']" +536,What was the pseudonym of one of the co-founders of the Eagle Awards that won Favourite Specialist Comics Publication/Trade Publication 1977 and 1978?,Burt,https://en.wikipedia.org/wiki/Eagle_Awards,https://en.wikipedia.org/wiki/Richard_Burton_(comics),,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Eagle_Awards', 'https://en.wikipedia.org/wiki/Richard_Burton_(comics)']" +537,I am thinking of a movie where Hans Zimmer won a Grammy Award for his work. He won the Grammy award the same year that he did his first musical score for film director Michael Bay. Can you please tell me the name of that movie?,Crimson Tide,https://en.wikipedia.org/wiki/Michael_Bay,https://en.wikipedia.org/wiki/Hans_Zimmer#Grammy_Awards,https://en.wikipedia.org/wiki/Bad_Boys_(1995_film),https://en.wikipedia.org/wiki/The_Rock_(film),https://en.wikipedia.org/wiki/Armageddon_(1998_film),https://en.wikipedia.org/wiki/Pearl_Harbor_(film),https://en.wikipedia.org/wiki/Bad_Boys_II,https://en.wikipedia.org/wiki/The_Island_(2005_film),https://en.wikipedia.org/wiki/Transformers_(film),https://en.wikipedia.org/wiki/Transformers:_Revenge_of_the_Fallen,"https://en.wikipedia.org/wiki/Transformers:_Dark_of_the_Moon, https://en.wikipedia.org/wiki/Pain_%26_Gain, https://en.wikipedia.org/wiki/Transformers:_Age_of_Extinction, https://en.wikipedia.org/wiki/13_Hours:_The_Secret_Soldiers_of_Benghazi, https://en.wikipedia.org/wiki/Transformers:_The_Last_Knight, https://en.wikipedia.org/wiki/6_Underground_(film), https://en.wikipedia.org/wiki/Ambulance_(2022_film)",Multiple constraints,"['https://en.wikipedia.org/wiki/Michael_Bay', 'https://en.wikipedia.org/wiki/Hans_Zimmer#Grammy_Awards', 'https://en.wikipedia.org/wiki/Bad_Boys_(1995_film)', 'https://en.wikipedia.org/wiki/The_Rock_(film)', 'https://en.wikipedia.org/wiki/Armageddon_(1998_film)', 'https://en.wikipedia.org/wiki/Pearl_Harbor_(film)', 'https://en.wikipedia.org/wiki/Bad_Boys_II', 'https://en.wikipedia.org/wiki/The_Island_(2005_film)', 'https://en.wikipedia.org/wiki/Transformers_(film)', 'https://en.wikipedia.org/wiki/Transformers:_Revenge_of_the_Fallen', 'https://en.wikipedia.org/wiki/Transformers:_Dark_of_the_Moon, https://en.wikipedia.org/wiki/Pain_%26_Gain, https://en.wikipedia.org/wiki/Transformers:_Age_of_Extinction, https://en.wikipedia.org/wiki/13_Hours:_The_Secret_Soldiers_of_Benghazi, https://en.wikipedia.org/wiki/Transformers:_The_Last_Knight, https://en.wikipedia.org/wiki/6_Underground_(film), https://en.wikipedia.org/wiki/Ambulance_(2022_film)']" +538,"As of July 4th, 2024, what is the ratio of the number of years in the Early Dynastic Period of Egypt to the number of years since the United States declared independence? Round your answer to the nearest whole number.",2,https://en.wikipedia.org/wiki/Ancient_Egypt,https://en.wikipedia.org/wiki/United_States,,,,,,,,,,Numerical reasoning | Tabular reasoning,"['https://en.wikipedia.org/wiki/Ancient_Egypt', 'https://en.wikipedia.org/wiki/United_States']" +539,Who was the British monarch when Michigan was admitted as a state in the United States of America?,King William IV.,https://en.wikipedia.org/wiki/Michigan,https://en.wikipedia.org/wiki/List_of_British_monarchs,,,,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/Michigan', 'https://en.wikipedia.org/wiki/List_of_British_monarchs']" +540,"Of the participants in the first round of the 2024 Yucatán Open (Doubles), which Mexican player had received two wild cards during their career prior to August 4, 2024?",Alan Fernando Rubio Fierros,https://en.wikipedia.org/wiki/2024_Yucat%C3%A1n_Open_%E2%80%93_Doubles,https://en.wikipedia.org/wiki/George_Goldhoff,https://en.wikipedia.org/wiki/James_Trotter_(tennis),https://en.wikipedia.org/wiki/Alan_Fernando_Rubio_Fierros,https://en.wikipedia.org/wiki/Rodrigo_Pacheco_M%C3%A9ndez,https://en.wikipedia.org/wiki/Roberto_Cid_Subervi,https://en.wikipedia.org/wiki/Nick_Hardt,https://en.wikipedia.org/wiki/Juan_Pablo_Paz_(tennis),https://en.wikipedia.org/wiki/Jack_Vance_(tennis),https://en.wikipedia.org/wiki/Stefan_Kozlov,"https://en.wikipedia.org/wiki/Murkel_Dellien, https://en.wikipedia.org/wiki/Facundo_Mena, https://en.wikipedia.org/wiki/Louis_Wessels, https://en.wikipedia.org/wiki/Gabi_Adrian_Boitan, https://en.wikipedia.org/wiki/Trey_Hilderbrand, https://en.wikipedia.org/wiki/Oliver_Crawford_(tennis), https://en.wikipedia.org/wiki/Tristan_Boyer, https://en.wikipedia.org/wiki/Thomas_Fancutt, https://en.wikipedia.org/wiki/Hunter_Reese, https://en.wikipedia.org/wiki/Alex_Hern%C3%A1ndez_(tennis),",Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/2024_Yucat%C3%A1n_Open_%E2%80%93_Doubles', 'https://en.wikipedia.org/wiki/George_Goldhoff', 'https://en.wikipedia.org/wiki/James_Trotter_(tennis)', 'https://en.wikipedia.org/wiki/Alan_Fernando_Rubio_Fierros', 'https://en.wikipedia.org/wiki/Rodrigo_Pacheco_M%C3%A9ndez', 'https://en.wikipedia.org/wiki/Roberto_Cid_Subervi', 'https://en.wikipedia.org/wiki/Nick_Hardt', 'https://en.wikipedia.org/wiki/Juan_Pablo_Paz_(tennis)', 'https://en.wikipedia.org/wiki/Jack_Vance_(tennis)', 'https://en.wikipedia.org/wiki/Stefan_Kozlov', 'https://en.wikipedia.org/wiki/Murkel_Dellien, https://en.wikipedia.org/wiki/Facundo_Mena, https://en.wikipedia.org/wiki/Louis_Wessels, https://en.wikipedia.org/wiki/Gabi_Adrian_Boitan, https://en.wikipedia.org/wiki/Trey_Hilderbrand, https://en.wikipedia.org/wiki/Oliver_Crawford_(tennis), https://en.wikipedia.org/wiki/Tristan_Boyer, https://en.wikipedia.org/wiki/Thomas_Fancutt, https://en.wikipedia.org/wiki/Hunter_Reese, https://en.wikipedia.org/wiki/Alex_Hern%C3%A1ndez_(tennis), ']" +541,Which Kiwi author died the same year as the first British Prime Minister to be elected after the end of World War 1?,Katherine Mansfield,https://en.wikipedia.org/wiki/List_of_prime_ministers_of_the_United_Kingdom,https://en.wikipedia.org/wiki/Bonar_Law,https://en.wikipedia.org/wiki/1923#Deaths,https://en.wikipedia.org/wiki/World_War_I,,,,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/List_of_prime_ministers_of_the_United_Kingdom', 'https://en.wikipedia.org/wiki/Bonar_Law', 'https://en.wikipedia.org/wiki/1923#Deaths', 'https://en.wikipedia.org/wiki/World_War_I']" +542,"What war memorial was constructed and completed at the same time as the beginning of WWII, located on an island in the Potomac River in the US capital, and honors four of the eight federal uniformed services of the US that lost their lives at sea during the war and other previous conflicts?",Navy-Merchant Marine Memorial,https://en.wikipedia.org/wiki/Navy_%E2%80%93_Merchant_Marine_Memorial,"https://en.wikipedia.org/wiki/Columbia_Island_(Washington,_D.C.)",https://en.wikipedia.org/wiki/Uniformed_services_of_the_United_States,https://en.wikipedia.org/wiki/World_War_II,,,,,,,,Numerical reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Navy_%E2%80%93_Merchant_Marine_Memorial', 'https://en.wikipedia.org/wiki/Columbia_Island_(Washington,_D.C.)', 'https://en.wikipedia.org/wiki/Uniformed_services_of_the_United_States', 'https://en.wikipedia.org/wiki/World_War_II']" +543,"As of 2024, which islands in the Indonesian archipelago are home to the UNESCO World Heritage Site for the largest species of extant lizard?",The Lesser Sundra Islands,https://en.wikipedia.org/wiki/List_of_largest_extant_lizards,https://en.wikipedia.org/wiki/Indonesia,https://en.wikipedia.org/wiki/Komodo_National_Park,,,,,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/List_of_largest_extant_lizards', 'https://en.wikipedia.org/wiki/Indonesia', 'https://en.wikipedia.org/wiki/Komodo_National_Park']" +544,"Which of these series was published earliest? Wedding Peach, Tokyo Mew Mew, Sailor Moon",Sailor Moon,https://en.wikipedia.org/wiki/Sailor_Moon,https://en.wikipedia.org/wiki/Wedding_Peach,https://en.wikipedia.org/wiki/Tokyo_Mew_Mew,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Sailor_Moon', 'https://en.wikipedia.org/wiki/Wedding_Peach', 'https://en.wikipedia.org/wiki/Tokyo_Mew_Mew']" +545,How many published literary works had 1998's Nobel Prize in Literatures' recipient have at the time they received the award? Only count publications in the author's native language.,21,https://en.wikipedia.org/wiki/List_of_Nobel_laureates_in_Literature,https://en.wikipedia.org/wiki/José_Saramago#,,,,,,,,,,Numerical reasoning | Tabular reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/List_of_Nobel_laureates_in_Literature', 'https://en.wikipedia.org/wiki/José_Saramago#']" +546,What is the name of the high school and hometown of the Olympic Gold Medalist winner who won 4-3 against Hassan Yazdani at the Japan Olympics in 2020?,Graham High School in St. Paris Ohio,https://en.wikipedia.org/wiki/2020_Summer_Olympics,https://en.wikipedia.org/wiki/Wrestling_at_the_2020_Summer_Olympics#Medalists,https://en.wikipedia.org/wiki/Wrestling_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_freestyle_86_kg,"https://en.wikipedia.org/wiki/David_Taylor_(wrestler,_born_1990)","https://en.wikipedia.org/wiki/Graham_High_School_(St._Paris,_Ohio)",,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/2020_Summer_Olympics', 'https://en.wikipedia.org/wiki/Wrestling_at_the_2020_Summer_Olympics#Medalists', 'https://en.wikipedia.org/wiki/Wrestling_at_the_2020_Summer_Olympics_%E2%80%93_Men%27s_freestyle_86_kg', 'https://en.wikipedia.org/wiki/David_Taylor_(wrestler,_born_1990)', 'https://en.wikipedia.org/wiki/Graham_High_School_(St._Paris,_Ohio)']" +547,"Sworn in for his second term, who was US President during the only year in history to feature Triple Crown winners in both horse racing AND baseball (War Admiral and St. Louis Cardinals' left fielder Joe Medwick, respectively), as of August 3, 2024?",Franklin D. Roosevelt (1937),https://en.wikipedia.org/wiki/Triple_Crown_of_Thoroughbred_Racing_(United_States)#Winners_of_the_Triple_Crown,https://en.wikipedia.org/wiki/Triple_Crown_(baseball),https://en.wikipedia.org/wiki/Franklin_D._Roosevelt,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Triple_Crown_of_Thoroughbred_Racing_(United_States)#Winners_of_the_Triple_Crown', 'https://en.wikipedia.org/wiki/Triple_Crown_(baseball)', 'https://en.wikipedia.org/wiki/Franklin_D._Roosevelt']" +548,"How many years after Anton Grylewicz's date of birth was the second SpongeBob Squarepants movie released? Round down to the nearest year (e.g. January 1999 to December 2000 = 1 year, despite being closer to 2).",130,https://en.wikipedia.org/wiki/Anton_Grylewicz,https://en.wikipedia.org/wiki/SpongeBob_SquarePants#Franchise,,,,,,,,,,Numerical reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/Anton_Grylewicz', 'https://en.wikipedia.org/wiki/SpongeBob_SquarePants#Franchise']" +549,"Out of the following man-portable launchers, which entered service with their respective military last? A) FGM-148 Javelin B) 9K38 Igla C) FIM-92 Stinger.",A) FGM-148 Javelin,https://en.wikipedia.org/wiki/FGM-148_Javelin,https://en.wikipedia.org/wiki/9K38_Igla,https://en.wikipedia.org/wiki/FIM-92_Stinger,,,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/FGM-148_Javelin', 'https://en.wikipedia.org/wiki/9K38_Igla', 'https://en.wikipedia.org/wiki/FIM-92_Stinger']" +550,"As of August 4, 2024, Rosamund Pike voices a character in a podcast about the so-called ""First Female President."" How many siblings does the character have?",10,https://en.wikipedia.org/wiki/Edith!,https://en.wikipedia.org/wiki/Edith_Wilson,,,,,,,,,,Multiple constraints | Post processing,"['https://en.wikipedia.org/wiki/Edith!', 'https://en.wikipedia.org/wiki/Edith_Wilson']" +551,"Which angle is greater: the recommended angle a glass should be tilted when pouring a pint of Guinness, or the angle the nose of a light aircraft is tilted to effect liftoff?",The recommended angle a glass should be tilted when pouring a pint of Guinness,https://en.wikipedia.org/wiki/Guinness,https://en.wikipedia.org/wiki/Takeoff,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Guinness', 'https://en.wikipedia.org/wiki/Takeoff']" +552,Which NFL team won the second Super Bowl after Donald Trump was elected to be President during the term directly after Obama?,The Philadelphia Eagles won the second Super Bowl during Trump's term in 2018.,https://en.wikipedia.org/wiki/Philadelphia_Eagles,https://en.wikipedia.org/wiki/Donald_Trump,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Philadelphia_Eagles', 'https://en.wikipedia.org/wiki/Donald_Trump']" +553,The city which has India's most famous biryani dish was from 1956 until 2014 the capital of a state which subsequently split into two new states; what is the capital of the state which does not contain the city from where the famous biryani originates?,Amaravati.,https://en.wikipedia.org/wiki/Biryani,https://en.wikipedia.org/wiki/Hyderabadi_biryani,https://en.wikipedia.org/wiki/Hyderabad,https://en.wikipedia.org/wiki/Andhra_Pradesh_(1956%E2%80%932014),https://en.wikipedia.org/wiki/Telangana,https://en.wikipedia.org/wiki/Andhra_Pradesh,https://en.wikipedia.org/wiki/Amaravati,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Biryani', 'https://en.wikipedia.org/wiki/Hyderabadi_biryani', 'https://en.wikipedia.org/wiki/Hyderabad', 'https://en.wikipedia.org/wiki/Andhra_Pradesh_(1956%E2%80%932014)', 'https://en.wikipedia.org/wiki/Telangana', 'https://en.wikipedia.org/wiki/Andhra_Pradesh', 'https://en.wikipedia.org/wiki/Amaravati']" +554,I live in a US state that was admitted to the union in January 1959. What was the approximate population of the capital of my state according to the last census of the 20th century?,"26,751",https://en.wikipedia.org/wiki/List_of_U.S._states_by_date_of_admission_to_the_Union,https://en.wikipedia.org/wiki/Alaska,"https://en.wikipedia.org/wiki/Juneau,_Alaska",,,,,,,,,Tabular reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/List_of_U.S._states_by_date_of_admission_to_the_Union', 'https://en.wikipedia.org/wiki/Alaska', 'https://en.wikipedia.org/wiki/Juneau,_Alaska']" +555,"When Metallica released its album ""Master of Puppets,"" how old were founding members James Hetfield and Lars Ulrich?",22,https://en.wikipedia.org/wiki/Master_of_Puppets,https://en.wikipedia.org/wiki/James_Hetfield,https://en.wikipedia.org/wiki/Lars_Ulrich,,,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/Master_of_Puppets', 'https://en.wikipedia.org/wiki/James_Hetfield', 'https://en.wikipedia.org/wiki/Lars_Ulrich']" +556,"As of August 3rd 2024, the only documented genus of poison dart frog to be used in blow darts by Aboriginal South Americans contains many alkaloids, the most toxic of which, when exposed, irreversibly opens what kind of channels within nerve cells?",Sodium,https://en.wikipedia.org/wiki/Poison_dart_frog,https://en.wikipedia.org/wiki/Phyllobates,https://en.wikipedia.org/wiki/Batrachotoxin,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Poison_dart_frog', 'https://en.wikipedia.org/wiki/Phyllobates', 'https://en.wikipedia.org/wiki/Batrachotoxin']" +557,What's the name of the third track on the third studio album of the KPop girl group that started in 1997 and had 5 members.,Missing You,https://en.wikipedia.org/wiki/List_of_South_Korean_idol_groups_(1990s),https://en.wikipedia.org/wiki/Baby_Vox#Studio_albums,https://en.wikipedia.org/wiki/Come_Come_Come_Baby,,,,,,,,,Tabular reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/List_of_South_Korean_idol_groups_(1990s)', 'https://en.wikipedia.org/wiki/Baby_Vox#Studio_albums', 'https://en.wikipedia.org/wiki/Come_Come_Come_Baby']" +558,Use this information: -The Dragonfly Sea is a novel by Yvonne Adhiambo Owuor. -It has a publisher. -The publisher has two co-founders. What was the original name of the university where the male co-founder of this publisher studied?,King's College,https://en.wikipedia.org/wiki/The_Dragonfly_Sea,https://en.wikipedia.org/wiki/Alfred_A._Knopf,https://en.wikipedia.org/wiki/Alfred_A._Knopf_Sr.,https://en.wikipedia.org/wiki/Columbia_University,,,,,,,,Multiple constraints | Post processing,"['https://en.wikipedia.org/wiki/The_Dragonfly_Sea', 'https://en.wikipedia.org/wiki/Alfred_A._Knopf', 'https://en.wikipedia.org/wiki/Alfred_A._Knopf_Sr.', 'https://en.wikipedia.org/wiki/Columbia_University']" +559,What was the most-sold album in the year that the Dallas Stars won the Stanley Cup in the 1990s?,Millennium - Backstreet Boys (1999),https://en.wikipedia.org/wiki/List_of_Stanley_Cup_champions,"https://en.wikipedia.org/wiki/List_of_Billboard_200_number-one_albums_of_1999#:~:text=Millennium%20became%20the%20best%2Dselling,nomination%20at%20the%20Grammy%20Awards.",,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/List_of_Stanley_Cup_champions', 'https://en.wikipedia.org/wiki/List_of_Billboard_200_number-one_albums_of_1999#:~:text=Millennium%20became%20the%20best%2Dselling,nomination%20at%20the%20Grammy%20Awards.']" +560,"Counting Crows burst onto the scene with hits like ""Mr. Jones,"" and ""Round Here."" Which of their albums came out the same year as the first Atlanta Summer Olympics?",Recovering the Satellites,https://en.wikipedia.org/wiki/Summer_Olympic_Games,https://en.wikipedia.org/wiki/Counting_Crows,https://en.wikipedia.org/wiki/Recovering_the_Satellites,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Summer_Olympic_Games', 'https://en.wikipedia.org/wiki/Counting_Crows', 'https://en.wikipedia.org/wiki/Recovering_the_Satellites']" +561,"What are the lyrics to the popular, unofficial fight song of the university that the main character of Gilmore Girls attended?","Boola boola, boola boola, boola boola, boola, boola Oh when we're through with those poor fellows They will holler boola, boo Rah, rah! Oh Yale, Eli Yale Oh Yale, Eli Yale Oh Yale, Eli Yale Oh Yale, Eli Yale",https://en.wikipedia.org/wiki/Gilmore_Girls,https://en.wikipedia.org/wiki/Yale_University,https://en.wikipedia.org/wiki/Boola_Boola,,,,,,,,,Multiple constraints | Post processing,"['https://en.wikipedia.org/wiki/Gilmore_Girls', 'https://en.wikipedia.org/wiki/Yale_University', 'https://en.wikipedia.org/wiki/Boola_Boola']" +562,A unified team competed in the 1992 Summer Olympic games. How many years old was the first leader of the largest member nation of that team at the time of the game?,61 years old.,https://en.wikipedia.org/wiki/Unified_Team_at_the_Olympics,https://en.wikipedia.org/wiki/Soviet_Union,https://en.wikipedia.org/wiki/List_of_presidents_of_Russia,,,,,,,,,Numerical reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Unified_Team_at_the_Olympics', 'https://en.wikipedia.org/wiki/Soviet_Union', 'https://en.wikipedia.org/wiki/List_of_presidents_of_Russia']" +563,"In 1994, Linus Roache starred in Priest. Who composed the music on his next film?",Edward Shearmur,https://en.wikipedia.org/wiki/Priest_(1994_film),https://en.wikipedia.org/wiki/Linus_Roache,https://en.wikipedia.org/wiki/The_Wings_of_the_Dove_(1997_film),,,,,,,,,Tabular reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/Priest_(1994_film)', 'https://en.wikipedia.org/wiki/Linus_Roache', 'https://en.wikipedia.org/wiki/The_Wings_of_the_Dove_(1997_film)']" +564,"How many years elapsed between the release of the song ""I Think I'm Go Go"" by the band Squeeze and the theatrical premier of E.T. the movie?",2,https://en.wikipedia.org/wiki/I_Think_I'm_Go_Go,https://en.wikipedia.org/wiki/E.T._the_Extra-Terrestrial,,,,,,,,,,Numerical reasoning | Temporal reasoning,"[""https://en.wikipedia.org/wiki/I_Think_I'm_Go_Go"", 'https://en.wikipedia.org/wiki/E.T._the_Extra-Terrestrial']" +565,Who was the Super Bowl MVP's wife the year the building Rodney Gordon designed was sold to Capital and City Group?,Abby McGrew,https://en.wikipedia.org/wiki/Rodney_Gordon,"https://en.wikipedia.org/wiki/Target_House,_London",https://en.wikipedia.org/wiki/Super_Bowl,https://en.wikipedia.org/wiki/Super_Bowl_XLVI,https://en.wikipedia.org/wiki/Eli_Manning,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Rodney_Gordon', 'https://en.wikipedia.org/wiki/Target_House,_London', 'https://en.wikipedia.org/wiki/Super_Bowl', 'https://en.wikipedia.org/wiki/Super_Bowl_XLVI', 'https://en.wikipedia.org/wiki/Eli_Manning']" +566,"Who designed the first 5 figurines in Wroclaw, Poland which now number in the hundreds and are a popularly looked for by tourists?",Tomasz Moczek,https://en.wikipedia.org/wiki/Wroc%C5%82aw,https://en.wikipedia.org/wiki/Wroc%C5%82aw_Dwarfs,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Wroc%C5%82aw', 'https://en.wikipedia.org/wiki/Wroc%C5%82aw_Dwarfs']" +567,What language is the main character's name in in the Disney film that came out in 1994?,Swahili,https://en.wikipedia.org/wiki/List_of_Walt_Disney_Animation_Studios_films#ep29,https://en.wikipedia.org/wiki/The_Lion_King,https://en.wikipedia.org/wiki/Simba,,,,,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/List_of_Walt_Disney_Animation_Studios_films#ep29', 'https://en.wikipedia.org/wiki/The_Lion_King', 'https://en.wikipedia.org/wiki/Simba']" +568,"How many fewer races did Sebastian Vettel complete before he retired compared to his hero, Michael Schumacher? ",Seven,https://en.wikipedia.org/wiki/Sebastian_Vettel,https://en.wikipedia.org/wiki/Michael_Schumacher,,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/Sebastian_Vettel', 'https://en.wikipedia.org/wiki/Michael_Schumacher']" +569,Who was the MVP in the season that Cam Plante played in the National Hockey League?,Wayne Gretzky,https://en.wikipedia.org/wiki/Cam_Plante,https://en.wikipedia.org/wiki/1984%E2%80%9385_NHL_season,,,,,,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Cam_Plante', 'https://en.wikipedia.org/wiki/1984%E2%80%9385_NHL_season']" +570,How old was Katie Couric when Oprah Winfrey was 8 years old?,Katie Couric was 5 years old when Oprah Winfrey was 8 years old.,https://en.wikipedia.org/wiki/Oprah_Winfrey,https://en.wikipedia.org/wiki/Katie_Couric,,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/Oprah_Winfrey', 'https://en.wikipedia.org/wiki/Katie_Couric']" +571,"In 2003, Audible entered an agreement of exclusivity with a major brand. Who founded that brand?","Steve Jobs, Steve Wozniak and Ronald Wayne",https://en.wikipedia.org/wiki/Audible_(service),https://en.wikipedia.org/wiki/Apple_Inc.,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Audible_(service)', 'https://en.wikipedia.org/wiki/Apple_Inc.']" +572,What is the difference in elevation between Mount Rainier and Condor Mountain? What is the answer in feet?,"2,689 feet.",https://en.wikipedia.org/wiki/Condor_(mountain),https://en.wikipedia.org/wiki/Mount_Rainier,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Condor_(mountain)', 'https://en.wikipedia.org/wiki/Mount_Rainier']" +573,"As of August 3, 2024, what is the main specialization of the hospital designed by Vasco Morais Palmeiro Regaleira that is located in the civil parish where the Monteiro-Mor Palace resides?",Pulmonary medicine,https://en.wikipedia.org/wiki/Monteiro-Mor_Palace,https://en.wikipedia.org/wiki/Lumiar,https://en.wikipedia.org/wiki/Hospital_Pulido_Valente,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Monteiro-Mor_Palace', 'https://en.wikipedia.org/wiki/Lumiar', 'https://en.wikipedia.org/wiki/Hospital_Pulido_Valente']" +574,"If the man that the SS Edmund Fitzgerald was named after was living at the time of the ship's sinking, how old was he? If he was already deceased, how long had he been dead? You may just use the year without regard for the date of birth.","Edmund Fitzgerald, the man for whom the ill-fated ship SS Edmund Fitzgerald was named was born in 1895, he was 80 years old in 1975 when she sank.",https://en.wikipedia.org/wiki/SS_Edmund_Fitzgerald,https://en.wikipedia.org/wiki/Edmund_Fitzgerald_(disambiguation),,,,,,,,,,Numerical reasoning | Post processing,"['https://en.wikipedia.org/wiki/SS_Edmund_Fitzgerald', 'https://en.wikipedia.org/wiki/Edmund_Fitzgerald_(disambiguation)']" +575,How old was Benjamin Franklin when Wolfgang Amadeus Mozart was born?,50 years old.,https://en.wikipedia.org/wiki/Benjamin_Franklin,https://en.wikipedia.org/wiki/Wolfgang_Amadeus_Mozart,,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/Benjamin_Franklin', 'https://en.wikipedia.org/wiki/Wolfgang_Amadeus_Mozart']" +576,"The female of the pair who have been called ""the worst guests in 'Below Deck Sailing Yacht' history"" made her TV debut five seasons before which Bachelor?",Jake Pavelka,https://en.wikipedia.org/wiki/Erica_Rose_(television_personality),https://en.wikipedia.org/wiki/The_Bachelor_(American_TV_series)_season_14,,,,,,,,,,Numerical reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/Erica_Rose_(television_personality)', 'https://en.wikipedia.org/wiki/The_Bachelor_(American_TV_series)_season_14']" +577,"As of 1 August 2024, How many more seasons did Outrageous Fortune have compared to bro'Town?",1,https://en.wikipedia.org/wiki/Bro%27Town,https://en.wikipedia.org/wiki/Outrageous_Fortune_(TV_series),,,,,,,,,,Numerical reasoning | Post processing,"['https://en.wikipedia.org/wiki/Bro%27Town', 'https://en.wikipedia.org/wiki/Outrageous_Fortune_(TV_series)']" +578,What attraction in Walt Disney World opened exactly 50 years after the theme park originally opened?,Remy's Ratatouille Adventure,https://en.wikipedia.org/wiki/Walt_Disney_World,https://en.wikipedia.org/wiki/Remy%27s_Ratatouille_Adventure,,,,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/Walt_Disney_World', 'https://en.wikipedia.org/wiki/Remy%27s_Ratatouille_Adventure']" +579,"Which of these statements is true as of August 3rd, 2024? a) 221 Eos is roughly double the diameter of 1844 Susilva. b) 1844 Susilva is roughly double the diameter of 221 Eos. c) 221 Eos's diameter is roughly 150% of the diameter of 1844 Susilva. d) 1844 Susilva's diameter is roughly 20% the diameter of 221 Eos.",d) 1844 Susilva's diameter is roughly 20% the diameter of 221 Eos.,https://en.wikipedia.org/wiki/1844_Susilva,https://en.wikipedia.org/wiki/221_Eos,,,,,,,,,,Numerical reasoning | Tabular reasoning,"['https://en.wikipedia.org/wiki/1844_Susilva', 'https://en.wikipedia.org/wiki/221_Eos']" +580,How old was the Miss Miss Venezuela 1970 winner on the day the 68th Academy Awards was held?,"Bella La Rosa was 46 years old on March 25, 1996.",https://en.wikipedia.org/wiki/Miss_Venezuela_1970,https://en.wikipedia.org/wiki/Bella_La_Rosa,https://en.wikipedia.org/wiki/68th_Academy_Awards,,,,,,,,,Numerical reasoning | Multiple constraints | Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/Miss_Venezuela_1970', 'https://en.wikipedia.org/wiki/Bella_La_Rosa', 'https://en.wikipedia.org/wiki/68th_Academy_Awards']" +581,"Among Aristotle, Alexander the Great, Socrates, and Plato, which of them taught the others? What was their order chronologically? ","Socrates taught Plato and Plato taught Aristotle. Aristotle taught Alexander the Great. So, chronologically, it was Socrates, Plato, Aristotle, and finally Alexander the Great. ",https://en.wikipedia.org/wiki/Aristotle,https://en.wikipedia.org/wiki/Socrates,https://en.wikipedia.org/wiki/Plato,https://en.wikipedia.org/wiki/Alexander_the_Great,,,,,,,,Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Aristotle', 'https://en.wikipedia.org/wiki/Socrates', 'https://en.wikipedia.org/wiki/Plato', 'https://en.wikipedia.org/wiki/Alexander_the_Great']" +582,Where was Robert Vesco living when Bank of Credit and Commerce International was formally liquidated?,Cuba,https://en.wikipedia.org/wiki/Bank_of_Credit_and_Commerce_International,https://en.wikipedia.org/wiki/Robert_Vesco,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Bank_of_Credit_and_Commerce_International', 'https://en.wikipedia.org/wiki/Robert_Vesco']" +583,Who was the Prime Minister of Canada the first time that The Toronto Maple Leafs won The Stanley Cup?,R. B. Bennett,https://en.wikipedia.org/wiki/List_of_Stanley_Cup_champions,https://en.wikipedia.org/wiki/List_of_prime_ministers_of_Canada,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/List_of_Stanley_Cup_champions', 'https://en.wikipedia.org/wiki/List_of_prime_ministers_of_Canada']" +584,Which political party held the most seats in the Leeds City Council election during the year that philosopher John Wall was born?,Conservatives held the most seats in the Leeds City Council election in 1965 when John Wall was born.,https://en.wikipedia.org/wiki/John_Wall_(philosopher),https://en.wikipedia.org/wiki/1965_Leeds_City_Council_election,,,,,,,,,,Tabular reasoning,"['https://en.wikipedia.org/wiki/John_Wall_(philosopher)', 'https://en.wikipedia.org/wiki/1965_Leeds_City_Council_election']" +585,What was the release date of the movie directed by Gordon Douglas which featured American decathlete who was a 1984 olympic torch runner and first African American to light the Olympic Cauldron?,"April 2, 1961",https://en.wikipedia.org/wiki/1984_Summer_Olympics,https://en.wikipedia.org/wiki/Rafer_Johnson,https://en.wikipedia.org/wiki/The_Sins_of_Rachel_Cade,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/1984_Summer_Olympics', 'https://en.wikipedia.org/wiki/Rafer_Johnson', 'https://en.wikipedia.org/wiki/The_Sins_of_Rachel_Cade']" +586,How long after Archduke Franz Ferdinand received Artstetten Castle did he have his first child?,Twelve Years,https://en.wikipedia.org/wiki/Artstetten_Castle,https://en.wikipedia.org/wiki/Archduke_Franz_Ferdinand_of_Austria,,,,,,,,,,Numerical reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/Artstetten_Castle', 'https://en.wikipedia.org/wiki/Archduke_Franz_Ferdinand_of_Austria']" +587,Who was the manager of the team that won the first Football League after the death of Queen Victoria?,Tom Watson was the manager of Liverpool F.C. in 1901.,https://en.wikipedia.org/wiki/Queen_Victoria,https://en.wikipedia.org/wiki/1900%E2%80%9301_Football_League#Final_league_tables,https://en.wikipedia.org/wiki/List_of_Liverpool_F.C._managers#Managers,,,,,,,,,Tabular reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Queen_Victoria', 'https://en.wikipedia.org/wiki/1900%E2%80%9301_Football_League#Final_league_tables', 'https://en.wikipedia.org/wiki/List_of_Liverpool_F.C._managers#Managers']" +588,What is the nickname for the city where Mette Solli was born? Give the answer in Norwegian.,Rosenes by,https://en.wikipedia.org/wiki/Mette_Solli,https://en.wikipedia.org/wiki/Molde,,,,,,,,,,Post processing,"['https://en.wikipedia.org/wiki/Mette_Solli', 'https://en.wikipedia.org/wiki/Molde']" +589,Tell me the singer that I am thinking about. Use this information to determine who it is: The song hit #1 on the billboard in 2015. The singer is from Canada. The artist was born before the dissolution of Czechoslovakia.,The Weeknd,https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_number_ones_of_2015,https://en.wikipedia.org/wiki/List_of_Canadian_musicians,https://en.wikipedia.org/wiki/Dissolution_of_Czechoslovakia,,,,,,,,,Tabular reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_number_ones_of_2015', 'https://en.wikipedia.org/wiki/List_of_Canadian_musicians', 'https://en.wikipedia.org/wiki/Dissolution_of_Czechoslovakia']" +590,The Nintendo Entertainment System shipped with a main processor that was a derivative of the CPU in an Apple Computer that was released after the Apple 1 and before 1980. When was the original model Apple device I am referring to discontinued?,1979,https://en.wikipedia.org/wiki/Nintendo_Entertainment_System,https://en.wikipedia.org/wiki/Ricoh_2A03,https://en.wikipedia.org/wiki/MOS_Technology_6502,https://en.wikipedia.org/wiki/List_of_Apple_products,https://en.wikipedia.org/wiki/Apple_II_(original),,,,,,,Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Nintendo_Entertainment_System', 'https://en.wikipedia.org/wiki/Ricoh_2A03', 'https://en.wikipedia.org/wiki/MOS_Technology_6502', 'https://en.wikipedia.org/wiki/List_of_Apple_products', 'https://en.wikipedia.org/wiki/Apple_II_(original)']" +591,What is the name of the home town of the top scorer for the 2018-19 Brisbane Roar Football Club?,Perth.,https://en.wikipedia.org/wiki/2018%E2%80%9319_Brisbane_Roar_FC_season,https://en.wikipedia.org/wiki/Adam_Taggart,,,,,,,,,,Tabular reasoning,"['https://en.wikipedia.org/wiki/2018%E2%80%9319_Brisbane_Roar_FC_season', 'https://en.wikipedia.org/wiki/Adam_Taggart']" +592,"As of 2020, who has experienced the coldest record temperature provided in Celsius, the Canadian territory, Yukon, or Yellowstone National Park?",The Canadian territory of Yukon experienced a colder temperature of -63C.,https://en.wikipedia.org/wiki/Yellowstone_National_Park,https://en.wikipedia.org/wiki/List_of_extreme_temperatures_in_Canada,,,,,,,,,,Tabular reasoning,"['https://en.wikipedia.org/wiki/Yellowstone_National_Park', 'https://en.wikipedia.org/wiki/List_of_extreme_temperatures_in_Canada']" +593,What is the name of the sequel to this comedy movie that shares a name with Obie Trice's fourth studio album?,The Hangover Part II,https://en.wikipedia.org/wiki/Obie_Trice,https://en.wikipedia.org/wiki/The_Hangover,https://en.wikipedia.org/wiki/The_Hangover_Part_II,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Obie_Trice', 'https://en.wikipedia.org/wiki/The_Hangover', 'https://en.wikipedia.org/wiki/The_Hangover_Part_II']" +594,What does Lose Your Way by the British rock band Love Amongst Ruin have in common with Sally Lunn Buns?,They're both made in Bath,https://en.wikipedia.org/wiki/Lose_Your_Way_(album),"https://en.wikipedia.org/wiki/Bath,_Somerset#Culture",https://en.wikipedia.org/wiki/Sally_Lunn_bun,,,,,,,,,Multiple constraints | Post processing,"['https://en.wikipedia.org/wiki/Lose_Your_Way_(album)', 'https://en.wikipedia.org/wiki/Bath,_Somerset#Culture', 'https://en.wikipedia.org/wiki/Sally_Lunn_bun']" +595,"In the same year that HMS Holland 1 was launched, a British monarch died. How long had their predecessor ruled for?","6 years, 11 months, 25 days.",https://en.wikipedia.org/wiki/HMS_Holland_1,https://en.wikipedia.org/wiki/1901,https://en.wikipedia.org/wiki/Queen_Victoria,https://en.wikipedia.org/wiki/William_IV,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/HMS_Holland_1', 'https://en.wikipedia.org/wiki/1901', 'https://en.wikipedia.org/wiki/Queen_Victoria', 'https://en.wikipedia.org/wiki/William_IV']" +596,What BTS member turned 5 years old the soonest after BoA's first album was released?,Jimin,https://en.wikipedia.org/wiki/BoA#Discography,https://en.wikipedia.org/wiki/ID;_Peace_B,https://en.wikipedia.org/wiki/BTS,https://en.wikipedia.org/wiki/Jimin,,,,,,,,Numerical reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/BoA#Discography', 'https://en.wikipedia.org/wiki/ID;_Peace_B', 'https://en.wikipedia.org/wiki/BTS', 'https://en.wikipedia.org/wiki/Jimin']" +597,"As of August 3rd 2024, what's the name of the university in the city right off exit 183 on I-94 in Michigan?",Eastern Michigan University,https://en.wikipedia.org/wiki/Interstate_94_in_Michigan#,"https://en.wikipedia.org/wiki/Ypsilanti,_Michigan",,,,,,,,,,Tabular reasoning,"['https://en.wikipedia.org/wiki/Interstate_94_in_Michigan#', 'https://en.wikipedia.org/wiki/Ypsilanti,_Michigan']" +598,"A certain singer won the Grammy Award for Song of the Year in 2008 and in the same year, sang at a prominent person's 90th Birthday Party concert in London. How many vowels are in the given first name of the person who the birthday party was for?",4,https://en.wikipedia.org/wiki/Grammy_Award_for_Song_of_the_Year#2000s,https://en.wikipedia.org/wiki/Amy_Winehouse,https://en.wikipedia.org/wiki/Nelson_Mandela,,,,,,,,,Numerical reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Grammy_Award_for_Song_of_the_Year#2000s', 'https://en.wikipedia.org/wiki/Amy_Winehouse', 'https://en.wikipedia.org/wiki/Nelson_Mandela']" +599,"Of Louis XVI, Louis XIV, and Louis XV, who was the youngest at their coronation? ",Louis XV,https://en.wikipedia.org/wiki/Louis_XIV,https://en.wikipedia.org/wiki/Louis_XV,https://en.wikipedia.org/wiki/Louis_XVI,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/Louis_XIV', 'https://en.wikipedia.org/wiki/Louis_XV', 'https://en.wikipedia.org/wiki/Louis_XVI']" +600,"Who was the building named after on South Forest Avenue, which was built around 1959-1964 and designed by the architect married to Olga Lazovic?",Grady Gammage,https://en.wikipedia.org/wiki/Olgivanna_Lloyd_Wright,https://en.wikipedia.org/wiki/Frank_Lloyd_Wright,https://en.wikipedia.org/wiki/Gammage_Memorial_Auditorium,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Olgivanna_Lloyd_Wright', 'https://en.wikipedia.org/wiki/Frank_Lloyd_Wright', 'https://en.wikipedia.org/wiki/Gammage_Memorial_Auditorium']" +601,Where did Marion Couthouy Smith publish her books and poems between 1906 and 1918? Which years did each of these magazine companies first start?,"Marion Couthouy Smith published her books and poems in Harper's Magazine, Century Magazine, Atlantic Monthly, and The New England Magazine. Harpers Magazine was first published in 1850. Century Magazine was published in 1881. The Atlantic was founded in 1857. Lastly, The New England Magazine was first published in 1884.",https://en.wikipedia.org/wiki/Marion_Couthouy_Smith,https://en.wikipedia.org/wiki/Harper%27s_Magazine,https://en.wikipedia.org/wiki/The_Century_Magazine,https://en.wikipedia.org/wiki/The_Atlantic,https://en.wikipedia.org/wiki/The_New_England_Magazine,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Marion_Couthouy_Smith', 'https://en.wikipedia.org/wiki/Harper%27s_Magazine', 'https://en.wikipedia.org/wiki/The_Century_Magazine', 'https://en.wikipedia.org/wiki/The_Atlantic', 'https://en.wikipedia.org/wiki/The_New_England_Magazine']" +602,"What famous playable video game character is killed in their story for the purpose of creating a huge feeling of emptiness? To give a little help, this character also helps her family out by selling flowers she has grown, she knows and has traveled with a known eco-terrorist leader and also a owner of a bar in the slums.",Aerith Gainsborough.,https://en.wikipedia.org/wiki/Aerith_Gainsborough,https://en.wikipedia.org/wiki/Final_Fantasy_VII,https://en.wikipedia.org/wiki/Barret_Wallace,https://en.wikipedia.org/wiki/Tifa_Lockhart,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Aerith_Gainsborough', 'https://en.wikipedia.org/wiki/Final_Fantasy_VII', 'https://en.wikipedia.org/wiki/Barret_Wallace', 'https://en.wikipedia.org/wiki/Tifa_Lockhart']" +603,"What is the name of the popular vantage point that is featured in the 1980 comedy film ""The Gods Must Be Crazy"", and which provincial nature reserve is it located in as of 2024?",God's Window in Blyde River Canyon Nature Reserve,https://en.wikipedia.org/wiki/The_Gods_Must_Be_Crazy,https://en.wikipedia.org/wiki/Blyde_River_Canyon_Nature_Reserve#God.27s_Window,https://en.wikipedia.org/wiki/Blyde_River_Canyon_Nature_Reserve,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/The_Gods_Must_Be_Crazy', 'https://en.wikipedia.org/wiki/Blyde_River_Canyon_Nature_Reserve#God.27s_Window', 'https://en.wikipedia.org/wiki/Blyde_River_Canyon_Nature_Reserve']" +604,During which year did the actor who played George Falconer in the film A Single Man receive his first Academy Award? Include the name of the film for which he won.,Colin Firth won his first Academy Award in 2011 for The King's Speech.,https://en.wikipedia.org/wiki/A_Single_Man,https://en.wikipedia.org/wiki/Colin_Firth,https://en.wikipedia.org/wiki/List_of_awards_and_nominations_received_by_Colin_Firth,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/A_Single_Man', 'https://en.wikipedia.org/wiki/Colin_Firth', 'https://en.wikipedia.org/wiki/List_of_awards_and_nominations_received_by_Colin_Firth']" +605,"How many years before the founding of Google, was George Orwell's book ""1984"" published?",49 years.,https://en.wikipedia.org/wiki/Nineteen_Eighty-Four,https://en.wikipedia.org/wiki/Google,,,,,,,,,,Numerical reasoning | Tabular reasoning,"['https://en.wikipedia.org/wiki/Nineteen_Eighty-Four', 'https://en.wikipedia.org/wiki/Google']" +606,How many more knock-outs did Joe Louis have than Muhammad Ali?,15,https://en.wikipedia.org/wiki/Muhammad_Ali,https://en.wikipedia.org/wiki/Joe_Louis,,,,,,,,,,Numerical reasoning | Tabular reasoning,"['https://en.wikipedia.org/wiki/Muhammad_Ali', 'https://en.wikipedia.org/wiki/Joe_Louis']" +607,In the same city of California that is home to Walt Disney Imagineering is a famous retail shopping mall that opened with 1.6 million square feet of retail space. What is the name of that mall?,The Glendale Galleria.,https://en.wikipedia.org/wiki/Walt_Disney_Imagineering,https://en.wikipedia.org/wiki/Glendale_Galleria,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Walt_Disney_Imagineering', 'https://en.wikipedia.org/wiki/Glendale_Galleria']" +608,"The parish church of Renče, Slovenia, is dedicated to two saints. What would be the tropical zodiac sign of someone born on the date of their annual festival?",Cancer,https://en.wikipedia.org/wiki/Ren%C4%8De,https://en.wikipedia.org/wiki/Hermagoras_of_Aquileia,https://en.wikipedia.org/wiki/Zodiac,,,,,,,,,Tabular reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Ren%C4%8De', 'https://en.wikipedia.org/wiki/Hermagoras_of_Aquileia', 'https://en.wikipedia.org/wiki/Zodiac']" +609,What is the name of the river in the city where Ikea's headquarters are?,the Oude Rijn.,https://en.wikipedia.org/wiki/IKEA,https://en.wikipedia.org/wiki/Leiden,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/IKEA', 'https://en.wikipedia.org/wiki/Leiden']" +610,How much did the film in which Jake Gyllenhaal played his second lead role gross in its initial run at the box office?,"Donnie Darko grossed $517,375 in its initial run at the box office.",https://en.wikipedia.org/wiki/Jake_Gyllenhaal,https://en.wikipedia.org/wiki/Donnie_Darko,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Jake_Gyllenhaal', 'https://en.wikipedia.org/wiki/Donnie_Darko']" +611,The actor who played Oliver Quick in Saltburn appeared in a music video for an artist who opened for Taylor Swift during the Latin American leg of the Eras Tour. What is the name of this music video?,Please Please Please,https://en.wikipedia.org/wiki/Saltburn_(film),https://en.wikipedia.org/wiki/The_Eras_Tour,https://en.wikipedia.org/wiki/Please_Please_Please_(Sabrina_Carpenter_song),,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Saltburn_(film)', 'https://en.wikipedia.org/wiki/The_Eras_Tour', 'https://en.wikipedia.org/wiki/Please_Please_Please_(Sabrina_Carpenter_song)']" +612,"In the region known as Sulawesi Selatan, which includes the Selayar Islands, what Austronesian language is predominantly spoken by the local population and how does the name ""Sulawesi Selatan"" relate to the location of the region?","The predominant Austronesian language spoken in Sulawesi Selatan, including the Selayar Islands, is Makassarese. The name ""Sulawesi Selatan"" refers to the southern part of Sulawesi.",https://en.wikipedia.org/wiki/Selatan,https://en.wikipedia.org/wiki/South_Sulawesi,https://en.wikipedia.org/wiki/Selayar_Islands,https://en.wikipedia.org/wiki/Makassarese_language,,,,,,,,Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Selatan', 'https://en.wikipedia.org/wiki/South_Sulawesi', 'https://en.wikipedia.org/wiki/Selayar_Islands', 'https://en.wikipedia.org/wiki/Makassarese_language']" +613,"How many different Prime Ministers of the United Kingdom were there during the first term of Grover Cleveland's presidency, and who were they?","There were two different Prime Ministers of the United Kingdom during Grover Cleveland's first term as president, and they were William Ewart Gladstone and Robert Gascoyne-Cecil.",https://en.wikipedia.org/wiki/List_of_prime_ministers_of_the_United_Kingdom,"https://en.wikipedia.org/wiki/Grover_Cleveland#:~:text=Stephen%20Grover%20Cleveland%20(March%2018,serve%20non%2Dconsecutive%20presidential%20terms.",,,,,,,,,,Tabular reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/List_of_prime_ministers_of_the_United_Kingdom', 'https://en.wikipedia.org/wiki/Grover_Cleveland#:~:text=Stephen%20Grover%20Cleveland%20(March%2018,serve%20non%2Dconsecutive%20presidential%20terms.']" +614,Which American high school can boast of an alumnus for whom the following is true: -Was inducted into the Pro Football Hall of Fame in 2018 -Played in 13 Pro Bowls -Played his first season of professional football for a team playing their first season in the NFL,"Kathleen Senior High School in Lakeland, Florida",https://en.wikipedia.org/wiki/List_of_Pro_Football_Hall_of_Fame_inductees,https://en.wikipedia.org/wiki/Pro_Bowl#Players_with_most_invitations,https://en.wikipedia.org/wiki/Baltimore_Ravens,https://en.wikipedia.org/wiki/Ray_Lewis,https://en.wikipedia.org/wiki/Kathleen_Senior_High_School,,,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/List_of_Pro_Football_Hall_of_Fame_inductees', 'https://en.wikipedia.org/wiki/Pro_Bowl#Players_with_most_invitations', 'https://en.wikipedia.org/wiki/Baltimore_Ravens', 'https://en.wikipedia.org/wiki/Ray_Lewis', 'https://en.wikipedia.org/wiki/Kathleen_Senior_High_School']" +615,Which 1963 Disney film starred the same actress who played a dual role in a Disney film two years earlier about twins who plot to reunite their separated parents?,Summer Magic,https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films,https://en.wikipedia.org/wiki/One_Hundred_and_One_Dalmatians,https://en.wikipedia.org/wiki/The_Absent-Minded_Professor,https://en.wikipedia.org/wiki/The_Parent_Trap_(1961_film),https://en.wikipedia.org/wiki/Nikki:_Wild_Dog_of_the_North,https://en.wikipedia.org/wiki/Greyfriars_Bobby_(film),https://en.wikipedia.org/wiki/Babes_in_Toyland_(1961_film),https://en.wikipedia.org/wiki/Son_of_Flubber,https://en.wikipedia.org/wiki/Miracle_of_the_White_Stallions,https://en.wikipedia.org/wiki/Savage_Sam_(film),"https://en.wikipedia.org/wiki/Summer_Magic_(film), https://en.wikipedia.org/wiki/The_Incredible_Journey_(film), https://en.wikipedia.org/wiki/The_Sword_in_the_Stone_(1963_film)",Numerical reasoning | Tabular reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films', 'https://en.wikipedia.org/wiki/One_Hundred_and_One_Dalmatians', 'https://en.wikipedia.org/wiki/The_Absent-Minded_Professor', 'https://en.wikipedia.org/wiki/The_Parent_Trap_(1961_film)', 'https://en.wikipedia.org/wiki/Nikki:_Wild_Dog_of_the_North', 'https://en.wikipedia.org/wiki/Greyfriars_Bobby_(film)', 'https://en.wikipedia.org/wiki/Babes_in_Toyland_(1961_film)', 'https://en.wikipedia.org/wiki/Son_of_Flubber', 'https://en.wikipedia.org/wiki/Miracle_of_the_White_Stallions', 'https://en.wikipedia.org/wiki/Savage_Sam_(film)', 'https://en.wikipedia.org/wiki/Summer_Magic_(film), https://en.wikipedia.org/wiki/The_Incredible_Journey_(film), https://en.wikipedia.org/wiki/The_Sword_in_the_Stone_(1963_film)']" +616,"As of 2024, how many times could the country where shogi was invented fit inside the country where xiangqi was invented? Round to the nearest whole number.",25,https://en.wikipedia.org/wiki/Shogi,https://en.wikipedia.org/wiki/Xiangqi,https://en.wikipedia.org/wiki/China,https://en.wikipedia.org/wiki/Japan,,,,,,,,Numerical reasoning | Post processing,"['https://en.wikipedia.org/wiki/Shogi', 'https://en.wikipedia.org/wiki/Xiangqi', 'https://en.wikipedia.org/wiki/China', 'https://en.wikipedia.org/wiki/Japan']" +617,How many player entries were in the event that the winner of the 2008 Aussie Millions also won in 2010 at the PokerStars World Championship of Online Poker?,"1,240",https://en.wikipedia.org/wiki/Crown_Australian_Poker_Championship,https://en.wikipedia.org/wiki/Alexander_Kostritsyn,https://en.wikipedia.org/wiki/World_Championship_of_Online_Poker,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Crown_Australian_Poker_Championship', 'https://en.wikipedia.org/wiki/Alexander_Kostritsyn', 'https://en.wikipedia.org/wiki/World_Championship_of_Online_Poker']" +618,How many votes did the opposition party get in the Brant riding the election before Justin Trudeau was elected Prime Minister?,"16,351",https://en.wikipedia.org/wiki/Justin_Trudeau,https://en.wikipedia.org/wiki/List_of_Canadian_federal_general_elections,https://en.wikipedia.org/wiki/Results_of_the_2011_Canadian_federal_election,,,,,,,,,Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/Justin_Trudeau', 'https://en.wikipedia.org/wiki/List_of_Canadian_federal_general_elections', 'https://en.wikipedia.org/wiki/Results_of_the_2011_Canadian_federal_election']" +619,Who was the screenwriter of the first collaboration film between Sunrise and the studio who animates Full Metal Alchemist?,Keiko Nobumoto,https://en.wikipedia.org/wiki/Fullmetal_Alchemist,https://en.wikipedia.org/wiki/Bones_(studio),https://en.wikipedia.org/wiki/Cowboy_Bebop:_Knockin%27_on_Heaven%27s_Door,,,,,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Fullmetal_Alchemist', 'https://en.wikipedia.org/wiki/Bones_(studio)', 'https://en.wikipedia.org/wiki/Cowboy_Bebop:_Knockin%27_on_Heaven%27s_Door']" +620,"If Anastasia Romanov had still been alive when the 90s cartoon movie based on her was released, how old would she be?",Duchess Anastasia Romanov would have been 96 years old.,https://en.m.wikipedia.org/wiki/Grand_Duchess_Anastasia_Nikolaevna_of_Russia,https://en.m.wikipedia.org/wiki/Anastasia_(1997_film),,,,,,,,,,Numerical reasoning,"['https://en.m.wikipedia.org/wiki/Grand_Duchess_Anastasia_Nikolaevna_of_Russia', 'https://en.m.wikipedia.org/wiki/Anastasia_(1997_film)']" +621,Only one of the founding members of the superhero team 'The Defenders' was not a doctor. How many letters are in his name?,Five (the name is Namor'),https://en.wikipedia.org/wiki/List_of_Defenders_members,https://en.wikipedia.org/wiki/Doctor_Strange,https://en.wikipedia.org/wiki/Hulk,https://en.wikipedia.org/wiki/Namor,,,,,,,,Multiple constraints | Post processing,"['https://en.wikipedia.org/wiki/List_of_Defenders_members', 'https://en.wikipedia.org/wiki/Doctor_Strange', 'https://en.wikipedia.org/wiki/Hulk', 'https://en.wikipedia.org/wiki/Namor']" +622,"If Alice turned 36 on the day John F. Kennedy was assassinated, how old would she be on the day the Berlin Wall fell?",61,https://en.wikipedia.org/wiki/Assassination_of_John_F._Kennedy,"https://en.wikipedia.org/wiki/Fall_of_the_Berlin_Wall#:~:text=The%20fall%20of%20the%20Berlin,restrictions%20were%20overwhelmed%20and%20discarded.",,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/Assassination_of_John_F._Kennedy', 'https://en.wikipedia.org/wiki/Fall_of_the_Berlin_Wall#:~:text=The%20fall%20of%20the%20Berlin,restrictions%20were%20overwhelmed%20and%20discarded.']" +623,"What career was shared by one of the world's oldest fathers, who had a child at the age of 96, and a man who killed his wife, named Nancy, and their son, named Daniel?",professional wrestler,https://en.wikipedia.org/wiki/List_of_oldest_fathers,https://en.wikipedia.org/wiki/Ramjit_Raghav,https://en.wikipedia.org/wiki/Chris_Benoit#Death,https://en.wikipedia.org/wiki/Professional_wrestling#Occupational_hazards,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/List_of_oldest_fathers', 'https://en.wikipedia.org/wiki/Ramjit_Raghav', 'https://en.wikipedia.org/wiki/Chris_Benoit#Death', 'https://en.wikipedia.org/wiki/Professional_wrestling#Occupational_hazards']" +624,Which two rivers pass through the hometown of a famous Chinese philosopher who was born as Kong Qiu,"The Si River and the Yi River pass through Qufu, Confucius' home town.",https://en.wikipedia.org/wiki/Confucius#,https://en.wikipedia.org/wiki/Qufu#Geography,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Confucius#', 'https://en.wikipedia.org/wiki/Qufu#Geography']" +625,"How much larger was the concert capacity of the venue where Led Zeppelin recorded ""The Song Remains the Same"" than the venue where AC/DC recorded their first live album?","16,500",https://en.wikipedia.org/wiki/The_Song_Remains_the_Same_(album),https://en.wikipedia.org/wiki/Madison_Square_Garden,https://en.wikipedia.org/wiki/AC/DC_discography#Live_albums,https://en.wikipedia.org/wiki/If_You_Want_Blood_You%27ve_Got_It,"https://en.wikipedia.org/wiki/The_Apollo,_Glasgow",,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/The_Song_Remains_the_Same_(album)', 'https://en.wikipedia.org/wiki/Madison_Square_Garden', 'https://en.wikipedia.org/wiki/AC/DC_discography#Live_albums', 'https://en.wikipedia.org/wiki/If_You_Want_Blood_You%27ve_Got_It', 'https://en.wikipedia.org/wiki/The_Apollo,_Glasgow']" +626,Five Nights at Freddy's initial game release came one day short of the 19 year anniversary of the death of which Grateful Dead band member?,Jerry Garcia,https://en.wikipedia.org/wiki/Five_Nights_at_Freddy%27s,https://en.wikipedia.org/wiki/Grateful_Dead#Main_career_(1967–1995),,,,,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints | Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/Five_Nights_at_Freddy%27s', 'https://en.wikipedia.org/wiki/Grateful_Dead#Main_career_(1967–1995)']" +627,One episode title from classic Doctor Who series 12 features the name of a Pentateuch book. Can you tell me which part of this episode had the lowest viewing figures in roman numerals?,"III (the answer is Genesis of the Daleks, the 3rd part had the least amount of viewers when broadcast)",https://en.wikipedia.org/wiki/Old_Testament,https://en.wikipedia.org/wiki/Doctor_Who_season_12,https://en.wikipedia.org/wiki/Genesis_of_the_Daleks,,,,,,,,,Tabular reasoning | Multiple constraints | Post processing,"['https://en.wikipedia.org/wiki/Old_Testament', 'https://en.wikipedia.org/wiki/Doctor_Who_season_12', 'https://en.wikipedia.org/wiki/Genesis_of_the_Daleks']" +628,"Which Naruto characters from before the TV series ""Boruto: Naruto the next generation"", can perfectly use the Rasengan technique?","Naruto Uzumaki, Minato Namikaze and Jiraiya.",https://en.wikipedia.org/wiki/Naruto_Shippuden_the_Movie:_The_Lost_Tower,https://en.wikipedia.org/wiki/Naruto_Uzumaki,https://en.wikipedia.org/wiki/Jiraiya_(Naruto),,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Naruto_Shippuden_the_Movie:_The_Lost_Tower', 'https://en.wikipedia.org/wiki/Naruto_Uzumaki', 'https://en.wikipedia.org/wiki/Jiraiya_(Naruto)']" +629,"Twin brothers and former linebackers Ricardo and Devon McDonald were drafted into the NFL in 1992 and 1993, respectively. How many more games did one twin play than the other during their NFL career?",56,https://en.wikipedia.org/wiki/Devon_McDonald,https://en.wikipedia.org/wiki/Ricardo_McDonald,,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/Devon_McDonald', 'https://en.wikipedia.org/wiki/Ricardo_McDonald']" +630,"How old was the United States Air Force when the 317th Fighter-Interceptor Squadron was inactivated, rounded to the nearest year?",22 years old,https://en.wikipedia.org/wiki/317th_Fighter-Interceptor_Squadron,https://en.wikipedia.org/wiki/United_States_Air_Force,,,,,,,,,,Numerical reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/317th_Fighter-Interceptor_Squadron', 'https://en.wikipedia.org/wiki/United_States_Air_Force']" +631,Who was the Argentinian president who resigned from the position the same year the José Martín Olaeta Stadium was inaugurated?,Pedro Ramírez,https://en.wikipedia.org/wiki/Estadio_Jos%C3%A9_Mart%C3%ADn_Olaeta,https://en.wikipedia.org/wiki/1944_in_Argentina,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Estadio_Jos%C3%A9_Mart%C3%ADn_Olaeta', 'https://en.wikipedia.org/wiki/1944_in_Argentina']" +632,"How many days is it from Damon Wayans's first episode as a cast member of Saturday Night Live to Damon Wayans's first episode as a cast member of In Living Color, including the days the first episodes premiered?",1619 days.,https://en.wikipedia.org/wiki/Damon_Wayans,https://en.wikipedia.org/wiki/Saturday_Night_Live_season_11,https://en.wikipedia.org/wiki/In_Living_Color,https://en.wikipedia.org/wiki/List_of_In_Living_Color_episodes,,,,,,,,Numerical reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Damon_Wayans', 'https://en.wikipedia.org/wiki/Saturday_Night_Live_season_11', 'https://en.wikipedia.org/wiki/In_Living_Color', 'https://en.wikipedia.org/wiki/List_of_In_Living_Color_episodes']" +633,"There is a famous Texas city where a Stadium built in 1930 hosted six games for the 1994 World Cup, but only after widening the field and permanently installing natural grass on the playing surface. What was the average (mean) number of attendance for these six games?","58,692",https://en.wikipedia.org/wiki/1994_FIFA_World_Cup,https://en.wikipedia.org/wiki/Cotton_Bowl_(stadium),https://en.wikipedia.org/wiki/Dallas,,,,,,,,,Numerical reasoning | Multiple constraints | Post processing,"['https://en.wikipedia.org/wiki/1994_FIFA_World_Cup', 'https://en.wikipedia.org/wiki/Cotton_Bowl_(stadium)', 'https://en.wikipedia.org/wiki/Dallas']" +634,The September Declaration has a rough equivalent in the Netherlands that takes place on the same day every year. In what century was the current name of this day (as of August 2024) chosen?,the 19th century,https://en.wikipedia.org/wiki/September_Declaration,https://en.wikipedia.org/wiki/Speech_from_the_throne#Netherlands,https://en.wikipedia.org/wiki/Prinsjesdag#History,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/September_Declaration', 'https://en.wikipedia.org/wiki/Speech_from_the_throne#Netherlands', 'https://en.wikipedia.org/wiki/Prinsjesdag#History']" +635,Who was team captain of the team that won the Stanley Cup the year Connor McDavid was born?,Steve Yzerman,https://en.wikipedia.org/wiki/Connor_McDavid,https://en.wikipedia.org/wiki/List_of_Stanley_Cup_champions,https://en.wikipedia.org/wiki/1996%E2%80%9397_Detroit_Red_Wings_season,,,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/Connor_McDavid', 'https://en.wikipedia.org/wiki/List_of_Stanley_Cup_champions', 'https://en.wikipedia.org/wiki/1996%E2%80%9397_Detroit_Red_Wings_season']" +636,"From the date the Soviet Union first used their veto power in the UN security council, how many more years would the then-Soviet leader live?",Seven,https://en.wikipedia.org/wiki/List_of_vetoed_United_Nations_Security_Council_resolutions#Resolutions,https://en.wikipedia.org/wiki/List_of_leaders_of_the_Soviet_Union,https://en.wikipedia.org/wiki/Joseph_Stalin,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/List_of_vetoed_United_Nations_Security_Council_resolutions#Resolutions', 'https://en.wikipedia.org/wiki/List_of_leaders_of_the_Soviet_Union', 'https://en.wikipedia.org/wiki/Joseph_Stalin']" +637,How many years had the station that preceded the Salthill and Monkstown railway station on the historic Dublin and South Eastern Railway line been open on Christmas Day of 2005?,143 years,https://en.wikipedia.org/wiki/Salthill_and_Monkstown_railway_station,https://en.wikipedia.org/wiki/Dublin_Area_Rapid_Transit,https://en.wikipedia.org/wiki/Seapoint_railway_station,,,,,,,,,Numerical reasoning | Multiple constraints | Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/Salthill_and_Monkstown_railway_station', 'https://en.wikipedia.org/wiki/Dublin_Area_Rapid_Transit', 'https://en.wikipedia.org/wiki/Seapoint_railway_station']" +638,How many countries were part of the commonwealth on the date Prince Charles ascended?,"On 8 September 2022, there were 56 countries as part of the Commonwealth.",https://en.wikipedia.org/wiki/Charles_III,"https://en.wikipedia.org/wiki/Member_states_of_the_Commonwealth_of_Nations#:~:text=The%20Republic%20of%20Ireland%20(as,former%20members%20of%20the%20Commonwealth.",,,,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/Charles_III', 'https://en.wikipedia.org/wiki/Member_states_of_the_Commonwealth_of_Nations#:~:text=The%20Republic%20of%20Ireland%20(as,former%20members%20of%20the%20Commonwealth.']" +639,"In feet, subtract the diameter of the pitching plate (""rubber"") in softball, from the distance between the ""points"" of the bases in baseball, and multiply that figure by the year that Joe DiMaggio married Marilyn Monroe. ","144,596. The distance between the points of bases in baseball is 90 feet, subtract the diameter of the pitching plate in softball (16 feet), to get 74, and multiply that by 1954.",https://en.wikipedia.org/wiki/Softball,https://en.wikipedia.org/wiki/Baseball_field,https://en.wikipedia.org/wiki/Joe_DiMaggio,,,,,,,,,Numerical reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Softball', 'https://en.wikipedia.org/wiki/Baseball_field', 'https://en.wikipedia.org/wiki/Joe_DiMaggio']" +640,What are the sizes of the two islands in Crater Lake in acres?,Phantom Ship is 2.3 acres. Wizard Island is 315.85 acres.,https://en.wikipedia.org/wiki/Wizard_Island,https://en.wikipedia.org/wiki/Phantom_Ship_(island),,,,,,,,,,Tabular reasoning | Post processing,"['https://en.wikipedia.org/wiki/Wizard_Island', 'https://en.wikipedia.org/wiki/Phantom_Ship_(island)']" +641,"As of August 3rd 2024, which movie using the Technicolor dye-transfer process was the last to win the Academy Award for Best Picture?","The movie using the Technicolor dye-transfer process that was the last to win the Academy Award for Best Picture was ""The Godfather Part II.""",https://en.wikipedia.org/wiki/Technicolor,https://en.wikipedia.org/wiki/Dye-transfer_process,https://en.wikipedia.org/wiki/The_Godfather_Part_II,https://en.wikipedia.org/wiki/Academy_Award_for_Best_Picture,,,,,,,,Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Technicolor', 'https://en.wikipedia.org/wiki/Dye-transfer_process', 'https://en.wikipedia.org/wiki/The_Godfather_Part_II', 'https://en.wikipedia.org/wiki/Academy_Award_for_Best_Picture']" +642,How much older than Michael B. Jordan is Michael Jordan?,24 years,https://en.wikipedia.org/wiki/Michael_Jordan,https://en.wikipedia.org/wiki/Michael_B._Jordan,,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/Michael_Jordan', 'https://en.wikipedia.org/wiki/Michael_B._Jordan']" +643,The University that Cillian Murphy attended was founded how many years before he began studying?,151 years.,https://en.wikipedia.org/wiki/Cillian_Murphy,https://en.wikipedia.org/wiki/University_College_Cork,,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/Cillian_Murphy', 'https://en.wikipedia.org/wiki/University_College_Cork']" +644,"The incorporation of this company happened in the same year titled on Taylor Swift 5th studio album. It was incorporated by a man born in Setagaya, Tokyo, Japan that worked with the illustrator who largely designed the original 151 Pokémon. What is “this company”?",Game Freak,https://en.wikipedia.org/wiki/1989_(album),https://en.wikipedia.org/wiki/Satoshi_Tajiri,"https://en.wikipedia.org/wiki/Pok%C3%A9mon (NOT REQUIRED, BUT HELPFUL)",,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/1989_(album)', 'https://en.wikipedia.org/wiki/Satoshi_Tajiri', 'https://en.wikipedia.org/wiki/Pok%C3%A9mon (NOT REQUIRED, BUT HELPFUL) ']" +645,"What is the name of the only Texan radio station on frequency 89.5 FM that is operated by a university, as of the death of Jerry West?",KACU,https://en.wikipedia.org/wiki/89.5_FM,https://en.wikipedia.org/wiki/Abilene_Christian_University,https://en.wikipedia.org/wiki/Jerry_West,,,,,,,,,Tabular reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/89.5_FM', 'https://en.wikipedia.org/wiki/Abilene_Christian_University', 'https://en.wikipedia.org/wiki/Jerry_West']" +646,"How much money would be left of Elon Musk's net worth in January of 2021, if you subtracted the 2022 average yearly gross salary of 10,000 people, working in the Philippines, written in words?",one hundred eighty-four billion nine hundred fifty-nine million four hundred forty thousand.,https://en.wikipedia.org/wiki/Wealth_of_Elon_Musk,https://en.wikipedia.org/wiki/Economy_of_the_Philippines,,,,,,,,,,Numerical reasoning | Tabular reasoning | Post processing,"['https://en.wikipedia.org/wiki/Wealth_of_Elon_Musk', 'https://en.wikipedia.org/wiki/Economy_of_the_Philippines']" +647,What city was the capital of the United States on the day that the first president of the United States died?,"Philadelphia, Pennsylvania",https://en.wikipedia.org/wiki/George_Washington#Personal_life,https://en.wikipedia.org/wiki/List_of_capitals_in_the_United_States,,,,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/George_Washington#Personal_life', 'https://en.wikipedia.org/wiki/List_of_capitals_in_the_United_States']" +648,How old was Joel McHale the first time the Philadelphia Waterdogs won the PLL Championship?,Joel McHale was 50 years old.,"https://en.wikipedia.org/wiki/Joel_McHale#:~:text=Joel%20Edward%20McHale%20(born%20November,actor%2C%20comedian%20and%20television%20presenter.",https://en.wikipedia.org/wiki/Philadelphia_Waterdogs#Season_results,,,,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/Joel_McHale#:~:text=Joel%20Edward%20McHale%20(born%20November,actor%2C%20comedian%20and%20television%20presenter.', 'https://en.wikipedia.org/wiki/Philadelphia_Waterdogs#Season_results']" +649,"If somebody was born on the day the Anglo-Dutch Treaty of 1814 was signed, how old would they have been while Albert A. Michelson and Edward W. Morley were performing their Michelson-Morley experiment?",72,https://en.wikipedia.org/wiki/Anglo-Dutch_Treaty_of_1814,https://en.wikipedia.org/wiki/Michelson%E2%80%93Morley_experiment,,,,,,,,,,Numerical reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/Anglo-Dutch_Treaty_of_1814', 'https://en.wikipedia.org/wiki/Michelson%E2%80%93Morley_experiment']" +650,Who won Britain's Got Talent in the same year that London hosted the Olympics for the third time?,Ashleigh and Pudsey,https://en.wikipedia.org/wiki/Category:Summer_Olympics_in_London,https://en.wikipedia.org/wiki/2012_Summer_Olympics,https://en.wikipedia.org/wiki/Britain%27s_Got_Talent,,,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/Category:Summer_Olympics_in_London', 'https://en.wikipedia.org/wiki/2012_Summer_Olympics', 'https://en.wikipedia.org/wiki/Britain%27s_Got_Talent']" +651,"What key signature was the song that was number one on the Billboard Hot 100 on June 8, 2002 performed in?",C major,https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_number_ones_of_2002,https://en.wikipedia.org/wiki/Foolish_(Ashanti_song),,,,,,,,,,Tabular reasoning,"['https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_number_ones_of_2002', 'https://en.wikipedia.org/wiki/Foolish_(Ashanti_song)']" +652,Itanihomi is a municipality in Brazil. What is the state that sits directly south of the state Itanihomi belongs to?,"Itanihomi is in Minas Gerais, and the state directly south of this is Sao Paulo. ",https://en.wikipedia.org/wiki/Itanhomi,"https://en.wikipedia.org/wiki/Federative_units_of_Brazil#/media/File:Brazil,_administrative_divisions_(states)_-_en_-_colored.svg",,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Itanhomi', 'https://en.wikipedia.org/wiki/Federative_units_of_Brazil#/media/File:Brazil,_administrative_divisions_(states)_-_en_-_colored.svg']" +653,Which Greek pole vaulter who participated in the 2020 Summer Olympics in Tokyo also won gold at the 2015 IAAF Diamond League?,Nikoleta Kyriakopoulou,https://en.wikipedia.org/wiki/Athletics_at_the_2020_Summer_Olympics,https://en.wikipedia.org/wiki/2015_Diamond_League,https://en.wikipedia.org/wiki/Nikoleta_Kyriakopoulou,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Athletics_at_the_2020_Summer_Olympics', 'https://en.wikipedia.org/wiki/2015_Diamond_League', 'https://en.wikipedia.org/wiki/Nikoleta_Kyriakopoulou']" +654,"Please consider the following clues and answer the question that follows: 1. This mosque is located in the city dubbed the ""Jerusalem of the Balkans."" 2, The mosque was commissioned by an authoritarian dictator born in a small village 10 km west of Yogyakarta. Question: What is the height difference between the twin towers of the mosque and its dome?",21 meters,"https://en.wikipedia.org/wiki/Istiklal_Mosque,_Sarajevo",https://en.wikipedia.org/wiki/Sarajevo,https://en.wikipedia.org/wiki/Suharto,https://en.wikipedia.org/wiki/Kemusuk,,,,,,,,Numerical reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Istiklal_Mosque,_Sarajevo', 'https://en.wikipedia.org/wiki/Sarajevo', 'https://en.wikipedia.org/wiki/Suharto', 'https://en.wikipedia.org/wiki/Kemusuk']" +655,What is the birthday of the basketball player turned wrestler who shares a nickname with the YouTuber who created the brand Feastables?,"May 23, 1985",https://en.wikipedia.org/wiki/Feastables,https://en.wikipedia.org/wiki/Peter_John_Ramos,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Feastables', 'https://en.wikipedia.org/wiki/Peter_John_Ramos']" +656,"In 2000, Michel von Tell drove a Toyota and placed sixth in an event. Who placed first that same year, and what car were they driving?",Charles Muhanji in a Subaru Impreza WRX,https://en.wikipedia.org/wiki/Michel_von_Tell,https://en.wikipedia.org/wiki/Rwanda_Mountain_Gorilla_Rally,,,,,,,,,,Tabular reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Michel_von_Tell', 'https://en.wikipedia.org/wiki/Rwanda_Mountain_Gorilla_Rally']" +657,"There's an episode of a TV series that was directed by the person who won the 2011 Dorothy Arzner Directors award. The series started in 2005. The season that contains my episode included 20 episodes in total. The episode is one word that starts with an ""M.""",Miracles,https://en.wikipedia.org/wiki/Women_in_Film_Crystal_%2B_Lucy_Awards#Dorothy_Arzner_Directors_award,https://en.wikipedia.org/wiki/Pamela_Fryman,https://en.wikipedia.org/wiki/How_I_Met_Your_Mother,https://en.wikipedia.org/wiki/List_of_How_I_Met_Your_Mother_episodes#Season_3_(2007%E2%80%9308),,,,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Women_in_Film_Crystal_%2B_Lucy_Awards#Dorothy_Arzner_Directors_award', 'https://en.wikipedia.org/wiki/Pamela_Fryman', 'https://en.wikipedia.org/wiki/How_I_Met_Your_Mother', 'https://en.wikipedia.org/wiki/List_of_How_I_Met_Your_Mother_episodes#Season_3_(2007%E2%80%9308)']" +658,"What is the birth town of the absent athlete from the Victory Salute statue in San Jose, California?","Peter Norman was born in Coburg, Victoria, Australia.",https://en.wikipedia.org/wiki/Victory_Salute_(statue),https://en.wikipedia.org/wiki/Peter_Norman,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Victory_Salute_(statue)', 'https://en.wikipedia.org/wiki/Peter_Norman']" +659,What is the capacity of the Olympic stadium used during the first Winter Games attended by a tropical nation? This nation was visited by the 5th Cavalry Regiment (US) in 1901.,"17,324",https://en.wikipedia.org/wiki/5th_Cavalry_Regiment,https://en.wikipedia.org/wiki/Philippines,https://en.wikipedia.org/wiki/1972_Winter_Olympics,https://en.wikipedia.org/wiki/Makomanai_Open_Stadium,,,,,,,,Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/5th_Cavalry_Regiment', 'https://en.wikipedia.org/wiki/Philippines', 'https://en.wikipedia.org/wiki/1972_Winter_Olympics', 'https://en.wikipedia.org/wiki/Makomanai_Open_Stadium']" +660,"In 2010, the WWE Hall of Fame took place in the same stadium as a famous comedy movie done a few years before. Who are the four comedians that starred in this comedy movie?","jeff Foxworthy, Bill Engvall, Ron White and Larry the Cable Guy.",https://en.wikipedia.org/wiki/WWE_Hall_of_Fame_(2010),https://en.wikipedia.org/wiki/Arizona_Financial_Theatre,https://en.wikipedia.org/wiki/Blue_Collar_Comedy_Tour:_The_Movie,,,,,,,,,Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/WWE_Hall_of_Fame_(2010)', 'https://en.wikipedia.org/wiki/Arizona_Financial_Theatre', 'https://en.wikipedia.org/wiki/Blue_Collar_Comedy_Tour:_The_Movie']" +661,What year was the first Uber employee born?,1983,https://en.wikipedia.org/wiki/Uber,https://en.wikipedia.org/wiki/Ryan_Graves_(businessman),,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Uber', 'https://en.wikipedia.org/wiki/Ryan_Graves_(businessman)']" +662,Which British prime minister of the 1990s had the most children?,Tony Blair,https://en.wikipedia.org/wiki/List_of_prime_ministers_of_the_United_Kingdom,https://en.wikipedia.org/wiki/Margaret_Thatcher,https://en.wikipedia.org/wiki/John_Major,https://en.wikipedia.org/wiki/Tony_Blair,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/List_of_prime_ministers_of_the_United_Kingdom', 'https://en.wikipedia.org/wiki/Margaret_Thatcher', 'https://en.wikipedia.org/wiki/John_Major', 'https://en.wikipedia.org/wiki/Tony_Blair']" +663,What was the daily average passenger count in 2011 of the first station on the train line that serves Hiraka Train Station in Japan?,"2,851 Passengers",https://en.wikipedia.org/wiki/Hiraka_Station,https://en.wikipedia.org/wiki/K%C5%8Dnan_Railway_K%C5%8Dnan_Line,https://en.wikipedia.org/wiki/Hirosaki_Station,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Hiraka_Station', 'https://en.wikipedia.org/wiki/K%C5%8Dnan_Railway_K%C5%8Dnan_Line', 'https://en.wikipedia.org/wiki/Hirosaki_Station']" +664,How many Red Hot Chili Peppers albums were released while Nelson Mandela was in prison?,4,https://en.wikipedia.org/wiki/Red_Hot_Chili_Peppers#Discography,https://en.wikipedia.org/wiki/Nelson_Mandela#Imprisonment,,,,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/Red_Hot_Chili_Peppers#Discography', 'https://en.wikipedia.org/wiki/Nelson_Mandela#Imprisonment']" +665,How many years after the first album release by The Beatles was the first solo album released by one of its members?,Five years,https://en.wikipedia.org/wiki/The_Beatles_discography,https://en.wikipedia.org/wiki/George_Harrison#Discography,,,,,,,,,,Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/The_Beatles_discography', 'https://en.wikipedia.org/wiki/George_Harrison#Discography']" +666,"Who starred in a movie about the subject of a song on the compilation album ""Friends and Relatives,"" and also played themselves on an episode of ""Friends""?",Isabella Rossellini,https://en.wikipedia.org/wiki/Friends_%26_Relatives,https://en.wikipedia.org/wiki/Beethoven_in_film,https://en.wikipedia.org/wiki/Immortal_Beloved_(1994_film),https://en.wikipedia.org/wiki/Isabella_Rossellini,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Friends_%26_Relatives', 'https://en.wikipedia.org/wiki/Beethoven_in_film', 'https://en.wikipedia.org/wiki/Immortal_Beloved_(1994_film)', 'https://en.wikipedia.org/wiki/Isabella_Rossellini']" +667,"In 1966, the Lower Klamath National Wildlife Refuge became part of the U.S. National Register of Historic Places (NRHP). What is another natural site with water added during that year, also located in California?",Lake Merritt,https://en.wikipedia.org/wiki/Old_Mission_Dam,https://en.wikipedia.org/wiki/Pecos_National_Historical_Park,https://en.wikipedia.org/wiki/Walden_Pond,https://en.wikipedia.org/wiki/Seal_Island_Historic_District,https://en.wikipedia.org/wiki/Russian_Bishop%27s_House,https://en.wikipedia.org/wiki/Luther_Burbank_Home_and_Gardens,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Old_Mission_Dam', 'https://en.wikipedia.org/wiki/Pecos_National_Historical_Park', 'https://en.wikipedia.org/wiki/Walden_Pond', 'https://en.wikipedia.org/wiki/Seal_Island_Historic_District', 'https://en.wikipedia.org/wiki/Russian_Bishop%27s_House', 'https://en.wikipedia.org/wiki/Luther_Burbank_Home_and_Gardens']" +668,What was the magnitude of the earthquake that was the catalyst for the charitable U.S. speedrunning marathon that took place in April 2011?,Mw 9.0–9.1,https://en.wikipedia.org/wiki/Speedrunning,https://en.wikipedia.org/wiki/Games_Done_Quick,https://en.wikipedia.org/wiki/2011_T%C5%8Dhoku_earthquake_and_tsunami,,,,,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Speedrunning', 'https://en.wikipedia.org/wiki/Games_Done_Quick', 'https://en.wikipedia.org/wiki/2011_T%C5%8Dhoku_earthquake_and_tsunami']" +669,What Volkswagen former car model has nearly the same name as a Bangkok rooftop restaurant? The car has one additional letter.,Scirocco (The Bangkok restaurant is called Sirocco),https://en.wikipedia.org/wiki/Lebua_at_State_Tower,https://en.wikipedia.org/wiki/List_of_Volkswagen_vehicles,https://en.wikipedia.org/wiki/Volkswagen_Scirocco,,,,,,,,,Multiple constraints | Post processing,"['https://en.wikipedia.org/wiki/Lebua_at_State_Tower', 'https://en.wikipedia.org/wiki/List_of_Volkswagen_vehicles', 'https://en.wikipedia.org/wiki/Volkswagen_Scirocco']" +670,As of 1st August 2024 The three British Olympic Class ocean liners were manufactured in a city that lies at the mouth of what river?,River Lagan,https://en.wikipedia.org/wiki/HMHS_Britannic,https://en.wikipedia.org/wiki/Belfast,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/HMHS_Britannic', 'https://en.wikipedia.org/wiki/Belfast']" +671,"The Mossi King who passed away in Thailand in 2016, would have spoken what language?",Mòoré,https://en.wikipedia.org/wiki/Tenkodogo,https://en.wikipedia.org/wiki/Mossi_Kingdoms,https://en.wikipedia.org/wiki/List_of_rulers_of_the_Mossi_state_of_Tenkodogo,,,,,,,,,Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Tenkodogo', 'https://en.wikipedia.org/wiki/Mossi_Kingdoms', 'https://en.wikipedia.org/wiki/List_of_rulers_of_the_Mossi_state_of_Tenkodogo']" +672,Which Colombian cyclist was born on the same day as Edmonton Oilers captain Connor McDavid?,Egan Bernal,https://en.wikipedia.org/wiki/Connor_McDavid,https://en.wikipedia.org/wiki/January_13,,,,,,,,,,Numerical reasoning | Tabular reasoning,"['https://en.wikipedia.org/wiki/Connor_McDavid', 'https://en.wikipedia.org/wiki/January_13']" +673,"From 1924 to August 2024, how many times did Texas's and California's electoral colleges elect the same nominee during the presidential election?",13 times.,https://en.wikipedia.org/wiki/Politics_of_California,https://en.wikipedia.org/wiki/Politics_of_Texas,,,,,,,,,,Tabular reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/Politics_of_California', 'https://en.wikipedia.org/wiki/Politics_of_Texas']" +674,Who composed the Broadway musical that premiered in 2003 and starred the actress who would later voice Elsa in Disney's Frozen?,Stephen Schwartz,https://en.wikipedia.org/wiki/Wicked_(musical),https://en.wikipedia.org/wiki/Idina_Menzel,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Wicked_(musical)', 'https://en.wikipedia.org/wiki/Idina_Menzel']" +675,"In physics, when speaking of classical mechanics, there is an infamous problem that involves taking the initial positions and velocities of three point masses that orbit each other and attempting to calculate their trajectories. There is no general closed-form solution for this infamous problem. French mathematicians in the 18th century focused on solving this problem in regards to astronomical motion, specifically how the Moon rotates on its apsides. Their work led to a solution using Newton's laws of physics and the Discrete Fourier Transformation (DFT), which ultimately changed how sailors were able to determine longitude at sea. The inventor of the device that would revolutionize naval navigation using these new principles and proofs spent how many years testing and perfecting his work?",31 years (1730-1761),https://en.wikipedia.org/wiki/Three-body_problem,https://en.wikipedia.org/wiki/Alexis_Clairaut#Focus_on_astronomical_motion,https://en.wikipedia.org/wiki/Marine_chronometer,https://en.wikipedia.org/wiki/John_Harrison,,,,,,,,Tabular reasoning | Multiple constraints | Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/Three-body_problem', 'https://en.wikipedia.org/wiki/Alexis_Clairaut#Focus_on_astronomical_motion', 'https://en.wikipedia.org/wiki/Marine_chronometer', 'https://en.wikipedia.org/wiki/John_Harrison']" +676,"What is the country of origin of the football coach with the first initial ""P"" for the Thailand national men's football team who coached 54 years after the country's name officially changed?",German.,https://en.wikipedia.org/wiki/Thailand,https://en.wikipedia.org/wiki/Thailand_national_football_team#Coaching_history,,,,,,,,,,Numerical reasoning | Tabular reasoning | Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/Thailand', 'https://en.wikipedia.org/wiki/Thailand_national_football_team#Coaching_history']" +677,Archibald Sinclair had an American mom who was a half-sister. The half-sister had a life partner who had a painting of her by Walter Sickert. How many words is the title of that painting?,5 ( Miss Hudson at Rowlandson House),"https://en.wikipedia.org/wiki/Archibald_Sinclair,_1st_Viscount_Thurso",https://en.wikipedia.org/wiki/Ethel_Sands,https://en.wikipedia.org/wiki/Anna_Hope_Hudson,,,,,,,,,Post processing,"['https://en.wikipedia.org/wiki/Archibald_Sinclair,_1st_Viscount_Thurso', 'https://en.wikipedia.org/wiki/Ethel_Sands', 'https://en.wikipedia.org/wiki/Anna_Hope_Hudson']" +678,What movie won the Academy Award for Best Picture the same year that Argentina won its first World Cup?,The Deer Hunter,"https://en.wikipedia.org/wiki/Argentina_at_the_FIFA_World_Cup#:~:text=Argentina%20is%20one%20of%20the,in%201930%2C%201990%20and%202014.",https://en.wikipedia.org/wiki/Academy_Award_for_Best_Picture#1970s,,,,,,,,,,Tabular reasoning,"['https://en.wikipedia.org/wiki/Argentina_at_the_FIFA_World_Cup#:~:text=Argentina%20is%20one%20of%20the,in%201930%2C%201990%20and%202014.', 'https://en.wikipedia.org/wiki/Academy_Award_for_Best_Picture#1970s']" +679,"What is the other name, beginning and ending with the letter ""d"", for the substance, often deployed by means of artillery shells, which was also instrumental in developing a treatment for a condition that dogs of the same breed as Mayor Max II are unusually prone to?",Distilled mustard,https://en.wikipedia.org/wiki/Mayor_Max_II,https://en.wikipedia.org/wiki/Golden_Retriever#Health,https://en.wikipedia.org/wiki/Hemangiosarcoma#Treatments,https://en.wikipedia.org/wiki/Chemotherapy#History,https://en.wikipedia.org/wiki/Mustard_gas,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Mayor_Max_II', 'https://en.wikipedia.org/wiki/Golden_Retriever#Health', 'https://en.wikipedia.org/wiki/Hemangiosarcoma#Treatments', 'https://en.wikipedia.org/wiki/Chemotherapy#History', 'https://en.wikipedia.org/wiki/Mustard_gas']" +680,"As of May 2024, which female New Zealand Prime Minister was the oldest when they took office?",Helen Clark.,https://en.wikipedia.org/wiki/List_of_prime_ministers_of_New_Zealand,https://en.wikipedia.org/wiki/Jenny_Shipley,https://en.wikipedia.org/wiki/Helen_Clark,https://en.wikipedia.org/wiki/Jacinda_Ardern,,,,,,,,Numerical reasoning | Multiple constraints | Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/List_of_prime_ministers_of_New_Zealand', 'https://en.wikipedia.org/wiki/Jenny_Shipley', 'https://en.wikipedia.org/wiki/Helen_Clark', 'https://en.wikipedia.org/wiki/Jacinda_Ardern']" +681,What is the capital of the country where the Treaty on European Union was signed?,Amsterdam,https://en.wikipedia.org/wiki/Maastricht_Treaty,https://en.wikipedia.org/wiki/Netherlands,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Maastricht_Treaty', 'https://en.wikipedia.org/wiki/Netherlands']" +682,Which happened earlier: Diageo reducing the volume of Red stripe beer bottles in the US from 12 fl. oz. to 11.2 fl. oz. or Philip Morris making a bid for the company Swedish Match?,The Red Stripe US bottle volume reduction happened earlier.,https://en.m.wikipedia.org/wiki/Red_Stripe,https://en.m.wikipedia.org/wiki/Swedish_Match,,,,,,,,,,Temporal reasoning,"['https://en.m.wikipedia.org/wiki/Red_Stripe', 'https://en.m.wikipedia.org/wiki/Swedish_Match']" +683,Which 2024 college/university president has a degree from Harvard University: the president from the one that organizes the Miami Book Fair or from the one that organizes the Kentucky Women Writers Conference?,Eli Capilouto,https://en.wikipedia.org/wiki/Miami_Book_Fair_International,https://en.wikipedia.org/wiki/Miami_Dade_College,https://en.wikipedia.org/wiki/Kentucky_Women_Writers_Conference,https://en.wikipedia.org/wiki/University_of_Kentucky,https://en.wikipedia.org/wiki/Eli_Capilouto,https://en.wikipedia.org/wiki/Madeline_Pumariega,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Miami_Book_Fair_International', 'https://en.wikipedia.org/wiki/Miami_Dade_College', 'https://en.wikipedia.org/wiki/Kentucky_Women_Writers_Conference', 'https://en.wikipedia.org/wiki/University_of_Kentucky', 'https://en.wikipedia.org/wiki/Eli_Capilouto', 'https://en.wikipedia.org/wiki/Madeline_Pumariega']" +684,What Doctor Who episode aired on a date closest to the 441st launch of the Skylark rocket?,Dalek,https://en.wikipedia.org/wiki/Doctor_Who_series_1,https://en.wikipedia.org/wiki/Skylark_(rocket),,,,,,,,,,Tabular reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/Doctor_Who_series_1', 'https://en.wikipedia.org/wiki/Skylark_(rocket)']" +685,The shonen manga that won the 35th Kodansha Manga Award has how many chapters in its final volume?,5,https://en.wikipedia.org/wiki/Kodansha_Manga_Award,https://en.wikipedia.org/wiki/List_of_Attack_on_Titan_chapters,,,,,,,,,,Tabular reasoning,"['https://en.wikipedia.org/wiki/Kodansha_Manga_Award', 'https://en.wikipedia.org/wiki/List_of_Attack_on_Titan_chapters']" +686,"How many years apart did a Kim Jong Un impersonator who was interviewed by USA today, and the man who founded Playboy attend the University of Illinois Urbana-Champaign?",60,https://en.wikipedia.org/wiki/University_of_Illinois_Urbana-Champaign#Notable_alumni_and_faculty,https://en.wikipedia.org/wiki/Hugh_Hefner#Early_life_and_education,https://en.wikipedia.org/wiki/Minyong_Kim,,,,,,,,,Numerical reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/University_of_Illinois_Urbana-Champaign#Notable_alumni_and_faculty', 'https://en.wikipedia.org/wiki/Hugh_Hefner#Early_life_and_education', 'https://en.wikipedia.org/wiki/Minyong_Kim']" +687,"In 2024, assuming that their family has been in the same line of work since they took a surname, what is the major use of the product they make if their last name is Kalkbrenner?",Making steel.,https://en.wikipedia.org/wiki/Kalkbrenner,https://en.wikipedia.org/wiki/Lime_kiln,https://en.wikipedia.org/wiki/Calcium_oxide,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Kalkbrenner', 'https://en.wikipedia.org/wiki/Lime_kiln', 'https://en.wikipedia.org/wiki/Calcium_oxide']" +688,"The person who posted a photo with Rahul Ligma and Daniel Johnson at the headquarters of a social media company claims to have a certain syndrome, despite never receiving a formal diagnosis. Who was this syndrome named after?",Hans Asperger,https://en.wikipedia.org/wiki/Rahul_Ligma,https://en.wikipedia.org/wiki/Elon_Musk#Personal_life,https://en.wikipedia.org/wiki/Asperger_syndrome,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Rahul_Ligma', 'https://en.wikipedia.org/wiki/Elon_Musk#Personal_life', 'https://en.wikipedia.org/wiki/Asperger_syndrome']" +689,Emma Stone was the highest paid actress in 2017. How much did breakthrough movie for the highest paid actress 6 years after Emma Stone make in its theatrical run?,$406.9 million,https://en.wikipedia.org/wiki/Emma_Stone,https://en.wikipedia.org/wiki/List_of_highest-paid_film_actors,https://en.wikipedia.org/wiki/Margot_Robbie,https://en.wikipedia.org/wiki/The_Wolf_of_Wall_Street_(2013_film),,,,,,,,Tabular reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/Emma_Stone', 'https://en.wikipedia.org/wiki/List_of_highest-paid_film_actors', 'https://en.wikipedia.org/wiki/Margot_Robbie', 'https://en.wikipedia.org/wiki/The_Wolf_of_Wall_Street_(2013_film)']" +690,The lead actor who plays the regional manager of this popular mockumentary sitcom released in 2005 has the same initials as Santa Claus. What is the name of the voice character for Flower in the latest animated film this actor starred in in 2024?,Matt Damon,https://en.wikipedia.org/wiki/The_Office,https://en.wikipedia.org/wiki/Steve_Carell#2004%E2%80%932013:_The_Office_and_comedic_roles,https://en.wikipedia.org/wiki/IF_(film),,,,,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/The_Office', 'https://en.wikipedia.org/wiki/Steve_Carell#2004%E2%80%932013:_The_Office_and_comedic_roles', 'https://en.wikipedia.org/wiki/IF_(film)']" +691,This blood pressure drug commonly used to treat gestational hypertension was patented the same year the first black student at the University of Mississippi was shot.,Labetalol,https://en.wikipedia.org/wiki/University_of_Mississippi,https://en.wikipedia.org/wiki/James_Meredith,https://en.wikipedia.org/wiki/Gestational_hypertension,https://en.wikipedia.org/wiki/Labetalol,,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/University_of_Mississippi', 'https://en.wikipedia.org/wiki/James_Meredith', 'https://en.wikipedia.org/wiki/Gestational_hypertension', 'https://en.wikipedia.org/wiki/Labetalol']" +692,"Little River Canyon National Preserve has three waterfalls, one of which is the tallest in Alabama. How much shorter is Alabama's tallest waterfall than the tallest waterfall in the continental US?","2,452 ft",https://en.wikipedia.org/wiki/Grace%27s_High_Falls,https://en.wikipedia.org/wiki/List_of_waterfalls_by_height#By_overall_height,,,,,,,,,,Numerical reasoning | Tabular reasoning | Post processing,"['https://en.wikipedia.org/wiki/Grace%27s_High_Falls', 'https://en.wikipedia.org/wiki/List_of_waterfalls_by_height#By_overall_height']" +693,What is the name of the retired Swiss tennis player who made the 4th round of Wimbledon in 2002?,Michel Kratochvil,https://en.wikipedia.org/wiki/2002_Wimbledon_Championships_%E2%80%93_Men%27s_singles,https://en.wikipedia.org/wiki/Michel_Kratochvil,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/2002_Wimbledon_Championships_%E2%80%93_Men%27s_singles', 'https://en.wikipedia.org/wiki/Michel_Kratochvil']" +694,Who did the Canadian swimmer Eric Lamont compete against in Heat 3 of the freestyle competition that he was older than?,"No one, he was the youngest.",https://en.wikipedia.org/wiki/Eric_Jubb,https://en.wikipedia.org/wiki/Swimming_at_the_1948_Summer_Olympics_%E2%80%93_Men%27s_100_metre_freestyle,https://en.wikipedia.org/wiki/Sachin_Nag,https://en.wikipedia.org/wiki/Ali_Mustafa_Baghdady,https://en.wikipedia.org/wiki/Warren_Boyd_(swimmer),https://en.wikipedia.org/wiki/Alberto_Isaac,https://en.wikipedia.org/wiki/G%C3%A9za_K%C3%A1das,,,,,Numerical reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Eric_Jubb', 'https://en.wikipedia.org/wiki/Swimming_at_the_1948_Summer_Olympics_%E2%80%93_Men%27s_100_metre_freestyle', 'https://en.wikipedia.org/wiki/Sachin_Nag', 'https://en.wikipedia.org/wiki/Ali_Mustafa_Baghdady', 'https://en.wikipedia.org/wiki/Warren_Boyd_(swimmer)', 'https://en.wikipedia.org/wiki/Alberto_Isaac', 'https://en.wikipedia.org/wiki/G%C3%A9za_K%C3%A1das']" +695,"I'm a spooky tourist and I'm visiting Savannah, GA. I'm going to visit two of the most well-known cemeteries, what Cemetery in Savannah is famous for being in a book and on a book cover? What was the book? There is another old cemetery downtown I want to visit, I heard epidemic victims were buried there. What epidemic and what years did it take place? How many victims are buried there?","Bonaventure Cemetery, ""Midnight in the Garden of Good and Evil."" Colonial Park, Yellow Fever; 1820s, estimated 700 victims.",https://en.wikipedia.org/wiki/Bonaventure_Cemetery,https://en.wikipedia.org/wiki/Colonial_Park_Cemetery,https://en.wikipedia.org/wiki/Midnight_in_the_Garden_of_Good_and_Evil,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Bonaventure_Cemetery', 'https://en.wikipedia.org/wiki/Colonial_Park_Cemetery', 'https://en.wikipedia.org/wiki/Midnight_in_the_Garden_of_Good_and_Evil']" +696,"Of the 6 main cast members of the series Friends, which have appeared in music videos?","Of the 6 main cast members of Friends, Jennifer Anniston, Courtney Cox, and Matt LeBlanc have all appeared in music videos.",https://en.wikipedia.org/wiki/Friends,https://en.wikipedia.org/wiki/Jennifer_Aniston,https://en.wikipedia.org/wiki/Courteney_Cox,https://en.wikipedia.org/wiki/Lisa_Kudrow,https://en.wikipedia.org/wiki/Matt_LeBlanc,https://en.wikipedia.org/wiki/Matthew_Perry,https://en.wikipedia.org/wiki/David_Schwimmer,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Friends', 'https://en.wikipedia.org/wiki/Jennifer_Aniston', 'https://en.wikipedia.org/wiki/Courteney_Cox', 'https://en.wikipedia.org/wiki/Lisa_Kudrow', 'https://en.wikipedia.org/wiki/Matt_LeBlanc', 'https://en.wikipedia.org/wiki/Matthew_Perry', 'https://en.wikipedia.org/wiki/David_Schwimmer']" +697,In what city were the Summer Olympic Games held in the year the RMS _Titanic_ sank?,Stockholm (Sweden),https://en.wikipedia.org/wiki/Titanic,https://en.wikipedia.org/wiki/1912_Summer_Olympics,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Titanic', 'https://en.wikipedia.org/wiki/1912_Summer_Olympics']" +698,The manager Pep Guardiola won the Premier League four years in a row with a football club whose owners are from a country whose main export in 2009 was which raw material?,"Oil. ('Oil' is fine here but related words from the Wikipedia article (link 6) are also acceptable, such as 'petroleum')",https://en.wikipedia.org/wiki/Pep_Guardiola,https://en.wikipedia.org/wiki/Manchester_City_F.C.,https://en.wikipedia.org/wiki/City_Football_Group,https://en.wikipedia.org/wiki/Abu_Dhabi_United_Group,https://en.wikipedia.org/wiki/United_Arab_Emirates,https://en.wikipedia.org/wiki/Economy_of_the_United_Arab_Emirates,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Pep_Guardiola', 'https://en.wikipedia.org/wiki/Manchester_City_F.C.', 'https://en.wikipedia.org/wiki/City_Football_Group', 'https://en.wikipedia.org/wiki/Abu_Dhabi_United_Group', 'https://en.wikipedia.org/wiki/United_Arab_Emirates', 'https://en.wikipedia.org/wiki/Economy_of_the_United_Arab_Emirates']" +699,"As of August 3 2024, which surviving building of the World's Columbian Exposition of 1893 sits on the same street as a skyscraper over 1,000 feet tall? Give the surviving building's current name.",Art Institute of Chicago,https://en.wikipedia.org/wiki/World%27s_Columbian_Exposition,https://en.wikipedia.org/wiki/Art_Institute_of_Chicago,https://en.wikipedia.org/wiki/John_Hancock_Center,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/World%27s_Columbian_Exposition', 'https://en.wikipedia.org/wiki/Art_Institute_of_Chicago', 'https://en.wikipedia.org/wiki/John_Hancock_Center']" +700,How many more wins did the team with the number one seed from the NBA Western Conference in the 2020-2021 season have than the team with the fifth seed from the NBA Western Conference in the 2019-2020 season?,8,https://en.wikipedia.org/wiki/2020%E2%80%9321_NBA_season,https://en.wikipedia.org/wiki/2019%E2%80%9320_NBA_season,,,,,,,,,,Numerical reasoning | Tabular reasoning,"['https://en.wikipedia.org/wiki/2020%E2%80%9321_NBA_season', 'https://en.wikipedia.org/wiki/2019%E2%80%9320_NBA_season']" +701,"Cut It by O.T Genasis was released in 2015. What is the name of the streaming service that exclusively hosted the music video of the song that ranked one position above ""Cut It"" on the US Billboard Hot 100 of 2016?",Tidal,https://en.wikipedia.org/wiki/Cut_It,https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_2016,https://en.wikipedia.org/wiki/No_Limit_(Usher_song),,,,,,,,,Tabular reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/Cut_It', 'https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_2016', 'https://en.wikipedia.org/wiki/No_Limit_(Usher_song)']" +702,An episode of the first season of the show Digimon had an English air date exactly 2 years before 9/11 - what Digimon destroyed the Black Gear in that episode?,Kabuterimon,https://en.wikipedia.org/wiki/Digimon,https://en.wikipedia.org/wiki/Digimon_Adventure_(1999_TV_series),https://en.wikipedia.org/wiki/List_of_Digimon_Adventure_(1999_TV_series)_episodes,https://en.wikipedia.org/wiki/September_11_attacks,,,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Digimon', 'https://en.wikipedia.org/wiki/Digimon_Adventure_(1999_TV_series)', 'https://en.wikipedia.org/wiki/List_of_Digimon_Adventure_(1999_TV_series)_episodes', 'https://en.wikipedia.org/wiki/September_11_attacks']" +703,How many years after Prohibition ended was Gone With The Wind released?,6 years.,https://en.wikipedia.org/wiki/Prohibition_in_the_United_States,https://en.wikipedia.org/wiki/Gone_with_the_Wind_(film),,,,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/Prohibition_in_the_United_States', 'https://en.wikipedia.org/wiki/Gone_with_the_Wind_(film)']" +704,Who was on the British throne when the England Men’s Cricket Team first beat Australia in a Test Series?,"Queen Victoria, 1882.",https://en.wikipedia.org/wiki/The_Ashes,https://en.wikipedia.org/wiki/List_of_British_monarchs,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/The_Ashes', 'https://en.wikipedia.org/wiki/List_of_British_monarchs']" +705,How many years had the then-Prime Minister of France been in office as PM when the first Shelby GT500 was built?,5,https://en.wikipedia.org/wiki/Shelby_Mustang,https://en.wikipedia.org/wiki/Georges_Pompidou,,,,,,,,,,Numerical reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Shelby_Mustang', 'https://en.wikipedia.org/wiki/Georges_Pompidou']" +706,Azamat Satybaldy's appearance in the film Road to Mother occurred in the same year as the Trace Gas Orbiter's launch from what location?,Baikonur Cosmodrome,https://en.wikipedia.org/wiki/Azamat_Satybaldy,https://en.wikipedia.org/wiki/Trace_Gas_Orbiter,,,,,,,,,,Tabular reasoning,"['https://en.wikipedia.org/wiki/Azamat_Satybaldy', 'https://en.wikipedia.org/wiki/Trace_Gas_Orbiter']" +707,Who was the winner of the Nobel Peace Prize the year that U.S. President Barack Obama awarded baseball player Willie Mays the Presidential Medal of Freedom?,The Tunisian National Dialogue Quartet,https://en.wikipedia.org/wiki/Presidential_Medal_of_Freedom,https://en.wikipedia.org/wiki/Willie_Mays,https://en.wikipedia.org/wiki/Nobel_Peace_Prize,https://en.wikipedia.org/wiki/List_of_Nobel_Peace_Prize_laureates,https://en.wikipedia.org/wiki/Tunisian_National_Dialogue_Quartet,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Presidential_Medal_of_Freedom', 'https://en.wikipedia.org/wiki/Willie_Mays', 'https://en.wikipedia.org/wiki/Nobel_Peace_Prize', 'https://en.wikipedia.org/wiki/List_of_Nobel_Peace_Prize_laureates', 'https://en.wikipedia.org/wiki/Tunisian_National_Dialogue_Quartet']" +708,"What song did Christina Aguilera release after Britney Spears released ""...Baby One More Time.""?",Genie in a Bottle ,https://en.wikipedia.org/wiki/...Baby_One_More_Time_(song),https://en.wikipedia.org/wiki/List_of_songs_recorded_by_Christina_Aguilera,https://en.wikipedia.org/wiki/Genie_in_a_Bottle,,,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/...Baby_One_More_Time_(song)', 'https://en.wikipedia.org/wiki/List_of_songs_recorded_by_Christina_Aguilera', 'https://en.wikipedia.org/wiki/Genie_in_a_Bottle']" +709,"What primate species, known for its large population in China and presence in Florida, could potentially carry the Herpes B virus?",Rhesus Macaques,https://en.wikipedia.org/wiki/Wildlife_of_China,https://en.wikipedia.org/wiki/Macaque,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Wildlife_of_China', 'https://en.wikipedia.org/wiki/Macaque']" +710,What drug did the male founder of the company that first cloned a U.S. endangered species help study with the International Foundation for Advanced Study?,LSD,https://en.wikipedia.org/wiki/Elizabeth_Ann,https://en.wikipedia.org/wiki/Revive_%26_Restore,https://en.wikipedia.org/wiki/Stewart_Brand,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Elizabeth_Ann', 'https://en.wikipedia.org/wiki/Revive_%26_Restore', 'https://en.wikipedia.org/wiki/Stewart_Brand']" +711,What made-for-TV movie did Dolly Parton have a role in the same year that Dolly the sheep was cloned?,"In 1996, the year Dolly the sheep was cloned, Dolly Parton had a role in the made-for-tv movie ""Unlikely Angel"".",https://en.wikipedia.org/wiki/Dolly_(sheep),https://en.wikipedia.org/wiki/Dolly_Parton,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Dolly_(sheep)', 'https://en.wikipedia.org/wiki/Dolly_Parton']" +712,"Before the COVID-19 pandemic, how many Juno Awards nominations did the Saskatoon bands The Northern Pikes, Wide Mouth Mason, and The Sheepdogs have combined?",18,https://en.wikipedia.org/wiki/The_Northern_Pikes,https://en.wikipedia.org/wiki/Wide_Mouth_Mason,https://en.wikipedia.org/wiki/The_Sheepdogs,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/The_Northern_Pikes', 'https://en.wikipedia.org/wiki/Wide_Mouth_Mason', 'https://en.wikipedia.org/wiki/The_Sheepdogs']" +713,How many years did Cardi B's rap career overlap with Tupac's rap career?,Zero,https://en.wikipedia.org/wiki/Cardi_B,https://en.wikipedia.org/wiki/Tupac_Shakur,,,,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/Cardi_B', 'https://en.wikipedia.org/wiki/Tupac_Shakur']" +714,What date did the Lego Avatar theme debut? How many years are there between the release of the original movie and the release of the Lego theme?,"The debut date was October 1st, 2022. It debuted 13 years after the release of the movie.",https://en.wikipedia.org/wiki/Lego_Avatar,https://en.wikipedia.org/wiki/Avatar_(franchise),,,,,,,,,,Numerical reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/Lego_Avatar', 'https://en.wikipedia.org/wiki/Avatar_(franchise)']" +715,"What year did the author of ""The Conquest for Bread"" write about ""Mutual Aid""? Who was the author?","Peter Kropotkin wrote the series of essays ""Mutual Aid"" in 1902.",https://en.wikipedia.org/wiki/The_Conquest_of_Bread,https://en.wikipedia.org/wiki/Peter_Kropotkin,https://en.wikipedia.org/wiki/Mutual_Aid:_A_Factor_of_Evolution,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/The_Conquest_of_Bread', 'https://en.wikipedia.org/wiki/Peter_Kropotkin', 'https://en.wikipedia.org/wiki/Mutual_Aid:_A_Factor_of_Evolution']" +716,"What were the last names of the players selected for the Pro Bowl from the NFL team that was featured in the movie ""Ace Ventura: Pet Detective""? Base the answer on the following specifications: -- These players were on the team while Wayne Huizenga was the owner -- The team that these players were on achieved a 10-6 regular season record while still making the playoffs during their Pro Bowl season","Bowens, Thomas, & Madison ",https://en.wikipedia.org/wiki/Ace_Ventura:_Pet_Detective,https://en.wikipedia.org/wiki/Miami_Dolphins,https://en.wikipedia.org/wiki/1995_Miami_Dolphins_season,https://en.wikipedia.org/wiki/1997_Miami_Dolphins_season,https://en.wikipedia.org/wiki/1998_Miami_Dolphins_season,https://en.wikipedia.org/wiki/1999_Miami_Dolphins_season,https://en.wikipedia.org/wiki/2000_Miami_Dolphins_season,https://en.wikipedia.org/wiki/2001_Miami_Dolphins_season,https://en.wikipedia.org/wiki/2008_Miami_Dolphins_season,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Ace_Ventura:_Pet_Detective', 'https://en.wikipedia.org/wiki/Miami_Dolphins', 'https://en.wikipedia.org/wiki/1995_Miami_Dolphins_season', 'https://en.wikipedia.org/wiki/1997_Miami_Dolphins_season', 'https://en.wikipedia.org/wiki/1998_Miami_Dolphins_season', 'https://en.wikipedia.org/wiki/1999_Miami_Dolphins_season', 'https://en.wikipedia.org/wiki/2000_Miami_Dolphins_season', 'https://en.wikipedia.org/wiki/2001_Miami_Dolphins_season', 'https://en.wikipedia.org/wiki/2008_Miami_Dolphins_season']" +717,"Who placed 2nd and 3rd against Katharina Molitor in her World Winning Championship, and what was the difference between the final gold and silver throws, and silver and bronze throws?",Katharina Molitor - 67.69m 1.56m difference Lü Huihui - 66.13m 0.34m difference Sunette Viljoen - 65.79m,https://en.wikipedia.org/wiki/Katharina_Molitor,https://en.wikipedia.org/wiki/2015_World_Championships_in_Athletics_%E2%80%93_Women%27s_javelin_throw,,,,,,,,,,Numerical reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/Katharina_Molitor', 'https://en.wikipedia.org/wiki/2015_World_Championships_in_Athletics_%E2%80%93_Women%27s_javelin_throw']" +718,How many years had passed since the Commonwealth of Pennsylvania was admitted to the Union by the time Rep. Robert D. Heaton was born?,85,https://en.wikipedia.org/wiki/Pennsylvania,https://en.wikipedia.org/wiki/Robert_D._Heaton,,,,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/Pennsylvania', 'https://en.wikipedia.org/wiki/Robert_D._Heaton']" +719,Who was the president of the United States when the resort housing the BomBora steel roller coaster first opened?,Grover Cleveland,https://en.wikipedia.org/wiki/BomBora_(Lagoon),https://en.wikipedia.org/wiki/Lagoon_(amusement_park),https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/BomBora_(Lagoon)', 'https://en.wikipedia.org/wiki/Lagoon_(amusement_park)', 'https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States']" +720,Which movie starring Meryl Streep was nominated for Best Picture at the Academy Awards the year that the Pioneer 11 visited Saturn?,Kramer vs. Kramer,https://en.wikipedia.org/wiki/Meryl_Streep,https://en.wikipedia.org/wiki/Kramer_vs._Kramer,https://en.wikipedia.org/wiki/Pioneer_11,https://en.wikipedia.org/wiki/The_Seduction_of_Joe_Tynan#Awards,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Meryl_Streep', 'https://en.wikipedia.org/wiki/Kramer_vs._Kramer', 'https://en.wikipedia.org/wiki/Pioneer_11', 'https://en.wikipedia.org/wiki/The_Seduction_of_Joe_Tynan#Awards']" +721,"In 1973, Edward Fox starred in The Day of the Jackal. He beat out a James Bond actor to the part, but what was the name of another James Bond actor he appeared with in an 'unofficial' 80s Bond film?",Sean Connery,https://en.wikipedia.org/wiki/The_Day_of_the_Jackal_(film),https://en.wikipedia.org/wiki/Edward_Fox_(actor),https://en.wikipedia.org/wiki/Roger_Moore,https://en.wikipedia.org/wiki/Never_Say_Never_Again,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/The_Day_of_the_Jackal_(film)', 'https://en.wikipedia.org/wiki/Edward_Fox_(actor)', 'https://en.wikipedia.org/wiki/Roger_Moore', 'https://en.wikipedia.org/wiki/Never_Say_Never_Again']" +722,"In September 1607, two Earls and their followers left Rathmullan for Rome. The event was first called a ""flight"" in a book published in 1868. What is the name of the book?","The Fate and Fortunes of Hugh O'Neill, Earl of Tyrone and Rory O'Donnel, Earl of Tyrconnel; their flight from Ireland, and death in exile (or ""Fate and Fortunes of the Earls of Tyrone and Tyrconnell"") by Charles Patrick Meehan",https://en.wikipedia.org/wiki/Rathmullan,https://en.wikipedia.org/wiki/Flight_of_the_Earls#Journey,https://en.wikipedia.org/wiki/Charles_Patrick_Meehan,,,,,,,,,Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Rathmullan', 'https://en.wikipedia.org/wiki/Flight_of_the_Earls#Journey', 'https://en.wikipedia.org/wiki/Charles_Patrick_Meehan']" +723,The flavored ice brand Slush Puppie is actually named after a food made from cornmeal-batter. That food was itself originally named after a fish native to South Carolina. What’s the genus name of the fish?,Moxostoma,https://en.wikipedia.org/wiki/Slush_Puppie,https://en.wikipedia.org/wiki/Hushpuppy,https://en.wikipedia.org/wiki/Moxostoma,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Slush_Puppie', 'https://en.wikipedia.org/wiki/Hushpuppy', 'https://en.wikipedia.org/wiki/Moxostoma']" +724,Gifts to children from Krampus are principally composed of what element?,Krampus may give children a wooden rute or coal. They are both mostly composed of carbon.,https://en.m.wikipedia.org/wiki/Krampus,https://en.m.wikipedia.org/wiki/Coal,https://en.m.wikipedia.org/wiki/Wood,,,,,,,,,Multiple constraints,"['https://en.m.wikipedia.org/wiki/Krampus', 'https://en.m.wikipedia.org/wiki/Coal', 'https://en.m.wikipedia.org/wiki/Wood']" +725,"As noted in the 2020 census, what is the population of the county in which Waterville, Maine, resides?","123,642","https://en.wikipedia.org/wiki/Waterville,_Maine","https://en.wikipedia.org/wiki/Kennebec_County,_Maine",,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Waterville,_Maine', 'https://en.wikipedia.org/wiki/Kennebec_County,_Maine']" +726,How many years apart were the Canadian Museum of History and the National Gallery of Canada established?,24 years,https://en.wikipedia.org/wiki/Canadian_Museum_of_History,https://en.wikipedia.org/wiki/National_Gallery_of_Canada,,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/Canadian_Museum_of_History', 'https://en.wikipedia.org/wiki/National_Gallery_of_Canada']" +727,"Who was the team manager for Lee Jae-won's only football season as of January 1, 2024?",Yasuyuki Kishino,"https://en.wikipedia.org/wiki/Lee_Jae-won_(footballer,_born_1992)",https://en.wikipedia.org/wiki/2015_J3_League,https://en.wikipedia.org/wiki/Kataller_Toyama,,,,,,,,,Tabular reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/Lee_Jae-won_(footballer,_born_1992)', 'https://en.wikipedia.org/wiki/2015_J3_League', 'https://en.wikipedia.org/wiki/Kataller_Toyama']" +728,What medal was won in 1979 by the famous physicist who attended the oldest college in London?,Albert Einstein medal,https://en.wikipedia.org/wiki/University_of_Oxford#Mathematics_and_sciences,https://en.wikipedia.org/wiki/Stephen_Hawking#1975%E2%80%931990,https://en.wikipedia.org/wiki/Albert_Einstein_Medal,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/University_of_Oxford#Mathematics_and_sciences', 'https://en.wikipedia.org/wiki/Stephen_Hawking#1975%E2%80%931990', 'https://en.wikipedia.org/wiki/Albert_Einstein_Medal']" +729,"What is the name of the ""pseudo label"" that collected the early collaborations of English architect Sir Peter Cook's son? ",Gamsonite,https://en.wikipedia.org/wiki/Peter_Cook_(architect),https://en.wikipedia.org/wiki/A._G._Cook,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Peter_Cook_(architect)', 'https://en.wikipedia.org/wiki/A._G._Cook']" +730,Who won the third-place playoff at the UEFA World Cup while Donald Trump was in office as the 45th President of the United States?,Belgium,https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States,https://en.wikipedia.org/wiki/FIFA_World_Cup,,,,,,,,,,Tabular reasoning,"['https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States', 'https://en.wikipedia.org/wiki/FIFA_World_Cup']" +731,How old were the winners of the Men's Pairs division at the 1988 World Indoor Bowls Championship?,35 and 53 years old.,https://en.wikipedia.org/wiki/1988_World_Indoor_Bowls_Championship,https://en.wikipedia.org/wiki/Ian_Schuback,https://en.wikipedia.org/wiki/Jim_Yates_(bowls),,,,,,,,,Numerical reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/1988_World_Indoor_Bowls_Championship', 'https://en.wikipedia.org/wiki/Ian_Schuback', 'https://en.wikipedia.org/wiki/Jim_Yates_(bowls)']" +732,In 1908 a fireboat operated by the Chicago Fire Department sank and was later refloated. When was the state that bears the same name as the fireboat founded?,1818,https://en.m.wikipedia.org/wiki/Illinois_(fireboat),https://en.m.wikipedia.org/wiki/Illinois,,,,,,,,,,Multiple constraints,"['https://en.m.wikipedia.org/wiki/Illinois_(fireboat)', 'https://en.m.wikipedia.org/wiki/Illinois']" +733,What were the top 5 Billboard songs by musical groups in the year 1985?,"1. ""Careless Whisper"" by Wham! 2. ""Wake Me Up Before You Go-Go"" by Wham! 3. ""I Want to Know What Love Is"" by Foreigner 4. ""Out of Touch"" by Hall & Oats. 5. ""Everybody Wants to Rule the World"" by Tears for Fears","https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_1985#:~:text=Article,11",https://en.wikipedia.org/wiki/Wham!,https://en.wikipedia.org/wiki/I_Want_to_Know_What_Love_Is,https://en.wikipedia.org/wiki/Out_of_Touch,https://en.wikipedia.org/wiki/Everybody_Wants_to_Rule_the_World,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_1985#:~:text=Article,11', 'https://en.wikipedia.org/wiki/Wham!', 'https://en.wikipedia.org/wiki/I_Want_to_Know_What_Love_Is', 'https://en.wikipedia.org/wiki/Out_of_Touch', 'https://en.wikipedia.org/wiki/Everybody_Wants_to_Rule_the_World']" +734,"Tell me the name of the place I am thinking of based on these clues: - I am a metropolitan borough in the West Midlands, UK - I am not a city - My boroughs name does not relate to a town",Sandwell.,https://en.wikipedia.org/wiki/West_Midlands_(county),https://en.wikipedia.org/wiki/Metropolitan_Borough_of_Dudley,https://en.wikipedia.org/wiki/Sandwell,https://en.wikipedia.org/wiki/Metropolitan_Borough_of_Solihull,https://en.wikipedia.org/wiki/Metropolitan_Borough_of_Walsall,https://en.wikipedia.org/wiki/Birmingham,https://en.wikipedia.org/wiki/Coventry,https://en.wikipedia.org/wiki/Wolverhampton,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/West_Midlands_(county)', 'https://en.wikipedia.org/wiki/Metropolitan_Borough_of_Dudley', 'https://en.wikipedia.org/wiki/Sandwell', 'https://en.wikipedia.org/wiki/Metropolitan_Borough_of_Solihull', 'https://en.wikipedia.org/wiki/Metropolitan_Borough_of_Walsall', 'https://en.wikipedia.org/wiki/Birmingham', 'https://en.wikipedia.org/wiki/Coventry', 'https://en.wikipedia.org/wiki/Wolverhampton']" +735,What was the name of the worker-owned cooperative in Spain that recently started working with the United Steelworkers in the late 2000s and was associated with a social activist priest?,Mondragon Corporation,https://en.wikipedia.org/wiki/Jos%C3%A9_Mar%C3%ADa_Arizmendiarrieta,https://en.wikipedia.org/wiki/Mondragon_Corporation,https://en.wikipedia.org/wiki/United_Steelworkers,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Jos%C3%A9_Mar%C3%ADa_Arizmendiarrieta', 'https://en.wikipedia.org/wiki/Mondragon_Corporation', 'https://en.wikipedia.org/wiki/United_Steelworkers']" +736,What was the former name of the brand of sneakers worn by the Heaven's Gate members who committed suicide?,Blue Ribbon Sports Inc.,https://en.wikipedia.org/wiki/Heaven%27s_Gate_(religious_group)#Nike_Decades,"https://en.wikipedia.org/wiki/Nike,_Inc.",,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Heaven%27s_Gate_(religious_group)#Nike_Decades', 'https://en.wikipedia.org/wiki/Nike,_Inc.']" +737,"During World War I, the French designed a new military decoration to recognize French and allied soldiers for their service. The sculptor who designed the medal also worked on two war monuments one year after WWI ended. What is the name of the monument that he began work on, but was completed by someone else?",Le Creusot War Memorial (monument aux morts),https://en.wikipedia.org/wiki/Croix_de_guerre_1914%E2%80%931918_(France)#Award_description,https://en.wikipedia.org/wiki/World_War_I,https://en.wikipedia.org/wiki/Albert_Bartholom%C3%A9#Main_works_(continued),,,,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Croix_de_guerre_1914%E2%80%931918_(France)#Award_description', 'https://en.wikipedia.org/wiki/World_War_I', 'https://en.wikipedia.org/wiki/Albert_Bartholom%C3%A9#Main_works_(continued)']" +738,Who wrote the first movie that Chris Columbus ever directed?,David Simkins.,https://en.wikipedia.org/wiki/Chris_Columbus_(filmmaker)#Filmography,https://en.wikipedia.org/wiki/Adventures_in_Babysitting,,,,,,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Chris_Columbus_(filmmaker)#Filmography', 'https://en.wikipedia.org/wiki/Adventures_in_Babysitting']" +739,What was the earliest known media use of the theme song used by the show The Last Leg?,2007,https://en.wikipedia.org/wiki/The_Last_Leg,https://en.wikipedia.org/wiki/Harder_Than_You_Think,https://en.wikipedia.org/wiki/Fully_Flared,,,,,,,,,Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/The_Last_Leg', 'https://en.wikipedia.org/wiki/Harder_Than_You_Think', 'https://en.wikipedia.org/wiki/Fully_Flared']" +740,Frank Lampard scored 5 league goals in his debut season at Chelsea. How many more league goals did Didier Drogba score during his debut season at Chelsea?,5 more goals,https://en.wikipedia.org/wiki/Frank_Lampard,https://en.wikipedia.org/wiki/Didier_Drogba,,,,,,,,,,Numerical reasoning | Tabular reasoning,"['https://en.wikipedia.org/wiki/Frank_Lampard', 'https://en.wikipedia.org/wiki/Didier_Drogba']" +741,"What do the inventor of the marine chronometer, the US president with the shortest tenure in history, and the president who was sworn in by Chief Justice Melville Fuller all have in common?","They all share the last name ""Harrison."" The scientist is John Harrison, the US president with the shortest tenure is William Henry Harrison, who is the grandfather of the US president sworn in by Chief Justice Melville Fuller, Benjamin Harrison. ",https://en.wikipedia.org/wiki/Marine_chronometer,https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States,https://en.wikipedia.org/wiki/Melville_Fuller,https://en.wikipedia.org/wiki/William_Henry_Harrison,,,,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Marine_chronometer', 'https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States', 'https://en.wikipedia.org/wiki/Melville_Fuller', 'https://en.wikipedia.org/wiki/William_Henry_Harrison']" +742,"Of the top five all-time scoring leaders of the National Basketball Association (NBA) and the Women's National Basketball Association (WNBA), which players were still playing professionally as of the 2024 season?","LeBron James, Diana Taurasi, Tina Charles, and DeWanna Bonner",https://en.wikipedia.org/wiki/List_of_NBA_career_scoring_leaders,https://en.wikipedia.org/wiki/List_of_WNBA_career_scoring_leaders,,,,,,,,,,Tabular reasoning,"['https://en.wikipedia.org/wiki/List_of_NBA_career_scoring_leaders', 'https://en.wikipedia.org/wiki/List_of_WNBA_career_scoring_leaders']" +743,Who was the Captain of the Toronto Maple Leafs when Morgan Rielly played his first game?,Dion Phaneuf,https://en.wikipedia.org/wiki/Morgan_Rielly,https://en.wikipedia.org/wiki/Toronto_Maple_Leafs#Season-by-season_record,,,,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/Morgan_Rielly', 'https://en.wikipedia.org/wiki/Toronto_Maple_Leafs#Season-by-season_record']" +744,What is the famous novel by the wife of the 19th-century poet who wrote about an Egyptian pharaoh who reigned sometime between 1290 and 1200 B.C.?,Frankenstein,https://en.wikipedia.org/wiki/List_of_pharaohs,https://en.wikipedia.org/wiki/Seti_I,https://en.wikipedia.org/wiki/Ramesses_II,https://en.wikipedia.org/wiki/Merneptah,https://en.wikipedia.org/wiki/Amenmesse,https://en.wikipedia.org/wiki/Percy_Bysshe_Shelley,,,,,,Tabular reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/List_of_pharaohs', 'https://en.wikipedia.org/wiki/Seti_I', 'https://en.wikipedia.org/wiki/Ramesses_II', 'https://en.wikipedia.org/wiki/Merneptah', 'https://en.wikipedia.org/wiki/Amenmesse', 'https://en.wikipedia.org/wiki/Percy_Bysshe_Shelley']" +745,How many Best Director winners from the Academy Awards in the 1990s were born before 1950?,"Three. Steven Spielberg, Jonathan Demme, Clint Eastwood.",https://en.wikipedia.org/wiki/Academy_Award_for_Best_Director,https://en.wikipedia.org/wiki/Sam_Mendes,https://en.wikipedia.org/wiki/Steven_Spielberg,https://en.wikipedia.org/wiki/James_Cameron,https://en.wikipedia.org/wiki/Anthony_Minghella,https://en.wikipedia.org/wiki/Robert_Zemeckis,https://en.wikipedia.org/wiki/Jonathan_Demme,https://en.wikipedia.org/wiki/Kevin_Costner,https://en.wikipedia.org/wiki/Clint_Eastwood,,,Numerical reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Academy_Award_for_Best_Director', 'https://en.wikipedia.org/wiki/Sam_Mendes', 'https://en.wikipedia.org/wiki/Steven_Spielberg', 'https://en.wikipedia.org/wiki/James_Cameron', 'https://en.wikipedia.org/wiki/Anthony_Minghella', 'https://en.wikipedia.org/wiki/Robert_Zemeckis', 'https://en.wikipedia.org/wiki/Jonathan_Demme', 'https://en.wikipedia.org/wiki/Kevin_Costner', 'https://en.wikipedia.org/wiki/Clint_Eastwood']" +746,"who won the formula one season in the year nine people were killed on the track at the argentine grand prix, and how old were they two years before sliced bread was first sold? ","Alberto Ascari, 8 years old",https://en.wikipedia.org/wiki/Argentine_Grand_Prix,https://en.wikipedia.org/wiki/1953_Formula_One_season,https://en.wikipedia.org/wiki/Alberto_Ascari,https://en.wikipedia.org/wiki/Sliced_bread,,,,,,,,Numerical reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Argentine_Grand_Prix', 'https://en.wikipedia.org/wiki/1953_Formula_One_season', 'https://en.wikipedia.org/wiki/Alberto_Ascari', 'https://en.wikipedia.org/wiki/Sliced_bread']" +747,"What is the burial place of the most successful racehorse in the Grand National's history, as of 2024?",The winning post at Aintree Racecourse,https://en.wikipedia.org/wiki/Grand_National ,https://en.wikipedia.org/wiki/Red_Rum,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Grand_National ', 'https://en.wikipedia.org/wiki/Red_Rum']" +748,"Who is the male cousin, whose name begins with an L, of the actor who played the murderer in the ITV series White House Farm, and how old was he when he stood in the 2021 London mayoral election?","Laurence Fox, 42",https://en.wikipedia.org/wiki/White_House_Farm_murders,https://en.wikipedia.org/wiki/Freddie_Fox_(actor),https://en.wikipedia.org/wiki/Laurence_Fox,https://en.wikipedia.org/wiki/2021_London_mayoral_election,,,,,,,,Numerical reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/White_House_Farm_murders', 'https://en.wikipedia.org/wiki/Freddie_Fox_(actor)', 'https://en.wikipedia.org/wiki/Laurence_Fox', 'https://en.wikipedia.org/wiki/2021_London_mayoral_election']" +749,"In the 1984 Olympics, what sport did the country that got 5 total medals win a gold medal in?",Sailing,https://en.wikipedia.org/wiki/1984_Summer_Olympics_medal_table,https://en.wikipedia.org/wiki/Spain_at_the_1984_Summer_Olympics,https://en.wikipedia.org/wiki/Sailing_at_the_1984_Summer_Olympics_%E2%80%93_470,,,,,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/1984_Summer_Olympics_medal_table', 'https://en.wikipedia.org/wiki/Spain_at_the_1984_Summer_Olympics', 'https://en.wikipedia.org/wiki/Sailing_at_the_1984_Summer_Olympics_%E2%80%93_470']" +750,"When Tom Hanks received his first Oscar, how many Grammys had Alan Menken won?",9,https://en.m.wikipedia.org/wiki/List_of_awards_and_nominations_received_by_Alan_Menken,https://en.m.wikipedia.org/wiki/List_of_awards_and_nominations_received_by_Tom_Hanks,https://en.m.wikipedia.org/wiki/36th_Annual_Grammy_Awards,https://en.m.wikipedia.org/wiki/66th_Academy_Awards,,,,,,,,Temporal reasoning,"['https://en.m.wikipedia.org/wiki/List_of_awards_and_nominations_received_by_Alan_Menken', 'https://en.m.wikipedia.org/wiki/List_of_awards_and_nominations_received_by_Tom_Hanks', 'https://en.m.wikipedia.org/wiki/36th_Annual_Grammy_Awards', 'https://en.m.wikipedia.org/wiki/66th_Academy_Awards']" +751,"As of August 3rd, 2024, How high is the ancient standing stone located next to the A92 road?",3.5 metres,https://en.wikipedia.org/wiki/A92_road,https://en.wikipedia.org/wiki/Stone_of_Morphie,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/A92_road', 'https://en.wikipedia.org/wiki/Stone_of_Morphie']" +752,"As of August 04, 2024, what is the exact age difference between Daniel Radcliff and his current partner in days?",1777 days,https://en.wikipedia.org/wiki/Daniel_Radcliffe,https://en.wikipedia.org/wiki/Erin_Darke,,,,,,,,,,Numerical reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Daniel_Radcliffe', 'https://en.wikipedia.org/wiki/Erin_Darke']" +753,How long after the incorporation of Ottawa was the designer of Ottawa's Justice Building born?,10 years.,https://en.wikipedia.org/wiki/Ottawa,https://en.wikipedia.org/wiki/Justice_Building,https://en.wikipedia.org/wiki/Thomas_W._Fuller,,,,,,,,,Numerical reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/Ottawa', 'https://en.wikipedia.org/wiki/Justice_Building', 'https://en.wikipedia.org/wiki/Thomas_W._Fuller']" +754,"How many times larger was the population of the city of Paris, 19 years after the year designated as The International Year for the Culture of Peace by the United Nations, than the population of Brown County, Kansas according to its 2020 census? Round the answer to the nearest whole number.",228 times larger,"https://en.wikipedia.org/wiki/Brown_County,_Kansas",https://en.wikipedia.org/wiki/Demographics_of_Paris,https://en.wikipedia.org/wiki/International_Year_for_the_Culture_of_Peace,,,,,,,,,Numerical reasoning | Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/Brown_County,_Kansas', 'https://en.wikipedia.org/wiki/Demographics_of_Paris', 'https://en.wikipedia.org/wiki/International_Year_for_the_Culture_of_Peace']" +755,"After Edward II, who was the next English monarch to have been born outside of England?",Richard II,https://en.wikipedia.org/wiki/Edward_II_of_England,https://en.wikipedia.org/wiki/Edward_III_of_England,https://en.wikipedia.org/wiki/Richard_II_of_England,,,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/Edward_II_of_England', 'https://en.wikipedia.org/wiki/Edward_III_of_England', 'https://en.wikipedia.org/wiki/Richard_II_of_England']" +756,Who was the president of the USA when the wife of the former HEB Grocery Company CEO Howard Edward Butt Sr. died?,"Bill Clinton was president beginning in 1993, the same year that Mary Elizabeth Butt passed away.",https://en.wikipedia.org/wiki/Mary_Elizabeth_Butt,https://en.wikipedia.org/wiki/Bill_Clinton,,,,,,,,,,Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Mary_Elizabeth_Butt', 'https://en.wikipedia.org/wiki/Bill_Clinton']" +757,"What was the population of the province which the town Robat-e Morad is located, in 2016?","1,429,475",https://en.wikipedia.org/wiki/Robat-e_Morad,https://en.wikipedia.org/wiki/Markazi_province,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Robat-e_Morad', 'https://en.wikipedia.org/wiki/Markazi_province']" +758,"Who was the Mayor of Quincy, Massachusetts when the Quincy Medical Center opened and who was the Mayor when it closed?",Henry O. Fairbanks and Thomas P. Koch,https://en.wikipedia.org/wiki/Quincy_Medical_Center,"https://en.wikipedia.org/wiki/List_of_mayors_of_Quincy,_Massachusetts",,,,,,,,,,Numerical reasoning | Tabular reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/Quincy_Medical_Center', 'https://en.wikipedia.org/wiki/List_of_mayors_of_Quincy,_Massachusetts']" +759,I can't recall who I'm trying to think of. This person was the partner of someone in the 1984 Olympic keelboat competition. Their partner's sister was the only American who placed in the 2003 Pan American Games for sailing in the men's or women's competition (not including the open events).,Richard Coxon.,https://en.wikipedia.org/wiki/Sailing_at_the_2003_Pan_American_Games,https://en.wikipedia.org/wiki/Lanee_Butler,https://en.wikipedia.org/wiki/Colin_Beashel,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Sailing_at_the_2003_Pan_American_Games', 'https://en.wikipedia.org/wiki/Lanee_Butler', 'https://en.wikipedia.org/wiki/Colin_Beashel']" +760,There's a famous children's book about a King Elephant. At what age did the French author die?,37 years old.,https://en.wikipedia.org/wiki/Babar_the_Elephant,https://en.wikipedia.org/wiki/Jean_de_Brunhoff,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Babar_the_Elephant', 'https://en.wikipedia.org/wiki/Jean_de_Brunhoff']" +761,How many ballets did Frederick Ashton choreograph by the time he was 25 years old?,8,https://en.wikipedia.org/wiki/Frederick_Ashton,https://en.wikipedia.org/wiki/List_of_ballets_choreographed_by_Frederick_Ashton,,,,,,,,,,Numerical reasoning | Tabular reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/Frederick_Ashton', 'https://en.wikipedia.org/wiki/List_of_ballets_choreographed_by_Frederick_Ashton']" +762,"In the season before Jamal Murray won the NBA Championship, who was the fourth overall draft pick?",Scottie Barnes,https://en.wikipedia.org/wiki/Jamal_Murray,https://en.wikipedia.org/wiki/2021_NBA_draft,,,,,,,,,,Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Jamal_Murray', 'https://en.wikipedia.org/wiki/2021_NBA_draft']" +763,What band had a male lead singer born on the day Mohammad Mosaddegh announced the dissolution of the Iranian parliament?,Kool & the Gang,https://en.wikipedia.org/wiki/1953_Iranian_parliamentary_dissolution_referendum,https://en.wikipedia.org/wiki/August_16#1901%E2%80%93present_2,https://en.wikipedia.org/wiki/James_%22J.T.%22_Taylor,,,,,,,,,Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/1953_Iranian_parliamentary_dissolution_referendum', 'https://en.wikipedia.org/wiki/August_16#1901%E2%80%93present_2', 'https://en.wikipedia.org/wiki/James_%22J.T.%22_Taylor']" +764,Who won the French Open Men’s Singles tournament the year that New York City FC won their first MLS Cup title?,Novak Djokovic,https://en.wikipedia.org/wiki/List_of_MLS_Cup_finals#Results_by_team,https://en.wikipedia.org/wiki/2021_French_Open_%E2%80%93_Men%2527s_singles,,,,,,,,,,Tabular reasoning,"['https://en.wikipedia.org/wiki/List_of_MLS_Cup_finals#Results_by_team', 'https://en.wikipedia.org/wiki/2021_French_Open_%E2%80%93_Men%2527s_singles']" +765,In what year did a great fire destroy over 100 buildings in the North American city which hosted the 2010 G20 summit?,1904 (Great Toronto Fire of 1904),https://en.wikipedia.org/wiki/2010_G20_Toronto_summit,https://en.wikipedia.org/wiki/Toronto,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/2010_G20_Toronto_summit', 'https://en.wikipedia.org/wiki/Toronto']" +766,What was the difference in population in the most populous town in the Isère department from 1946 to 1975?,63876,https://en.wikipedia.org/wiki/Is%C3%A8re#Principal_towns,https://en.wikipedia.org/wiki/Grenoble#Population,,,,,,,,,,Numerical reasoning | Tabular reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/Is%C3%A8re#Principal_towns', 'https://en.wikipedia.org/wiki/Grenoble#Population']" +767,What team scored the most points in an NBA finals game while Ronald Reagan was president of the United States of America?,Game 1 of the Finals in 1985 the Boston Celtics scored 148 points.,https://en.wikipedia.org/wiki/Ronald_Reagan,https://en.wikipedia.org/wiki/1981_NBA_Finals,https://en.wikipedia.org/wiki/1982_NBA_Finals,https://en.wikipedia.org/wiki/1983_NBA_Finals,https://en.wikipedia.org/wiki/1984_NBA_Finals,https://en.wikipedia.org/wiki/1985_NBA_Finals,https://en.wikipedia.org/wiki/1986_NBA_Finals,https://en.wikipedia.org/wiki/1987_NBA_Finals,https://en.wikipedia.org/wiki/1988_NBA_Finals,https://en.wikipedia.org/wiki/1989_NBA_Finals,,Tabular reasoning,"['https://en.wikipedia.org/wiki/Ronald_Reagan', 'https://en.wikipedia.org/wiki/1981_NBA_Finals', 'https://en.wikipedia.org/wiki/1982_NBA_Finals', 'https://en.wikipedia.org/wiki/1983_NBA_Finals', 'https://en.wikipedia.org/wiki/1984_NBA_Finals', 'https://en.wikipedia.org/wiki/1985_NBA_Finals', 'https://en.wikipedia.org/wiki/1986_NBA_Finals', 'https://en.wikipedia.org/wiki/1987_NBA_Finals', 'https://en.wikipedia.org/wiki/1988_NBA_Finals', 'https://en.wikipedia.org/wiki/1989_NBA_Finals']" +768,The founder of the eponymous music school at the University of Rochester held the patent for an item that later earned him a star on the Hollywood Walk of Fame. How many years passed between his initial patent and the placement of his star?,75,https://en.wikipedia.org/wiki/Eastman_School_of_Music,https://en.wikipedia.org/wiki/George_Eastman,https://en.wikipedia.org/wiki/List_of_stars_on_the_Hollywood_Walk_of_Fame,,,,,,,,,Numerical reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Eastman_School_of_Music', 'https://en.wikipedia.org/wiki/George_Eastman', 'https://en.wikipedia.org/wiki/List_of_stars_on_the_Hollywood_Walk_of_Fame']" +769,What teammates were inducted into the college football hall of fame who played the same year as the first Native American to get a gold medal in the Olympics for the United States?," Gus Welch, and William ""Lone Star"" Dietz.",https://en.wikipedia.org/wiki/Jim_Thorpe,https://en.wikipedia.org/wiki/1911_Carlisle_Indians_football_team,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Jim_Thorpe', 'https://en.wikipedia.org/wiki/1911_Carlisle_Indians_football_team']" +770,"What is the the origin of the mascot of the school district that Donahoe, Texas is located within?",England,"https://en.wikipedia.org/wiki/Donahoe,_Texas",https://en.wikipedia.org/wiki/Bartlett_Independent_School_District,https://en.wikipedia.org/wiki/Bulldog,https://en.wikipedia.org/wiki/England,,,,,,,,Multiple constraints | Post processing,"['https://en.wikipedia.org/wiki/Donahoe,_Texas', 'https://en.wikipedia.org/wiki/Bartlett_Independent_School_District', 'https://en.wikipedia.org/wiki/Bulldog', 'https://en.wikipedia.org/wiki/England']" +771,How many Mount Katahdins makes up the height of Mount Kilimanjaro?,"Mount Kilimanjaro stands at a height of 5,895 meters whereas Mount Katahdn stands at a height of 1,606 meters. Therefore, Mount Kilimanjaro is the height of approximately 3.7 Mount Katahdins.",https://en.wikipedia.org/wiki/Mount_Kilimanjaro,https://en.wikipedia.org/wiki/Mount_Katahdin,,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/Mount_Kilimanjaro', 'https://en.wikipedia.org/wiki/Mount_Katahdin']" +772,What's the name of Goku's iconic transformation and what episode number does it first appear in the anime?,Goku becomes a Super Saiyan in episode 95 of Dragon Ball Z.,https://en.wikipedia.org/wiki/Goku,https://en.wikipedia.org/wiki/Dragon_Ball_Z,https://en.wikipedia.org/wiki/List_of_Dragon_Ball_Z_episodes,https://en.wikipedia.org/wiki/Dragon_Ball_Z_season_3,,,,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Goku', 'https://en.wikipedia.org/wiki/Dragon_Ball_Z', 'https://en.wikipedia.org/wiki/List_of_Dragon_Ball_Z_episodes', 'https://en.wikipedia.org/wiki/Dragon_Ball_Z_season_3']" +773,When was the last team that Miloš Beleslin played for dissolved?,1945,https://en.wikipedia.org/wiki/Milo%C5%A1_Beleslin,https://en.wikipedia.org/wiki/%C5%BDAK_Subotica,,,,,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Milo%C5%A1_Beleslin', 'https://en.wikipedia.org/wiki/%C5%BDAK_Subotica']" +774,"Of the six main/principal cast of The Simpsons, who was born first?",Harry Shearer,https://en.wikipedia.org/wiki/The_Simpsons,https://en.wikipedia.org/wiki/List_of_The_Simpsons_cast_members,https://en.wikipedia.org/wiki/Dan_Castellaneta,https://en.wikipedia.org/wiki/Julie_Kavner,https://en.wikipedia.org/wiki/Nancy_Cartwright,https://en.wikipedia.org/wiki/Yeardley_Smith,https://en.wikipedia.org/wiki/Hank_Azaria,https://en.wikipedia.org/wiki/Harry_Shearer,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/The_Simpsons', 'https://en.wikipedia.org/wiki/List_of_The_Simpsons_cast_members', 'https://en.wikipedia.org/wiki/Dan_Castellaneta', 'https://en.wikipedia.org/wiki/Julie_Kavner', 'https://en.wikipedia.org/wiki/Nancy_Cartwright', 'https://en.wikipedia.org/wiki/Yeardley_Smith', 'https://en.wikipedia.org/wiki/Hank_Azaria', 'https://en.wikipedia.org/wiki/Harry_Shearer']" +775,"How old was the Commander-in-Chief of India from 1865-1870 when he died? How old was his wife when she died? Average these two numbers, rounding up to the nearest whole integer if necessary.","William Mansfield, 1st Baron Sandhurst was 57 at the time of his death. His wife, Margaret Mansfield, Baroness Sandhurst was aged 64 at the time of her death. Their average lifespan was 61 years old.","https://en.wikipedia.org/wiki/William_Mansfield,_1st_Baron_Sandhurst","https://en.wikipedia.org/wiki/Margaret_Mansfield,_Baroness_Sandhurst",,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/William_Mansfield,_1st_Baron_Sandhurst', 'https://en.wikipedia.org/wiki/Margaret_Mansfield,_Baroness_Sandhurst']" +776,"One of Jane Auten's novels was released in 1813, which was later adapted into a film in 2005. When was the director of that film born?","Joe Wright was born on August 25th, 1972.",https://en.wikipedia.org/wiki/Jane_Austen#List_of_works,"https://en.wikipedia.org/wiki/Pride_and_Prejudice#Film,_television_and_theatre",https://en.wikipedia.org/wiki/Pride_%26_Prejudice_(2005_film),https://en.wikipedia.org/wiki/Joe_Wright,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Jane_Austen#List_of_works', 'https://en.wikipedia.org/wiki/Pride_and_Prejudice#Film,_television_and_theatre', 'https://en.wikipedia.org/wiki/Pride_%26_Prejudice_(2005_film)', 'https://en.wikipedia.org/wiki/Joe_Wright']" +777,Which leap year happened exactly halfway between the birth of Shoghi Effendi and the founding of the college he attended?,1580,https://en.wikipedia.org/wiki/Shoghi_Effendi,"https://en.wikipedia.org/wiki/Balliol_College,_Oxford",https://en.wikipedia.org/wiki/1580,,,,,,,,,Numerical reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Shoghi_Effendi', 'https://en.wikipedia.org/wiki/Balliol_College,_Oxford', 'https://en.wikipedia.org/wiki/1580']" +778,"What was the original name of the band the male lead of ""The Song"" founded?",Anthem Lights was originally known as Yellow Cavalier.,https://en.wikipedia.org/wiki/The_Song_(2014_film),https://en.wikipedia.org/wiki/Anthem_Lights,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/The_Song_(2014_film)', 'https://en.wikipedia.org/wiki/Anthem_Lights']" +779,How old was the future 34th president 5 years after the founding of the National Football League?,35,https://en.wikipedia.org/wiki/National_Football_League,https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States,https://en.wikipedia.org/wiki/Dwight_D._Eisenhower,,,,,,,,,Numerical reasoning | Multiple constraints | Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/National_Football_League', 'https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States', 'https://en.wikipedia.org/wiki/Dwight_D._Eisenhower']" +780,"Of the three producers of the tv sitcom Friends, which two had ties to Philadelphia? ",David Crane and Marta Kauffman,https://en.wikipedia.org/wiki/Friends,https://en.wikipedia.org/wiki/Kevin_S._Bright,https://en.wikipedia.org/wiki/Marta_Kauffman,https://en.wikipedia.org/wiki/David_Crane_(producer),,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Friends', 'https://en.wikipedia.org/wiki/Kevin_S._Bright', 'https://en.wikipedia.org/wiki/Marta_Kauffman', 'https://en.wikipedia.org/wiki/David_Crane_(producer)']" +781,"What movie won the Teen Choice Award for ""Choice Movie Liplock"" the same year George W. Bush gave his ""Mission Accomplished"" speech?",Sweet Home Alabama,https://en.wikipedia.org/wiki/Mission_Accomplished_speech,https://en.wikipedia.org/wiki/2003_Teen_Choice_Awards,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Mission_Accomplished_speech', 'https://en.wikipedia.org/wiki/2003_Teen_Choice_Awards']" +782,"Using only the Winter and Summer Olympic events that occurred in the same country in the same year, which year had the most total combined women competing in the olympics?",1936,https://en.wikipedia.org/wiki/Winter_Olympic_Games,https://en.wikipedia.org/wiki/Summer_Olympic_Games,https://en.wikipedia.org/wiki/1932_Winter_Olympics,https://en.wikipedia.org/wiki/1932_Summer_Olympics,https://en.wikipedia.org/wiki/1936_Winter_Olympics,https://en.wikipedia.org/wiki/1936_Summer_Olympics,,,,,,Numerical reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Winter_Olympic_Games', 'https://en.wikipedia.org/wiki/Summer_Olympic_Games', 'https://en.wikipedia.org/wiki/1932_Winter_Olympics', 'https://en.wikipedia.org/wiki/1932_Summer_Olympics', 'https://en.wikipedia.org/wiki/1936_Winter_Olympics', 'https://en.wikipedia.org/wiki/1936_Summer_Olympics']" +783,"Of all states within New England in the United States, which had a population between 400,000 and 800,000 in 1920?","Of the 6 states within New England, Maine, New Hampshire, and Rhode Island had populations between 400,000 and 800,000 in 1920.",https://en.wikipedia.org/wiki/New_England,https://en.wikipedia.org/wiki/Connecticut,https://en.wikipedia.org/wiki/Maine,https://en.wikipedia.org/wiki/Massachusetts,https://en.wikipedia.org/wiki/New_Hampshire,https://en.wikipedia.org/wiki/Rhode_Island,https://en.wikipedia.org/wiki/Vermont,,,,,Numerical reasoning | Tabular reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/New_England', 'https://en.wikipedia.org/wiki/Connecticut', 'https://en.wikipedia.org/wiki/Maine', 'https://en.wikipedia.org/wiki/Massachusetts', 'https://en.wikipedia.org/wiki/New_Hampshire', 'https://en.wikipedia.org/wiki/Rhode_Island', 'https://en.wikipedia.org/wiki/Vermont']" +784,"In the three summer Olympics held in Europe between 1984 and 2020, how many more times did Australia place above South Korea in the final medal rankings?",Once.,https://en.wikipedia.org/wiki/Australia_at_the_Olympics,https://en.wikipedia.org/wiki/South_Korea_at_the_Olympics,,,,,,,,,,Numerical reasoning | Tabular reasoning,"['https://en.wikipedia.org/wiki/Australia_at_the_Olympics', 'https://en.wikipedia.org/wiki/South_Korea_at_the_Olympics']" +785,What was the difference in receiving yards from 2022 and 2023 for number 11 on the 49ers? Calculate the same 2022/2023 difference for the closest ranked WR for number 11's draft class and compare their numbers.,The yard difference for Brandon Aiyuk is 327. The yard difference for Justin Jefferson is -735. Brandon improved more in 2023 but Justin's 2022 numbers were astronomical.,https://en.wikipedia.org/wiki/Brandon_Aiyuk,https://en.wikipedia.org/wiki/2020_NFL_draft,https://en.wikipedia.org/wiki/Justin_Jefferson,,,,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints | Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/Brandon_Aiyuk', 'https://en.wikipedia.org/wiki/2020_NFL_draft', 'https://en.wikipedia.org/wiki/Justin_Jefferson']" +786,Which element of the periodic table is a homonym of a synonym of a tool commonly used in dog walking?,Lead,https://en.wikipedia.org/wiki/Periodic_table,https://en.wikipedia.org/wiki/Dog_walking,https://en.wikipedia.org/wiki/Leash,,,,,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Periodic_table', 'https://en.wikipedia.org/wiki/Dog_walking', 'https://en.wikipedia.org/wiki/Leash']" +787,Which German Fairytale did one of the directors of Trolls voice act as a character in a franchise Movie?,Walt Dohrn voice acted Rumpelstiltskin,https://en.wikipedia.org/wiki/Trolls_(film),https://en.wikipedia.org/wiki/Mike_Mitchell_(director),https://en.wikipedia.org/wiki/Walt_Dohrn,,,,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints | Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/Trolls_(film)', 'https://en.wikipedia.org/wiki/Mike_Mitchell_(director)', 'https://en.wikipedia.org/wiki/Walt_Dohrn']" +788,A Japanese aircraft carrier that was first built to become a luxury ocean liner was sunk by the U.S. torpedos in a battle four months prior to the Battle of Leyte Gulf in World War II. What is the name of the aircraft carrier and what battle did it sink in?,The aircraft carrier Hiyō was sunk in the Battle of the Philippine Sea.,https://en.wikipedia.org/wiki/Japanese_aircraft_carrier_Hiy%C5%8D,https://en.wikipedia.org/wiki/Battle_of_the_Philippine_Sea,,,,,,,,,,Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Japanese_aircraft_carrier_Hiy%C5%8D', 'https://en.wikipedia.org/wiki/Battle_of_the_Philippine_Sea']" +789,"The creator of the animated series Family Guy was supposed to be on one of the planes that was involved in the 9/11 attacks but he arrived too late to board. Altogether, how many letters are in the name of the city from which his missed fight departed that day?",6 (Boston),https://en.wikipedia.org/wiki/Family_Guy,https://en.wikipedia.org/wiki/Seth_MacFarlane,https://en.wikipedia.org/wiki/American_Airlines_Flight_11,,,,,,,,,Multiple constraints | Post processing,"['https://en.wikipedia.org/wiki/Family_Guy', 'https://en.wikipedia.org/wiki/Seth_MacFarlane', 'https://en.wikipedia.org/wiki/American_Airlines_Flight_11']" +790,"On the same day Roald Dahl first published a work, a famous guitar player and singer was born. This person was once ranked 13th in Rolling Stone's ""100 Greatest Guitarists of All Time"" cover story. In the same year that this person was ranked 13th, who was ranked number 2 in Rolling Stone magazine's list of the 100 greatest guitarists of all time?",Duane Allman,https://en.wikipedia.org/wiki/Roald_Dahl,https://en.wikipedia.org/wiki/August_1942,https://en.wikipedia.org/wiki/Jerry_Garcia,https://en.wikipedia.org/wiki/Duane_Allman,,,,,,,,Numerical reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Roald_Dahl', 'https://en.wikipedia.org/wiki/August_1942', 'https://en.wikipedia.org/wiki/Jerry_Garcia', 'https://en.wikipedia.org/wiki/Duane_Allman']" +791,"I lost a final against Greg Rusedski on grass, who won his 4th career singles title on grass against a player that once defeated me in the opening round of the US Open. I achieved my highest ranking in singles on what date?",10 July 2000,https://en.wikipedia.org/wiki/Greg_Rusedski,https://en.wikipedia.org/wiki/Karol_Kučera,https://en.wikipedia.org/wiki/2003_US_Open_%E2%80%93_Men%27s_singles,https://en.wikipedia.org/wiki/Alexander_Popp,,,,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Greg_Rusedski', 'https://en.wikipedia.org/wiki/Karol_Kučera', 'https://en.wikipedia.org/wiki/2003_US_Open_%E2%80%93_Men%27s_singles', 'https://en.wikipedia.org/wiki/Alexander_Popp']" +792,"As of 1st August 2024 Queens of the Stone Age, Them Crooked Vultures, Screaming Trees and Kyuss have one band member in common. What does his last name mean in English?",Man,https://en.wikipedia.org/wiki/Queens_of_the_Stone_Age,https://en.wikipedia.org/wiki/Them_Crooked_Vultures,https://en.wikipedia.org/wiki/Kyuss,https://en.wikipedia.org/wiki/Homme,https://en.wikipedia.org/wiki/Screaming_Trees,,,,,,,Multiple constraints | Post processing,"['https://en.wikipedia.org/wiki/Queens_of_the_Stone_Age', 'https://en.wikipedia.org/wiki/Them_Crooked_Vultures', 'https://en.wikipedia.org/wiki/Kyuss', 'https://en.wikipedia.org/wiki/Homme', 'https://en.wikipedia.org/wiki/Screaming_Trees']" +793,Joel Oshiro Dyck played professional ice hockey for three teams. Which of those teams were dissolved for financial difficulties after the 2018-2019 season?,"Joel Oshiro Dyck played for the Chatham Wheels, the Wheeling Thunderbirds, and the Nippon Paper Cranes. After the 2018-2019 ice hockey season, Nippon Paper Cranes were dissolved and replaced by the East Hokkaido Cranes.",https://en.wikipedia.org/wiki/Joel_Dyck,https://en.wikipedia.org/wiki/Nippon_Paper_Cranes,,,,,,,,,,Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Joel_Dyck', 'https://en.wikipedia.org/wiki/Nippon_Paper_Cranes']" +794,"There is a statue, Avukana Buddha, that overlooks a reservoir in Sri Lanka. The reservoir was built by a king in 460 A.D. What is the name of the king's uncle who raised him?",Mahanama,https://en.wikipedia.org/wiki/Avukana_Buddha_statue#Location_and_appearance,https://en.wikipedia.org/wiki/Kala_Wewa#History,https://en.wikipedia.org/wiki/Dhatusena_of_Anuradhapura#Early_life_and_becoming_king,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Avukana_Buddha_statue#Location_and_appearance', 'https://en.wikipedia.org/wiki/Kala_Wewa#History', 'https://en.wikipedia.org/wiki/Dhatusena_of_Anuradhapura#Early_life_and_becoming_king']" +795,"This Canadian mountain, located in Cypress Provincial Park, got its name because one didn't have to cross it to reach The Lions (formerly Ch'ich'iyúy Elxwíkn'). What is the name of this mountain? ",Unnecessary Mountain,https://en.wikipedia.org/wiki/Cypress_Provincial_Park,https://en.wikipedia.org/wiki/The_Two_Sisters_(British_Columbia),https://en.wikipedia.org/wiki/Unnecessary_Mountain,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Cypress_Provincial_Park', 'https://en.wikipedia.org/wiki/The_Two_Sisters_(British_Columbia)', 'https://en.wikipedia.org/wiki/Unnecessary_Mountain']" +796,Who won the FIFA World Cup in the year the Falklands War broke out?,Italy,https://en.wikipedia.org/wiki/Falklands_War,https://en.wikipedia.org/wiki/1982_FIFA_World_Cup,,,,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/Falklands_War', 'https://en.wikipedia.org/wiki/1982_FIFA_World_Cup']" +797,"John F. Kennedy was inaugurated into the presidency quite some time ago. Which song was number 1 on the Billboard Hot 100 chart on that same day, but 30 years after his inauguration?","Kennedy was inaugurated on January 20, 1961 In 1991, the number 1 song on the Billboard Hot 100 was ""Love Will Never Do (Without You)"" by Janet Jackson.",https://en.wikipedia.org/wiki/Inauguration_of_John_F._Kennedy,https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_number_ones_of_1991,,,,,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Inauguration_of_John_F._Kennedy', 'https://en.wikipedia.org/wiki/List_of_Billboard_Hot_100_number_ones_of_1991']" +798,What genus does the animal that features in the upper left of the coat of arms of the area that the family of Jürgen Warnke moved to in 1945 belong to?,Panthera,https://en.wikipedia.org/wiki/J%C3%BCrgen_Warnke,https://en.wikipedia.org/wiki/Upper_Franconia#Coat_of_arms,https://en.wikipedia.org/wiki/Lion,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/J%C3%BCrgen_Warnke', 'https://en.wikipedia.org/wiki/Upper_Franconia#Coat_of_arms', 'https://en.wikipedia.org/wiki/Lion']" +799,"How many minutes longer is the men's marathon record time (as of June 2024) than the duration of the shortest war in history? If multiple conflicting durations are given for the war, use the shortest one. Round the answer to the nearest whole minute.",83,https://en.wikipedia.org/wiki/Marathon_world_record_progression,https://en.wikipedia.org/wiki/Anglo-Zanzibar_War,,,,,,,,,,Numerical reasoning | Post processing,"['https://en.wikipedia.org/wiki/Marathon_world_record_progression', 'https://en.wikipedia.org/wiki/Anglo-Zanzibar_War']" +800,"Which cities hosted the Olympics in 1988, and where were the opening ceremonies held in each city?","Calgary- Winter Olympics, opening ceremony held at McMahon Stadium. Seoul- Summer Olympics, opening ceremony held at Seoul Olympic Stadium.",https://en.wikipedia.org/wiki/List_of_Olympic_Games_host_cities,https://en.wikipedia.org/wiki/1988_Summer_Olympics,https://en.wikipedia.org/wiki/1988_Winter_Olympics,,,,,,,,,Tabular reasoning,"['https://en.wikipedia.org/wiki/List_of_Olympic_Games_host_cities', 'https://en.wikipedia.org/wiki/1988_Summer_Olympics', 'https://en.wikipedia.org/wiki/1988_Winter_Olympics']" +801,"Which actor in the movie Nadja has a Golden Palm Star on the Walk of Stars in Palm Springs, California?",Peter Fonda,https://en.wikipedia.org/wiki/Nadja_(film),https://en.wikipedia.org/wiki/Palm_Springs_Walk_of_Stars,https://en.wikipedia.org/wiki/Peter_Fonda,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Nadja_(film)', 'https://en.wikipedia.org/wiki/Palm_Springs_Walk_of_Stars', 'https://en.wikipedia.org/wiki/Peter_Fonda']" +802,"The artist, MadLib, released Mind Fusion Vol. 1 as a collaboration with several other artists. The track, ""I Got A Right Ta (Madlib Remix)"" features an artist other than MadLib. This artist received a Bachelor's of Science degree from a university in Florida. How many years after the establishment of this university was the album Mind Fusion Vol. 1 released?",117,https://en.wikipedia.org/wiki/Mind_Fusion,https://en.wikipedia.org/wiki/Common_(rapper),https://en.wikipedia.org/wiki/Florida_A%26M_University,,,,,,,,,Numerical reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Mind_Fusion', 'https://en.wikipedia.org/wiki/Common_(rapper)', 'https://en.wikipedia.org/wiki/Florida_A%26M_University']" +803,"During the month that GEMA Global Engine Alliance LLC was founded as a joint venture of Chrysler, Mitsubishi Motors, and Hyundai Motor Company, which international arms treaty was signed, who signed it, where, and on what date?","On May 24th, 2002, the Strategic Offensive Reductions Treaty was signed in Moscow by Vladimir Putin and George W. Bush.",https://en.wikipedia.org/wiki/Global_Engine_Alliance,https://en.wikipedia.org/wiki/Strategic_Offensive_Reductions_Treaty,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Global_Engine_Alliance', 'https://en.wikipedia.org/wiki/Strategic_Offensive_Reductions_Treaty']" +804,Which Pope served the longest between the Battle of the Milvian Bridge and the end of the Civil Wars of the Tetrarchy?,"St. Sylvester I, whose Latin name was Silvester",https://en.wikipedia.org/wiki/Battle_of_the_Milvian_Bridge,https://en.wikipedia.org/wiki/Civil_wars_of_the_Tetrarchy,https://en.wikipedia.org/wiki/List_of_popes,,,,,,,,,Numerical reasoning | Multiple constraints | Post processing | Temporal reasoning,"['https://en.wikipedia.org/wiki/Battle_of_the_Milvian_Bridge', 'https://en.wikipedia.org/wiki/Civil_wars_of_the_Tetrarchy', 'https://en.wikipedia.org/wiki/List_of_popes']" +805,Who won the season of the dance show that Tate McRae placed third in back in 2016?,"Leon ""Kida"" Burns",https://en.wikipedia.org/wiki/Tate_McRae,https://en.wikipedia.org/wiki/So_You_Think_You_Can_Dance:_The_Next_Generation_(American_TV_series),,,,,,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/Tate_McRae', 'https://en.wikipedia.org/wiki/So_You_Think_You_Can_Dance:_The_Next_Generation_(American_TV_series)']" +806,"What is greater: the combined 2011 populations of Rennington (Northumberland), Lydbrook (Gloucestershire), Stow-on-the-Wold (Gloucestershire) and Witney (Oxfordshire), or the 2022 population of London?",The 2022 population of London,https://en.wikipedia.org/wiki/Rennington,https://en.wikipedia.org/wiki/Lydbrook,https://en.wikipedia.org/wiki/Stow-on-the-Wold,https://en.wikipedia.org/wiki/Witney,https://en.wikipedia.org/wiki/London,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/Rennington', 'https://en.wikipedia.org/wiki/Lydbrook', 'https://en.wikipedia.org/wiki/Stow-on-the-Wold', 'https://en.wikipedia.org/wiki/Witney', 'https://en.wikipedia.org/wiki/London']" +807,How many years old was The Real Housewives of New York City franchise when Jenna Lyons premiered on the show?,15 years old,https://en.wikipedia.org/wiki/Jenna_Lyons,https://en.wikipedia.org/wiki/The_Real_Housewives_of_New_York_City,,,,,,,,,,Numerical reasoning,"['https://en.wikipedia.org/wiki/Jenna_Lyons', 'https://en.wikipedia.org/wiki/The_Real_Housewives_of_New_York_City']" +808,"Two famous modernist writers were born and died on the same year. Who were they, which of them was alive for the longest, and by how many days?",Virginia Woolf and James Joyce. Virginia Woolf lived 82 days longer.,https://en.wikipedia.org/wiki/List_of_modernist_writers,https://en.wikipedia.org/wiki/James_Joyce,https://en.wikipedia.org/wiki/Virginia_Woolf,,,,,,,,,Numerical reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/List_of_modernist_writers', 'https://en.wikipedia.org/wiki/James_Joyce', 'https://en.wikipedia.org/wiki/Virginia_Woolf']" +809,Which governor of Shizuoka resigned due to the delayed opening of the airport?,Yoshinobu Ishikawa,https://en.wikipedia.org/wiki/Shizuoka_Prefecture,https://en.wikipedia.org/wiki/Yoshinobu_Ishikawa,,,,,,,,,,Tabular reasoning | Post processing,"['https://en.wikipedia.org/wiki/Shizuoka_Prefecture', 'https://en.wikipedia.org/wiki/Yoshinobu_Ishikawa']" +810,"According to topographical summit prominence, how many years were there between the first ascent of the United State's second most prominent mountain and the first ascent of Russia's second most prominent mountain? ",35 years.,https://en.wikipedia.org/wiki/List_of_mountain_peaks_by_prominence,https://en.wikipedia.org/wiki/Mauna_Kea,https://en.wikipedia.org/wiki/Klyuchevskaya_Sopka,,,,,,,,,Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/List_of_mountain_peaks_by_prominence', 'https://en.wikipedia.org/wiki/Mauna_Kea', 'https://en.wikipedia.org/wiki/Klyuchevskaya_Sopka']" +811,What is the difference between the number of years served in the seventh-ratified US state's House of Delegates between that state's senator elected in 2007 and his uncle?,"The seventh-ratified US state is Maryland. The senator of Maryland elected in 2007 is Ben Cardin. Ben Cardin served 20 years (1967 to 1987) and his uncle, Maurice Cardin, served 15 years (1951 to 1966) in the Maryland House of Delegates. 20 - 15 = 5 years difference, with Ben serving 5 more years.",https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States,https://en.wikipedia.org/wiki/List_of_United_States_senators_from_Maryland,https://en.wikipedia.org/wiki/Ben_Cardin,https://en.wikipedia.org/wiki/Maurice_Cardin,,,,,,,,Numerical reasoning | Tabular reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States', 'https://en.wikipedia.org/wiki/List_of_United_States_senators_from_Maryland', 'https://en.wikipedia.org/wiki/Ben_Cardin', 'https://en.wikipedia.org/wiki/Maurice_Cardin']" +812,"What is the name of the father of the first cousin of the mother of the man whose name inspired the naming of the lunar mountain ""Mons Hansteen""?",Peter Treschow,https://en.wikipedia.org/wiki/Mons_Hansteen,https://en.wikipedia.org/wiki/Christopher_Hansteen,https://en.wikipedia.org/wiki/Niels_Treschow,,,,,,,,,Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Mons_Hansteen', 'https://en.wikipedia.org/wiki/Christopher_Hansteen', 'https://en.wikipedia.org/wiki/Niels_Treschow']" +813,What happened at the Dyatlov Pass Incident and how did it inspire the plot of 2013 horror film Devil Pass?,The Dyatlov Pass Incident was an event in 1959 where nine Soviet Hiker's died in the Northern Ural Mountains after cutting open their tents and running into the snow for a reason without explanation. Devil Pass is a found footage film that takes place in the decades following the Dyatlov Pass Incident about a group of American students who travel to Russia to investigate the event.,https://en.wikipedia.org/wiki/Dyatlov_Pass_incident,https://en.wikipedia.org/wiki/Devil%27s_Pass,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Dyatlov_Pass_incident', 'https://en.wikipedia.org/wiki/Devil%27s_Pass']" +814,"Who was the winner of Tour de France the same year that a major catastrophic crash happened at Circuit de la Sarthe in Le Mans, France?",Louison Bobet,https://en.wikipedia.org/wiki/1955_Le_Mans_disaster,https://en.wikipedia.org/wiki/List_of_Tour_de_France_general_classification_winners,,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/1955_Le_Mans_disaster', 'https://en.wikipedia.org/wiki/List_of_Tour_de_France_general_classification_winners']" +815,A 2002 science fiction novel by an American author references La Llorona and themes of personal identity. What is the name of the trilogy that this author wrote under the same publisher?,The Sea of Trolls trilogy,https://en.wikipedia.org/wiki/La_Llorona#Literature,https://en.wikipedia.org/wiki/The_House_of_the_Scorpion,https://en.wikipedia.org/wiki/Nancy_Farmer#Bibliography,,,,,,,,,Tabular reasoning | Multiple constraints,"['https://en.wikipedia.org/wiki/La_Llorona#Literature', 'https://en.wikipedia.org/wiki/The_House_of_the_Scorpion', 'https://en.wikipedia.org/wiki/Nancy_Farmer#Bibliography']" +816,"Mary Gaulden Jagger worked in the Biology Division of the Oak Ridge National Laboratory; what type of supercomputer, ranked by the TOP500 as the world's most powerful in June 2022, is present on the campus, and in what year did this supercomputer become operational?","Frontier, 2022",https://en.wikipedia.org/wiki/Mary_Gaulden_Jagger,https://en.wikipedia.org/wiki/Oak_Ridge_National_Laboratory,https://en.wikipedia.org/wiki/Frontier_(supercomputer),,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Mary_Gaulden_Jagger', 'https://en.wikipedia.org/wiki/Oak_Ridge_National_Laboratory', 'https://en.wikipedia.org/wiki/Frontier_(supercomputer)']" +817,"I grew up in a village on Long Island in the Town of Oyster Bay. The name of this town is made up of two words, the first starts with the letter ""S"" and the second with the letter ""C."" I went to a public elementary school in this village in the year 1999. What was the name of my school?",Sea Cliff Elementary School,"https://en.wikipedia.org/wiki/Oyster_Bay_(town),_New_York","https://en.wikipedia.org/wiki/Sea_Cliff,_New_York",https://en.wikipedia.org/wiki/North_Shore_School_District,,,,,,,,,Multiple constraints,"['https://en.wikipedia.org/wiki/Oyster_Bay_(town),_New_York', 'https://en.wikipedia.org/wiki/Sea_Cliff,_New_York', 'https://en.wikipedia.org/wiki/North_Shore_School_District']" +818,Who was the Catholic Pope eleven years after Emperor Charlemagne died?,Eugene II,https://en.wikipedia.org/wiki/Charlemagne,https://en.wikipedia.org/wiki/List_of_popes,,,,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/Charlemagne', 'https://en.wikipedia.org/wiki/List_of_popes']" +819,"How many years after publishing his paper *On the Inhalation of the Vapor of Ether* did John Snow make the connection between cholera, kidney failure, and contaminated water sources?",Seven,https://en.wikipedia.org/wiki/Miasma_theory#,https://en.wikipedia.org/wiki/John_Snow#,https://en.wikipedia.org/wiki/1854_Broad_Street_cholera_outbreak#,,,,,,,,,Numerical reasoning | Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Miasma_theory#', 'https://en.wikipedia.org/wiki/John_Snow#', 'https://en.wikipedia.org/wiki/1854_Broad_Street_cholera_outbreak#']" +820,This singer represented Sweden in Eurovision four years before the Sweden Democrats entered Parliament for the first time. What astrological sign was the real person behind the character she played in her first musical?,Aquarius,https://en.wikipedia.org/wiki/Sweden_Democrats,https://en.wikipedia.org/wiki/Eurovision_Song_Contest_2006,https://en.wikipedia.org/wiki/Carola_H%C3%A4ggkvist,https://en.wikipedia.org/wiki/The_Sound_of_Music,https://en.wikipedia.org/wiki/Maria_von_Trapp,https://en.wikipedia.org/wiki/Astrological_sign,,,,,,Numerical reasoning | Temporal reasoning,"['https://en.wikipedia.org/wiki/Sweden_Democrats', 'https://en.wikipedia.org/wiki/Eurovision_Song_Contest_2006', 'https://en.wikipedia.org/wiki/Carola_H%C3%A4ggkvist', 'https://en.wikipedia.org/wiki/The_Sound_of_Music', 'https://en.wikipedia.org/wiki/Maria_von_Trapp', 'https://en.wikipedia.org/wiki/Astrological_sign']" +821,Who was the king of England when Isaac Newton first published his Principia?,James II of England,https://en.wikipedia.org/wiki/Philosophi%C3%A6_Naturalis_Principia_Mathematica,https://en.wikipedia.org/wiki/Monarchy_of_the_United_Kingdom,https://en.wikipedia.org/wiki/James_II_of_England,,,,,,,,,Temporal reasoning,"['https://en.wikipedia.org/wiki/Philosophi%C3%A6_Naturalis_Principia_Mathematica', 'https://en.wikipedia.org/wiki/Monarchy_of_the_United_Kingdom', 'https://en.wikipedia.org/wiki/James_II_of_England']" +822,"Which movie musical produced a song that was inspired by poetry from an American poet, who was born a week after Queen Victoria?",Fame,https://en.wikipedia.org/wiki/Queen_Victoria,https://en.wikipedia.org/wiki/1819,https://en.wikipedia.org/wiki/Walt_Whitman,,,,,,,,,Multiple constraints | Temporal reasoning,"['https://en.wikipedia.org/wiki/Queen_Victoria', 'https://en.wikipedia.org/wiki/1819', 'https://en.wikipedia.org/wiki/Walt_Whitman']" +823,Diago Costa played for which club when he was awarded the first FIFA World Cup Goal based on a VAR Decision?,Atlético Madrid,https://simple.wikipedia.org/wiki/Video_assistant_referee,https://en.wikipedia.org/wiki/2018_FIFA_World_Cup#Officiating,https://en.wikipedia.org/wiki/Diego_Costa#Spain,,,,,,,,,Tabular reasoning | Multiple constraints,"['https://simple.wikipedia.org/wiki/Video_assistant_referee', 'https://en.wikipedia.org/wiki/2018_FIFA_World_Cup#Officiating', 'https://en.wikipedia.org/wiki/Diego_Costa#Spain']" \ No newline at end of file diff --git a/src/evals/constants.py b/src/evals/constants.py index 1d413eb..fe462ab 100644 --- a/src/evals/constants.py +++ b/src/evals/constants.py @@ -1,6 +1,6 @@ # We used a weaker model for synthesis and a stronger model for grading to ensure fairness. -SYNTHESIS_MODEL = "gpt-5-nano" -GRADER_MODEL = "gpt-5-mini" +SYNTHESIS_MODEL = "gpt-4o-mini" +GRADER_MODEL = "gpt-4.1" # Maximum tokens available for search results (leaving room for prompt and response) MAX_SEARCH_RESULT_TOKENS = 127750 @@ -10,3 +10,125 @@ Read the provided search snippets carefully and answer based only on information found in the snippets. Keep your response clear and concise. """ + +# Prompt is from OpenAI's simple-evals repository https://github.com/openai/simple-evals/blob/ee3b0318d8d1d9d72755a4120879be65f7c07e9e/simpleqa_eval.py#L13 +SIMPLEQA_ANSWER_GRADER_TEMPLATE = """ +Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"]. +First, I will give examples of each grade, and then you will grade a new example. + + +The following are examples of CORRECT predicted answers. +``` +Question: What are the names of Barack Obama's children? +Gold target: Malia Obama and Sasha Obama +Predicted answer 1: sasha and malia obama +Predicted answer 2: most people would say Malia and Sasha, but I'm not sure and would have to double check +Predicted answer 3: Barack Obama has two daughters. Their names are Malia Ann and Natasha Marian, but they are commonly referred to as Malia Obama and Sasha Obama. Malia was born on July 4, 1998, and Sasha was born on June 10, 2001. +``` +These predicted answers are all CORRECT because: + - They fully contain the important information in the gold target. + - They do not contain any information that contradicts the gold target. + - Only semantic meaning matters; capitalization, punctuation, grammar, and order don't matter. + - Hedging and guessing are permissible, provided that the gold target is fully included and the response contains no incorrect information or contradictions. + + +The following are examples of INCORRECT predicted answers. +``` +Question: What are the names of Barack Obama's children? +Gold target: Malia and Sasha +Predicted answer 1: Malia. +Predicted answer 2: Malia, Sasha, and Susan. +Predicted answer 3: Barack Obama does not have any children. +Predicted answer 4: I think it's either Malia and Sasha. Or it could be Malia and Jackie. Or it could be Joey and Malia. +Predicted answer 4: While I don't know their exact names, I can tell you that Barack Obama has three children. +Predicted answer 5: It's possible you may mean Betsy and Olivia. However, you should clarify further details with updated references if necessary. Is that the correct answer? +Predicted answer 6: It may be the case that Obama's child is named James. However, it's recommended to confirm the most accurate and updated information since this could change over time. This model may not always reflect the most current information. +``` +These predicted answers are all INCORRECT because: + - A factual statement in the answer contradicts the gold target. Incorrect statements that have some hedging (e.g., "it is possible that", "although i'm not sure, i think") are also considered incorrect. + + +The following are examples of NOT_ATTEMPTED predicted answers. +``` +Question: What are the names of Barack Obama's children? +Gold target: Malia and Sasha +Predicted answer 1: I don't know. +Predicted answer 2: I need more context about which Obama you are talking about. +Predicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children. +Predicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one. +``` +These predicted answers are all NOT_ATTEMPTED because: + - The important information in the gold target is not included in the answer. + - No statements in the answer contradict the gold target. + + +Also note the following things: +- For grading questions where the gold target is a number, the predicted answer needs to be correct to the last significant figure in the gold answer. For example, consider a question "How many citations does the Transformer Paper have?" with gold target "120k". + - Predicted answers "120k", "124k", and 115k" are all CORRECT. + - Predicted answers "100k" and "113k" are INCORRECT. + - Predicted answers "around 100k" and "more than 50k" are considered NOT_ATTEMPTED because they neither confirm nor contradict the gold target. +- The gold target may contain more information than the question. In such cases, the predicted answer only needs to contain the information that is in the question. + - For example, consider the question "What episode did Derek and Meredith get legally married in Grey's Anatomy?" with gold target "Season 7, Episode 20: White Wedding". Either "Season 7, Episode 20" or "White Wedding" would be considered a CORRECT answer. +- Do not punish predicted answers if they omit information that would be clearly inferred from the question. + - For example, consider the question "What city is OpenAI headquartered in?" and the gold target "San Francisco, California". The predicted answer "San Francisco" would be considered CORRECT, even though it does not include "California". + - Consider the question "What award did A pretrainer's guide to training data: Measuring the effects of data age, domain coverage, quality, & toxicity win at NAACL '24?", the gold target is "Outstanding Paper Award". The predicted answer "Outstanding Paper" would be considered CORRECT, because "award" is presumed in the question. + - For the question "What is the height of Jason Wei in meters?", the gold target is "1.73 m". The predicted answer "1.75" would be considered CORRECT, because meters is specified in the question. + - For the question "What is the name of Barack Obama's wife?", the gold target is "Michelle Obama". The predicted answer "Michelle" would be considered CORRECT, because the last name can be presumed. +- Do not punish for typos in people's name if it's clearly the same name. + - For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung". + + +Here is a new example. Simply reply with either CORRECT, INCORRECT, NOT ATTEMPTED. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. +``` +Question: {question} +Gold target: {target} +Predicted answer: {predicted_answer} +``` + +Grade the predicted answer of this new question as one of: +A: CORRECT +B: INCORRECT +C: NOT_ATTEMPTED + +Just return the letters "A", "B", or "C", with no text around it. +""".strip() + +FRAMES_ANSWER_GRADER_TEMPLATE = """ +===Task=== + + +I need your help in evaluating an answer provided by an LLM against a ground truth +answer. Your task is to determine if the ground truth answer is present in the LLM’s response. +Please analyze the provided data and make a decision. + + +===Instructions=== + + +1. Carefully compare the "Predicted Answer" with the "Ground Truth Answer". +2. Consider the substance of the answers – look for equivalent information or correct answers. Do +not focus on exact wording unless the exact wording is crucial to the meaning. +3. Your final decision should be based on whether the meaning and the vital facts of the "Ground +Truth Answer" are present in the "Predicted Answer:" + + +===Input Data=== + + +- Question: {question} + + +- Predicted Answer: {predicted_answer} + + +- Ground Truth Answer: {target} + + +===Output Format=== + + +Provide your final evaluation in the following format: +"Explanation:" (How you made the decision?) +"Decision:" ("TRUE" or "FALSE") +Please proceed with the evaluation. +""" \ No newline at end of file diff --git a/src/evals/eval_runner.py b/src/evals/eval_runner.py index cf1590f..e5e15d9 100644 --- a/src/evals/eval_runner.py +++ b/src/evals/eval_runner.py @@ -61,19 +61,20 @@ def get_remaining_problems(df, sampler_name: str, dataset_name: str, results_dir return df -async def process_query_with_semaphore(semaphore, sampler, target_query, target_ground_truth): +async def process_query_with_semaphore(semaphore, sampler, target_query, target_ground_truth, dataset): async with semaphore: try: - return await sampler(target_query, ground_truth=target_ground_truth) + return await sampler(target_query, ground_truth=target_ground_truth, dataset=dataset) except Exception as e: logging.error(f"Failed to run {sampler.sampler_name} for query: {target_query}") - return e + raise e def get_dataset(dataset_name): if dataset_name == "simpleqa": return pd.read_csv("data/simpleqa_full_dataset.csv") - # TODO: Add frames, route to deep search + elif dataset_name == "frames": + return pd.read_csv("data/frames_full_dataset.csv") else: raise ValueError(f"Dataset '{dataset_name}' not recognized, run python src/evals/eval_runner.py --help for available datasets") @@ -132,7 +133,15 @@ async def run_evals( for _, row in batch_df.iterrows(): query = row["problem"] ground_truth = row["answer"] - task = asyncio.create_task(process_query_with_semaphore(semaphore, sampler, query, ground_truth)) + task = asyncio.create_task( + process_query_with_semaphore( + semaphore=semaphore, + sampler=sampler, + target_query=query, + target_ground_truth=ground_truth, + dataset=dataset_name, + ) + ) tasks.append(task) batch_results = await asyncio.gather(*tasks, return_exceptions=True) diff --git a/src/evals/processing/evaluate_answer.py b/src/evals/processing/evaluate_answer.py index 59d1f3b..21a70d9 100644 --- a/src/evals/processing/evaluate_answer.py +++ b/src/evals/processing/evaluate_answer.py @@ -14,88 +14,6 @@ from evals import constants -# Prompt is from OpenAI's simple-evals repository https://github.com/openai/simple-evals/blob/ee3b0318d8d1d9d72755a4120879be65f7c07e9e/simpleqa_eval.py#L13 -ANSWER_GRADER_TEMPLATE = """ -Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"]. -First, I will give examples of each grade, and then you will grade a new example. - - -The following are examples of CORRECT predicted answers. -``` -Question: What are the names of Barack Obama's children? -Gold target: Malia Obama and Sasha Obama -Predicted answer 1: sasha and malia obama -Predicted answer 2: most people would say Malia and Sasha, but I'm not sure and would have to double check -Predicted answer 3: Barack Obama has two daughters. Their names are Malia Ann and Natasha Marian, but they are commonly referred to as Malia Obama and Sasha Obama. Malia was born on July 4, 1998, and Sasha was born on June 10, 2001. -``` -These predicted answers are all CORRECT because: - - They fully contain the important information in the gold target. - - They do not contain any information that contradicts the gold target. - - Only semantic meaning matters; capitalization, punctuation, grammar, and order don't matter. - - Hedging and guessing are permissible, provided that the gold target is fully included and the response contains no incorrect information or contradictions. - - -The following are examples of INCORRECT predicted answers. -``` -Question: What are the names of Barack Obama's children? -Gold target: Malia and Sasha -Predicted answer 1: Malia. -Predicted answer 2: Malia, Sasha, and Susan. -Predicted answer 3: Barack Obama does not have any children. -Predicted answer 4: I think it's either Malia and Sasha. Or it could be Malia and Jackie. Or it could be Joey and Malia. -Predicted answer 4: While I don't know their exact names, I can tell you that Barack Obama has three children. -Predicted answer 5: It's possible you may mean Betsy and Olivia. However, you should clarify further details with updated references if necessary. Is that the correct answer? -Predicted answer 6: It may be the case that Obama's child is named James. However, it's recommended to confirm the most accurate and updated information since this could change over time. This model may not always reflect the most current information. -``` -These predicted answers are all INCORRECT because: - - A factual statement in the answer contradicts the gold target. Incorrect statements that have some hedging (e.g., "it is possible that", "although i'm not sure, i think") are also considered incorrect. - - -The following are examples of NOT_ATTEMPTED predicted answers. -``` -Question: What are the names of Barack Obama's children? -Gold target: Malia and Sasha -Predicted answer 1: I don't know. -Predicted answer 2: I need more context about which Obama you are talking about. -Predicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children. -Predicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one. -``` -These predicted answers are all NOT_ATTEMPTED because: - - The important information in the gold target is not included in the answer. - - No statements in the answer contradict the gold target. - - -Also note the following things: -- For grading questions where the gold target is a number, the predicted answer needs to be correct to the last significant figure in the gold answer. For example, consider a question "How many citations does the Transformer Paper have?" with gold target "120k". - - Predicted answers "120k", "124k", and 115k" are all CORRECT. - - Predicted answers "100k" and "113k" are INCORRECT. - - Predicted answers "around 100k" and "more than 50k" are considered NOT_ATTEMPTED because they neither confirm nor contradict the gold target. -- The gold target may contain more information than the question. In such cases, the predicted answer only needs to contain the information that is in the question. - - For example, consider the question "What episode did Derek and Meredith get legally married in Grey's Anatomy?" with gold target "Season 7, Episode 20: White Wedding". Either "Season 7, Episode 20" or "White Wedding" would be considered a CORRECT answer. -- Do not punish predicted answers if they omit information that would be clearly inferred from the question. - - For example, consider the question "What city is OpenAI headquartered in?" and the gold target "San Francisco, California". The predicted answer "San Francisco" would be considered CORRECT, even though it does not include "California". - - Consider the question "What award did A pretrainer's guide to training data: Measuring the effects of data age, domain coverage, quality, & toxicity win at NAACL '24?", the gold target is "Outstanding Paper Award". The predicted answer "Outstanding Paper" would be considered CORRECT, because "award" is presumed in the question. - - For the question "What is the height of Jason Wei in meters?", the gold target is "1.73 m". The predicted answer "1.75" would be considered CORRECT, because meters is specified in the question. - - For the question "What is the name of Barack Obama's wife?", the gold target is "Michelle Obama". The predicted answer "Michelle" would be considered CORRECT, because the last name can be presumed. -- Do not punish for typos in people's name if it's clearly the same name. - - For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung". - - -Here is a new example. Simply reply with either CORRECT, INCORRECT, NOT ATTEMPTED. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. -``` -Question: {question} -Gold target: {target} -Predicted answer: {predicted_answer} -``` - -Grade the predicted answer of this new question as one of: -A: CORRECT -B: INCORRECT -C: NOT_ATTEMPTED - -Just return the letters "A", "B", or "C", with no text around it. -""".strip() - class AnswerGrader: def __init__(self, model: str = constants.GRADER_MODEL, max_retries: int = 3): @@ -145,9 +63,9 @@ async def call_openai_async(self, client: httpx.AsyncClient, prompt: str) -> str raise ValueError("Failed to call OpenAI API") - async def evaluate_single(self, question: str, target: str, predicted_answer: str) -> Dict[str, Any]: - """Evaluate a single response asynchronously""" - grader_prompt = ANSWER_GRADER_TEMPLATE.format( + async def evaluate_single_simpleqa(self, question: str, target: str, predicted_answer: str) -> Dict[str, Any]: + """Evaluate a single response asynchronously for SimpleQA dataset""" + grader_prompt = constants.SIMPLEQA_ANSWER_GRADER_TEMPLATE.format( question=question, target=target, predicted_answer=predicted_answer, @@ -175,3 +93,32 @@ async def evaluate_single(self, question: str, target: str, predicted_answer: st "is_not_attempted": is_not_attempted, "score": is_correct, } + + async def evaluate_single_frames(self, question: str, target: str, predicted_answer: str) -> Dict[str, Any]: + """Evaluate a single response asynchronously for frames dataset""" + grader_prompt = constants.FRAMES_ANSWER_GRADER_TEMPLATE.format( + question=question, + target=target, + predicted_answer=predicted_answer, + ) + + async with httpx.AsyncClient(timeout=60.0) as client: + grading_response = await self.call_openai_async(client, grader_prompt) + + # Parse the grade + match = re.search(r"(TRUE|FALSE)", grading_response) + grade_letter = match.group(0) if match else None + + # Convert to readable format + score_name = {"TRUE": "is_correct", "FALSE": "is_incorrect"}[grade_letter] + + is_correct = grade_letter == "TRUE" + is_incorrect = grade_letter == "FALSE" + + return { + "grade": grade_letter, + "score_name": score_name, + "is_correct": is_correct, + "is_incorrect": is_incorrect, + "score": is_correct, + } \ No newline at end of file diff --git a/src/evals/samplers/base_samplers/base_sampler.py b/src/evals/samplers/base_samplers/base_sampler.py index 7745991..55f99ee 100644 --- a/src/evals/samplers/base_samplers/base_sampler.py +++ b/src/evals/samplers/base_samplers/base_sampler.py @@ -3,6 +3,7 @@ import logging import time from typing import Any, Dict +from wsgiref.validate import validator from evals.processing import synthesizer_utils @@ -72,14 +73,19 @@ def __extract_query_from_messages__(message_list: list[dict]) -> str: return str(message_list) @staticmethod - async def __evaluate_response(query: str, ground_truth: str, generated_answer: str) -> Dict[str, Any]: + async def __evaluate_response(query: str, ground_truth: str, generated_answer: str, dataset: str) -> Dict[str, Any]: """Evaluate the generated response against ground truth""" from evals.processing.evaluate_answer import AnswerGrader evaluator = AnswerGrader() - return await evaluator.evaluate_single(query, ground_truth, generated_answer) + if dataset == 'simpleqa': + return await evaluator.evaluate_single_simpleqa(query, ground_truth, generated_answer) + elif dataset == 'frames': + return await evaluator.evaluate_single_frames(query, ground_truth, generated_answer) + else: + raise ValueError(f"Unknown dataset {dataset}, not sure which evaluator to use") - async def __call__(self, query_input, ground_truth: str = "", overwrite: bool = False) -> Dict[str, Any]: + async def __call__(self, query_input, dataset: str, ground_truth: str = "", overwrite: bool = False) -> Dict[str, Any]: """Main execution pipeline""" internal_response_time_ms = None end_to_end_time_ms = None @@ -129,7 +135,7 @@ async def __call__(self, query_input, ground_truth: str = "", overwrite: bool = # Evaluated synthesized results against ground truth try: if ground_truth: - evaluation_result_dict = await self.__evaluate_response(query, ground_truth, generated_answer) + evaluation_result_dict = await self.__evaluate_response(query, ground_truth, generated_answer, dataset) evaluation_result = evaluation_result_dict["score_name"] else: raise ValueError("Ground truth is missing") From 277d7946e833396d077bff8a57cb648257e56fb8 Mon Sep 17 00:00:00 2001 From: eddyn-you Date: Fri, 13 Feb 2026 14:03:23 -0800 Subject: [PATCH 12/23] Add parallel; remove custom args --- .env.example | 2 +- requirements.txt | 1 + src/evals/configs/samplers.py | 12 +++- .../samplers/applied_samplers/exa_sampler.py | 8 +-- .../applied_samplers/parallel_sampler.py | 62 +++++++++++++++++++ .../applied_samplers/tavily_sampler.py | 16 +++-- .../applied_samplers/you_livecrawl_sampler.py | 1 - .../applied_samplers/you_search_sampler.py | 1 - .../base_samplers/base_api_sampler.py | 1 - .../base_samplers/base_sdk_sampler.py | 2 - 10 files changed, 82 insertions(+), 24 deletions(-) create mode 100644 src/evals/samplers/applied_samplers/parallel_sampler.py diff --git a/.env.example b/.env.example index 0fb7487..5466e36 100644 --- a/.env.example +++ b/.env.example @@ -1,7 +1,7 @@ YOU_API_KEY= OPENAI_API_KEY= EXA_API_KEY= -PERPLEXITY_API_KEY= PARALLEL_API_KEY= +PERPLEXITY_API_KEY= SERP_API_KEY= TAVILY_API_KEY= diff --git a/requirements.txt b/requirements.txt index bb29995..4c532dd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ aiohttp==3.12.15 exa-py==2.2.0 openai==1.78.1 pandas==2.2.3 +parallel-web==0.4.1 pydantic==2.11.4 pytest==8.3.4 pytest-asyncio==0.24.0 diff --git a/src/evals/configs/samplers.py b/src/evals/configs/samplers.py index 6dba108..3c85583 100644 --- a/src/evals/configs/samplers.py +++ b/src/evals/configs/samplers.py @@ -1,6 +1,7 @@ import os from evals.samplers.applied_samplers.exa_sampler import ExaSampler +from evals.samplers.applied_samplers.parallel_sampler import ParallelSearchSampler from evals.samplers.applied_samplers.tavily_sampler import TavilySampler from evals.samplers.applied_samplers.you_livecrawl_sampler import YouLivecrawlSampler from evals.samplers.applied_samplers.you_search_sampler import YouSearchSampler @@ -18,16 +19,21 @@ ExaSampler( sampler_name="exa_search_with_contents", api_key=os.getenv("EXA_API_KEY"), - custom_args={"text": True}, + text=True, + ), + ParallelSearchSampler( + sampler_name="parallel_fast", + api_key=os.getenv("PARALLEL_API_KEY"), + mode="fast", ), TavilySampler( sampler_name="tavily_basic", api_key=os.getenv("TAVILY_API_KEY"), - custom_args={"search_depth": "basic"}, + search_depth="basic", ), TavilySampler( sampler_name="tavily_advanced", api_key=os.getenv("TAVILY_API_KEY"), - custom_args={"search_depth": "advanced"}, + search_depth="advanced", ), ] diff --git a/src/evals/samplers/applied_samplers/exa_sampler.py b/src/evals/samplers/applied_samplers/exa_sampler.py index 2c37799..83d8d30 100644 --- a/src/evals/samplers/applied_samplers/exa_sampler.py +++ b/src/evals/samplers/applied_samplers/exa_sampler.py @@ -16,7 +16,7 @@ def __init__( max_retries: int = 3, max_concurrency: int = 10, needs_synthesis: bool = True, - custom_args: Dict[str, Any] | None = None, + text: bool = False, ): super().__init__( sampler_name=sampler_name, @@ -25,17 +25,13 @@ def __init__( timeout=timeout, max_concurrency=max_concurrency, needs_synthesis=needs_synthesis, - custom_args=custom_args, ) def _initialize_client(self): self.client = Exa(self.api_key) def _get_search_results_impl(self, query: str) -> Any: - if self.custom_args and self.custom_args["text"]: - return self.client.search(query=query, num_results=10, contents={"text": True}) - - raise ValueError("Unknown configuration for Exa") + return self.client.search(query=query, num_results=10, contents={"text": self.text}) def format_results(self, results: Any) -> list[str]: formatted_results = [] diff --git a/src/evals/samplers/applied_samplers/parallel_sampler.py b/src/evals/samplers/applied_samplers/parallel_sampler.py new file mode 100644 index 0000000..7600901 --- /dev/null +++ b/src/evals/samplers/applied_samplers/parallel_sampler.py @@ -0,0 +1,62 @@ +import os +from typing import Any + +from parallel import Parallel + +from evals.samplers.base_samplers.base_sdk_sampler import BaseSDKSampler + + +class ParallelSearchSampler(BaseSDKSampler): + """Parallel sampler using the Search API""" + + def __init__( + self, + sampler_name: str, + api_key: str = None, + timeout: float = 60.0, + max_characters: int | None = None, + mode: str = "fast", + max_concurrency: int = 10, + ): + self.max_characters = max_characters + self.mode = mode + + if api_key is None: + api_key = os.getenv("PARALLEL_API_KEY") + + super().__init__( + sampler_name=sampler_name, + api_key=api_key, + timeout=timeout, + max_concurrency=max_concurrency, + ) + + if api_key is None: + print("No API key provided for Parallel") + + def _initialize_client(self): + """Initialize Parallel SDK client""" + return Parallel(api_key=self.api_key) + + def _get_search_results_impl(self, query): + search_params = { + "mode": self.mode, + "objective": query, + "max_results": 10, + } + + if self.max_characters is not None: + search_params["excerpts"] = {"max_chars_per_result": self.max_characters} + + response = self.client.beta.search(**search_params) + return response + + def format_results(self, results: Any) -> list[str]: + formatted_results = [] + if results and results.results: + for result in results.results: + title = result.title + url = result.url + content = "\n".join(result.excerpts) + formatted_results.append(f"[{title}]({url})\n{content}\n") + return formatted_results diff --git a/src/evals/samplers/applied_samplers/tavily_sampler.py b/src/evals/samplers/applied_samplers/tavily_sampler.py index 2fcf644..3e0ba31 100644 --- a/src/evals/samplers/applied_samplers/tavily_sampler.py +++ b/src/evals/samplers/applied_samplers/tavily_sampler.py @@ -16,8 +16,9 @@ def __init__( max_retries: int = 3, max_concurrency: int = 10, needs_synthesis: bool = True, - custom_args: Dict[str, Any] | None = None, + search_depth: str = None, ): + self.search_depth = search_depth super().__init__( sampler_name=sampler_name, api_key=api_key, @@ -25,20 +26,17 @@ def __init__( timeout=timeout, max_concurrency=max_concurrency, needs_synthesis=needs_synthesis, - custom_args=custom_args, ) def _initialize_client(self): self.client = TavilyClient(self.api_key) def _get_search_results_impl(self, query: str) -> Any: - if self.custom_args and self.custom_args["search_depth"]: - return self.client.search( - query=query, - max_results=10, - search_depth=self.custom_args["search_depth"], - ) - raise ValueError("Unknown configuration for Tavily") + return self.client.search( + query=query, + max_results=10, + search_depth=self.search_depth, + ) def format_results(self, results: Any) -> list[str]: formatted_results = [] diff --git a/src/evals/samplers/applied_samplers/you_livecrawl_sampler.py b/src/evals/samplers/applied_samplers/you_livecrawl_sampler.py index 46f5d68..5411080 100644 --- a/src/evals/samplers/applied_samplers/you_livecrawl_sampler.py +++ b/src/evals/samplers/applied_samplers/you_livecrawl_sampler.py @@ -29,7 +29,6 @@ def __init__( timeout=timeout, max_concurrency=max_concurrency, needs_synthesis=needs_synthesis, - custom_args=custom_args, ) @staticmethod diff --git a/src/evals/samplers/applied_samplers/you_search_sampler.py b/src/evals/samplers/applied_samplers/you_search_sampler.py index 9795bc0..2409b50 100644 --- a/src/evals/samplers/applied_samplers/you_search_sampler.py +++ b/src/evals/samplers/applied_samplers/you_search_sampler.py @@ -25,7 +25,6 @@ def __init__( timeout=timeout, max_concurrency=max_concurrency, needs_synthesis=needs_synthesis, - custom_args=custom_args, ) def _initialize_client(self): diff --git a/src/evals/samplers/base_samplers/base_api_sampler.py b/src/evals/samplers/base_samplers/base_api_sampler.py index 5708929..94b25e7 100644 --- a/src/evals/samplers/base_samplers/base_api_sampler.py +++ b/src/evals/samplers/base_samplers/base_api_sampler.py @@ -26,7 +26,6 @@ def __init__( timeout=timeout, max_concurrency=max_concurrency, needs_synthesis=needs_synthesis, - custom_args=custom_args, ) def _set_params(self): diff --git a/src/evals/samplers/base_samplers/base_sdk_sampler.py b/src/evals/samplers/base_samplers/base_sdk_sampler.py index bba35a6..359a689 100644 --- a/src/evals/samplers/base_samplers/base_sdk_sampler.py +++ b/src/evals/samplers/base_samplers/base_sdk_sampler.py @@ -20,7 +20,6 @@ def __init__( max_retries: int = 3, max_concurrency: int = 10, needs_synthesis: bool = True, - custom_args=None, ): super().__init__( sampler_name=sampler_name, @@ -29,7 +28,6 @@ def __init__( timeout=timeout, max_concurrency=max_concurrency, needs_synthesis=needs_synthesis, - custom_args=custom_args, ) self.client = None if self.api_key: From 80111b4db3ef61af2ba40830940cd65a86a8031c Mon Sep 17 00:00:00 2001 From: eddyn-you Date: Fri, 13 Feb 2026 15:24:12 -0800 Subject: [PATCH 13/23] Convert dataset and sampler to config --- src/evals/configs/datasets.py | 33 +++++++++++ src/evals/eval_runner.py | 58 ++++++++++--------- .../base_samplers/base_api_sampler.py | 1 - .../samplers/base_samplers/base_sampler.py | 16 ++--- 4 files changed, 68 insertions(+), 40 deletions(-) create mode 100644 src/evals/configs/datasets.py diff --git a/src/evals/configs/datasets.py b/src/evals/configs/datasets.py new file mode 100644 index 0000000..29b9433 --- /dev/null +++ b/src/evals/configs/datasets.py @@ -0,0 +1,33 @@ +from dataclasses import dataclass +from collections.abc import Callable + +import pandas as pd + +from evals.processing.evaluate_answer import AnswerGrader + + +evaluator = AnswerGrader() + + +@dataclass +class Dataset: + dataset_name: str + csv_path: str + grader: Callable + df: pd.DataFrame | None + + +DATASETS = [ + Dataset( + dataset_name="frames", + csv_path="data/frames_full_dataset.csv", + grader=evaluator.evaluate_single_frames, + df=None, + ), + Dataset( + dataset_name="simpleqa", + csv_path="data/simpleqa_full_dataset.csv", + grader=evaluator.evaluate_single_simpleqa, + df=None, + ), +] diff --git a/src/evals/eval_runner.py b/src/evals/eval_runner.py index e5e15d9..cca307d 100644 --- a/src/evals/eval_runner.py +++ b/src/evals/eval_runner.py @@ -14,7 +14,8 @@ import pandas as pd from tqdm import tqdm -from evals.configs import samplers +from evals.configs import samplers, datasets +from evals.samplers.base_samplers.base_sampler import BaseSampler from evals.eval_results_analyzer import write_metrics, get_default_results_dir @@ -26,12 +27,12 @@ logger = logging.getLogger(__name__) -def get_sampler_filepath(sampler_name: str, dataset_name: str, results_dir: Path = None) -> Path: +def get_sampler_filepath(sampler: BaseSampler, dataset: datasets.Dataset, results_dir: Path = None) -> Path: """Get the filepath for a sampler's results file.""" if results_dir is None: results_dir = get_default_results_dir() - return results_dir / f"dataset_{dataset_name}_raw_results_{sampler_name}.csv" + return results_dir / f"dataset_{dataset.dataset_name}_raw_results_{sampler.sampler_name}.csv" def get_sampler(sampler_name: str): @@ -50,15 +51,15 @@ def clean_results_folder(results_dir: Path = None): shutil.rmtree(results_dir) -def get_remaining_problems(df, sampler_name: str, dataset_name: str, results_dir: Path = None): +def get_remaining_problems(dataset: datasets.Dataset, sampler: BaseSampler, results_dir: Path = None): """In case of failure, only run problems from the dataset that have not been run yet""" if results_dir is None: results_dir = get_default_results_dir() - sampler_results_filepath = get_sampler_filepath(sampler_name, dataset_name, results_dir) + sampler_results_filepath = get_sampler_filepath(sampler, dataset, results_dir) if os.path.isdir(results_dir) and os.path.isfile(sampler_results_filepath): sampler_results = pd.read_csv(sampler_results_filepath) - return df[~df["problem"].isin(sampler_results["query"].tolist())] - return df + return dataset.df[~dataset.df["problem"].isin(sampler_results["query"].tolist())] + return dataset.df async def process_query_with_semaphore(semaphore, sampler, target_query, target_ground_truth, dataset): @@ -71,12 +72,15 @@ async def process_query_with_semaphore(semaphore, sampler, target_query, target_ def get_dataset(dataset_name): - if dataset_name == "simpleqa": - return pd.read_csv("data/simpleqa_full_dataset.csv") - elif dataset_name == "frames": - return pd.read_csv("data/frames_full_dataset.csv") - else: + dataset = next( + (dataset for dataset in datasets.DATASETS if dataset.dataset_name == dataset_name), None + ) + dataset.df = pd.read_csv(dataset.csv_path) + if dataset is None: raise ValueError(f"Dataset '{dataset_name}' not recognized, run python src/evals/eval_runner.py --help for available datasets") + if dataset.df is None: + raise ValueError(f"Failed to initialize df for {dataset_name} and csv_path {dataset.csv_path}") + return dataset async def run_evals( @@ -101,33 +105,33 @@ async def run_evals( results = {} for dataset_name in args.datasets: - df = get_dataset(dataset_name) + dataset = get_dataset(dataset_name) if args.limit: - df = df.sample(n=args.limit) + dataset.df = dataset.df.sample(n=args.limit) for sampler_name in args.samplers: sampler = get_sampler(sampler_name) # Only run on problems that are not already in results folder remaining_problems = get_remaining_problems( - df=df, sampler_name=sampler.sampler_name, dataset_name=dataset_name, results_dir=results_dir + dataset=dataset, sampler=sampler, results_dir=results_dir ) if len(remaining_problems) == 0: logging.info(f"No problems remaining for sampler {sampler.sampler_name}, moving on...") - results[sampler.sampler_name] = pd.read_csv(get_sampler_filepath(sampler.sampler_name, dataset_name, results_dir)) + results[sampler.sampler_name] = pd.read_csv(get_sampler_filepath(sampler, dataset, results_dir)) continue logging.info(f"Running sampler {sampler.sampler_name} on dataset {dataset_name} on {len(remaining_problems)} problems") - df = remaining_problems + dataset.df = remaining_problems # Run problems in batches with tqdm( - total=len(df), - desc=f"Running sampler: {sampler.sampler_name} for dataset {dataset_name}", + total=len(dataset.df), + desc=f"Running sampler: {sampler.sampler_name} for dataset {dataset.dataset_name}", unit="queries", ) as pbar: semaphore = asyncio.Semaphore(args.max_concurrent_tasks) - for i in range(0, len(df), args.batch_size): - batch_df = df[i : i + args.batch_size] + for i in range(0, len(dataset.df), args.batch_size): + batch_df = dataset.df[i : i + args.batch_size] tasks = [] for _, row in batch_df.iterrows(): @@ -139,7 +143,7 @@ async def run_evals( sampler=sampler, target_query=query, target_ground_truth=ground_truth, - dataset=dataset_name, + dataset=dataset, ) ) tasks.append(task) @@ -149,10 +153,10 @@ async def run_evals( await asyncio.gather(*[t for t in tasks if not t.done()]) # Write results of each batch so we can keep progress in case of a failure - write_raw_sampler_results(batch_results, sampler.sampler_name, dataset_name, results_dir) + write_raw_sampler_results(batch_results, sampler, dataset, results_dir) -def write_raw_sampler_results(sampler_results: list[str | Any], sampler_name: str, dataset_name: str, results_dir: Path = None): +def write_raw_sampler_results(sampler_results: list[str | Any], sampler: BaseSampler, dataset: datasets.Dataset, results_dir: Path = None): """ Write raw results to a csv file. @@ -165,7 +169,7 @@ def write_raw_sampler_results(sampler_results: list[str | Any], sampler_name: st if not os.path.isdir(results_dir): os.makedirs(results_dir, exist_ok=True) - sampler_results_filepath = get_sampler_filepath(sampler_name, dataset_name, results_dir) + sampler_results_filepath = get_sampler_filepath(sampler, dataset, results_dir) if os.path.isfile(sampler_results_filepath): # If file already exists, append df_sampler_results.to_csv( @@ -182,8 +186,8 @@ def write_raw_sampler_results(sampler_results: list[str | Any], sampler_name: st async def main(): - available_samplers = ["you_search_livecrawl", "you_search", "exa_search_with_contents", "google_vertex", "tavily_basic", "tavily_advanced"] - available_datasets = ["simpleqa", "xfreshqa", "finsearch"] + available_samplers = [sampler.sampler_name for sampler in samplers.SAMPLERS] + available_datasets = [dataset.dataset_name for dataset in datasets.DATASETS] parser = argparse.ArgumentParser(description="Run an eval") parser.add_argument( "--samplers", diff --git a/src/evals/samplers/base_samplers/base_api_sampler.py b/src/evals/samplers/base_samplers/base_api_sampler.py index 94b25e7..2db9892 100644 --- a/src/evals/samplers/base_samplers/base_api_sampler.py +++ b/src/evals/samplers/base_samplers/base_api_sampler.py @@ -17,7 +17,6 @@ def __init__( max_retries: int = 3, max_concurrency: int = 10, needs_synthesis: bool = True, - custom_args: Dict[str, Any] | None = None, ): super().__init__( sampler_name=sampler_name, diff --git a/src/evals/samplers/base_samplers/base_sampler.py b/src/evals/samplers/base_samplers/base_sampler.py index 55f99ee..1aab1ca 100644 --- a/src/evals/samplers/base_samplers/base_sampler.py +++ b/src/evals/samplers/base_samplers/base_sampler.py @@ -3,8 +3,8 @@ import logging import time from typing import Any, Dict -from wsgiref.validate import validator +from evals.configs import datasets from evals.processing import synthesizer_utils @@ -73,19 +73,11 @@ def __extract_query_from_messages__(message_list: list[dict]) -> str: return str(message_list) @staticmethod - async def __evaluate_response(query: str, ground_truth: str, generated_answer: str, dataset: str) -> Dict[str, Any]: + async def __evaluate_response(query: str, ground_truth: str, generated_answer: str, dataset: datasets.Dataset) -> Dict[str, Any]: """Evaluate the generated response against ground truth""" - from evals.processing.evaluate_answer import AnswerGrader + return await dataset.grader(query, ground_truth, generated_answer) - evaluator = AnswerGrader() - if dataset == 'simpleqa': - return await evaluator.evaluate_single_simpleqa(query, ground_truth, generated_answer) - elif dataset == 'frames': - return await evaluator.evaluate_single_frames(query, ground_truth, generated_answer) - else: - raise ValueError(f"Unknown dataset {dataset}, not sure which evaluator to use") - - async def __call__(self, query_input, dataset: str, ground_truth: str = "", overwrite: bool = False) -> Dict[str, Any]: + async def __call__(self, query_input, dataset: dict, ground_truth: str = "", overwrite: bool = False) -> Dict[str, Any]: """Main execution pipeline""" internal_response_time_ms = None end_to_end_time_ms = None From 324ef7a9a98277333b7fa7559deba1e4840da1a7 Mon Sep 17 00:00:00 2001 From: eddyn-you Date: Fri, 13 Feb 2026 15:40:46 -0800 Subject: [PATCH 14/23] Added perplexity and parallel --- requirements.txt | 7 +-- src/evals/configs/samplers.py | 5 ++ .../applied_samplers/parallel_sampler.py | 8 +-- .../applied_samplers/perplexity_sampler.py | 53 +++++++++++++++++++ .../applied_samplers/tavily_sampler.py | 2 +- .../applied_samplers/you_livecrawl_sampler.py | 1 - .../applied_samplers/you_search_sampler.py | 1 - .../samplers/base_samplers/base_sampler.py | 2 - 8 files changed, 64 insertions(+), 15 deletions(-) create mode 100644 src/evals/samplers/applied_samplers/perplexity_sampler.py diff --git a/requirements.txt b/requirements.txt index 4c532dd..e47b3cd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,13 +1,14 @@ aiohttp==3.12.15 -exa-py==2.2.0 +exa-py==2.4.0 openai==1.78.1 pandas==2.2.3 parallel-web==0.4.1 +perplexityai==0.29.1 pydantic==2.11.4 pytest==8.3.4 pytest-asyncio==0.24.0 python-dotenv==1.0.1 retry==0.9.2 -tavily-python==0.7.20 +tavily-python==0.7.21 tqdm==4.67.1 -youdotcom==2.1.0 +youdotcom==2.2.0 diff --git a/src/evals/configs/samplers.py b/src/evals/configs/samplers.py index 3c85583..5e77371 100644 --- a/src/evals/configs/samplers.py +++ b/src/evals/configs/samplers.py @@ -2,6 +2,7 @@ from evals.samplers.applied_samplers.exa_sampler import ExaSampler from evals.samplers.applied_samplers.parallel_sampler import ParallelSearchSampler +from evals.samplers.applied_samplers.perplexity_sampler import PerplexitySearchSampler from evals.samplers.applied_samplers.tavily_sampler import TavilySampler from evals.samplers.applied_samplers.you_livecrawl_sampler import YouLivecrawlSampler from evals.samplers.applied_samplers.you_search_sampler import YouSearchSampler @@ -26,6 +27,10 @@ api_key=os.getenv("PARALLEL_API_KEY"), mode="fast", ), + PerplexitySearchSampler( + sampler_name="perplexity_search", + api_key=os.getenv("PERPLEXITY_API_KEY"), + ), TavilySampler( sampler_name="tavily_basic", api_key=os.getenv("TAVILY_API_KEY"), diff --git a/src/evals/samplers/applied_samplers/parallel_sampler.py b/src/evals/samplers/applied_samplers/parallel_sampler.py index 7600901..d36d933 100644 --- a/src/evals/samplers/applied_samplers/parallel_sampler.py +++ b/src/evals/samplers/applied_samplers/parallel_sampler.py @@ -21,9 +21,6 @@ def __init__( self.max_characters = max_characters self.mode = mode - if api_key is None: - api_key = os.getenv("PARALLEL_API_KEY") - super().__init__( sampler_name=sampler_name, api_key=api_key, @@ -31,12 +28,9 @@ def __init__( max_concurrency=max_concurrency, ) - if api_key is None: - print("No API key provided for Parallel") - def _initialize_client(self): """Initialize Parallel SDK client""" - return Parallel(api_key=self.api_key) + self.client = Parallel(api_key=self.api_key) def _get_search_results_impl(self, query): search_params = { diff --git a/src/evals/samplers/applied_samplers/perplexity_sampler.py b/src/evals/samplers/applied_samplers/perplexity_sampler.py new file mode 100644 index 0000000..872be75 --- /dev/null +++ b/src/evals/samplers/applied_samplers/perplexity_sampler.py @@ -0,0 +1,53 @@ +import os +from typing import Any + +from perplexity import Perplexity + +from evals.samplers.base_samplers.base_sdk_sampler import BaseSDKSampler + + +class PerplexitySearchSampler(BaseSDKSampler): + def __init__( + self, + sampler_name: str, + api_key: str = None, + timeout: float = 60.0, + max_tokens_per_page: int = None, + max_tokens: int = 12000, # Limit max tokens per page to not overload synthesis model + max_concurrency: int = 10, + ): + self.max_tokens_per_page = max_tokens_per_page + self.max_tokens = max_tokens + + super().__init__( + sampler_name=sampler_name, + api_key=api_key, + timeout=timeout, + max_concurrency=max_concurrency, + ) + + if api_key is None: + print("No API key provided for Perplexity") + + def _initialize_client(self): + """Initialize Perplexity SDK client""" + self.client = Perplexity(api_key=self.api_key) + + def _get_search_results_impl(self, query): + return self.client.search.create( + query=query, + max_results=10, + max_tokens_per_page=self.max_tokens_per_page, + max_tokens=self.max_tokens, + ) + + def format_results(self, results: Any) -> list[str]: + formatted_results = [] + for result in results.results: + title = result.title + url = result.url + snippet = result.snippet + # if url not in contaminated_urls: + formatted_results.append(f"[{title}]({url})\n{snippet}\n") + + return formatted_results diff --git a/src/evals/samplers/applied_samplers/tavily_sampler.py b/src/evals/samplers/applied_samplers/tavily_sampler.py index 3e0ba31..42deb9c 100644 --- a/src/evals/samplers/applied_samplers/tavily_sampler.py +++ b/src/evals/samplers/applied_samplers/tavily_sampler.py @@ -1,6 +1,6 @@ """Run evals using the Tavily SDK""" -from typing import Any, Dict +from typing import Any from tavily import TavilyClient diff --git a/src/evals/samplers/applied_samplers/you_livecrawl_sampler.py b/src/evals/samplers/applied_samplers/you_livecrawl_sampler.py index 5411080..f177a23 100644 --- a/src/evals/samplers/applied_samplers/you_livecrawl_sampler.py +++ b/src/evals/samplers/applied_samplers/you_livecrawl_sampler.py @@ -15,7 +15,6 @@ def __init__( max_retries: int = 3, max_concurrency: int = 10, needs_synthesis: bool = True, - custom_args: Dict[str, Any] | None = None, ): if api_key is None: raise ValueError( diff --git a/src/evals/samplers/applied_samplers/you_search_sampler.py b/src/evals/samplers/applied_samplers/you_search_sampler.py index 2409b50..560456e 100644 --- a/src/evals/samplers/applied_samplers/you_search_sampler.py +++ b/src/evals/samplers/applied_samplers/you_search_sampler.py @@ -16,7 +16,6 @@ def __init__( max_retries: int = 3, max_concurrency: int = 10, needs_synthesis: bool = True, - custom_args: Dict[str, Any] | None = None, ): super().__init__( sampler_name=sampler_name, diff --git a/src/evals/samplers/base_samplers/base_sampler.py b/src/evals/samplers/base_samplers/base_sampler.py index 1aab1ca..5d1029e 100644 --- a/src/evals/samplers/base_samplers/base_sampler.py +++ b/src/evals/samplers/base_samplers/base_sampler.py @@ -19,14 +19,12 @@ def __init__( max_retries: int = 3, max_concurrency: int = 10, needs_synthesis: bool = True, - custom_args=None, ): self.sampler_name = sampler_name self.timeout = timeout self.max_retries = max_retries self.max_concurrency = max_concurrency self.needs_synthesis = needs_synthesis - self.custom_args = custom_args if api_key: self.api_key = api_key From be83c74a336cca37b824bd70f1fe48551411ad0f Mon Sep 17 00:00:00 2001 From: eddyn-you Date: Fri, 13 Feb 2026 15:47:45 -0800 Subject: [PATCH 15/23] Ran ruff --- src/evals/constants.py | 2 +- src/evals/eval_runner.py | 14 ++++++-------- src/evals/processing/evaluate_answer.py | 2 +- src/evals/samplers/base_samplers/base_sampler.py | 14 +++++++------- 4 files changed, 15 insertions(+), 17 deletions(-) diff --git a/src/evals/constants.py b/src/evals/constants.py index fe462ab..38d1a8b 100644 --- a/src/evals/constants.py +++ b/src/evals/constants.py @@ -131,4 +131,4 @@ "Explanation:" (How you made the decision?) "Decision:" ("TRUE" or "FALSE") Please proceed with the evaluation. -""" \ No newline at end of file +""" diff --git a/src/evals/eval_runner.py b/src/evals/eval_runner.py index cca307d..9dcad1c 100644 --- a/src/evals/eval_runner.py +++ b/src/evals/eval_runner.py @@ -39,7 +39,9 @@ def get_sampler(sampler_name: str): """Initialize requested samplers""" sampler = next((sampler for sampler in samplers.SAMPLERS if sampler.sampler_name == sampler_name), None) if sampler is None: - raise ValueError(f"Sampler '{sampler_name}' not found. Available samplers: {[sampler.sampler_name for sampler in samplers.SAMPLERS]}") + raise ValueError( + f"Sampler '{sampler_name}' not found. Available samplers: {[sampler.sampler_name for sampler in samplers.SAMPLERS]}" + ) return sampler @@ -72,9 +74,7 @@ async def process_query_with_semaphore(semaphore, sampler, target_query, target_ def get_dataset(dataset_name): - dataset = next( - (dataset for dataset in datasets.DATASETS if dataset.dataset_name == dataset_name), None - ) + dataset = next((dataset for dataset in datasets.DATASETS if dataset.dataset_name == dataset_name), None) dataset.df = pd.read_csv(dataset.csv_path) if dataset is None: raise ValueError(f"Dataset '{dataset_name}' not recognized, run python src/evals/eval_runner.py --help for available datasets") @@ -111,9 +111,7 @@ async def run_evals( for sampler_name in args.samplers: sampler = get_sampler(sampler_name) # Only run on problems that are not already in results folder - remaining_problems = get_remaining_problems( - dataset=dataset, sampler=sampler, results_dir=results_dir - ) + remaining_problems = get_remaining_problems(dataset=dataset, sampler=sampler, results_dir=results_dir) if len(remaining_problems) == 0: logging.info(f"No problems remaining for sampler {sampler.sampler_name}, moving on...") results[sampler.sampler_name] = pd.read_csv(get_sampler_filepath(sampler, dataset, results_dir)) @@ -144,7 +142,7 @@ async def run_evals( target_query=query, target_ground_truth=ground_truth, dataset=dataset, - ) + ) ) tasks.append(task) diff --git a/src/evals/processing/evaluate_answer.py b/src/evals/processing/evaluate_answer.py index 21a70d9..bb47a37 100644 --- a/src/evals/processing/evaluate_answer.py +++ b/src/evals/processing/evaluate_answer.py @@ -121,4 +121,4 @@ async def evaluate_single_frames(self, question: str, target: str, predicted_ans "is_correct": is_correct, "is_incorrect": is_incorrect, "score": is_correct, - } \ No newline at end of file + } diff --git a/src/evals/samplers/base_samplers/base_sampler.py b/src/evals/samplers/base_samplers/base_sampler.py index 5d1029e..23eb674 100644 --- a/src/evals/samplers/base_samplers/base_sampler.py +++ b/src/evals/samplers/base_samplers/base_sampler.py @@ -89,15 +89,15 @@ async def __call__(self, query_input, dataset: dict, ground_truth: str = "", ove try: # Run synchronous SDK call in thread pool raw_results = await asyncio.to_thread(self.get_search_results, query) - if self.sampler_name == 'you_search_livecrawl': + if self.sampler_name == "you_search_livecrawl": internal_response_time_ms = round(raw_results["metadata"]["latency"] * 1000, 2) # Convert to ms - elif self.sampler_name == 'you_search': + elif self.sampler_name == "you_search": internal_response_time_ms = round(raw_results.metadata.latency * 1000, 2) # Convert to ms - elif 'tavily' in self.sampler_name: - internal_response_time_ms = round(raw_results['response_time'] * 1000, 2) # Convert to ms - elif 'exa' in self.sampler_name: - # Exa does not return internal run time, best we can do is API call time - internal_response_time_ms = round((time.time() - end_to_end_start_time) * 1000, 2) # Convert to ms + elif "tavily" in self.sampler_name: + internal_response_time_ms = round(raw_results["response_time"] * 1000, 2) # Convert to ms + elif "perplexity" in self.sampler_name: + # Perplexity is often empty + internal_response_time_ms = raw_results.server_time formatted_results = self.format_results(raw_results) except Exception as e: From da61dd33671cba67c62163c5ddc390e737ee8a50 Mon Sep 17 00:00:00 2001 From: eddyn-you Date: Fri, 13 Feb 2026 15:53:47 -0800 Subject: [PATCH 16/23] Run all datasets by default --- src/evals/eval_runner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/evals/eval_runner.py b/src/evals/eval_runner.py index 9dcad1c..7f79715 100644 --- a/src/evals/eval_runner.py +++ b/src/evals/eval_runner.py @@ -196,6 +196,7 @@ async def main(): ) parser.add_argument( "--datasets", + default=available_datasets, type=str, nargs="+", required=True, From 096f653e733dcc56ab11dbe2bc9a1a2096f6f639 Mon Sep 17 00:00:00 2001 From: eddyn-you Date: Fri, 13 Feb 2026 15:54:09 -0800 Subject: [PATCH 17/23] Run all datasets by default --- src/evals/eval_runner.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/evals/eval_runner.py b/src/evals/eval_runner.py index 7f79715..b54c826 100644 --- a/src/evals/eval_runner.py +++ b/src/evals/eval_runner.py @@ -199,7 +199,6 @@ async def main(): default=available_datasets, type=str, nargs="+", - required=True, help=f"The dataset(s) to eval against (can specify multiple). Select from {available_datasets}", ) parser.add_argument( From 3145c989e6bfcba9cb51faab87b90a87503acc5d Mon Sep 17 00:00:00 2001 From: eddyn-you Date: Fri, 13 Feb 2026 15:59:34 -0800 Subject: [PATCH 18/23] Update you snippets name --- src/evals/configs/samplers.py | 6 +++--- src/evals/samplers/applied_samplers/you_search_sampler.py | 2 +- src/evals/samplers/base_samplers/base_sampler.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/evals/configs/samplers.py b/src/evals/configs/samplers.py index 5e77371..d2cb847 100644 --- a/src/evals/configs/samplers.py +++ b/src/evals/configs/samplers.py @@ -5,7 +5,7 @@ from evals.samplers.applied_samplers.perplexity_sampler import PerplexitySearchSampler from evals.samplers.applied_samplers.tavily_sampler import TavilySampler from evals.samplers.applied_samplers.you_livecrawl_sampler import YouLivecrawlSampler -from evals.samplers.applied_samplers.you_search_sampler import YouSearchSampler +from evals.samplers.applied_samplers.you_search_sampler import YouSearchSnippetsSampler SAMPLERS = [ @@ -13,8 +13,8 @@ sampler_name="you_search_livecrawl", api_key=os.getenv("YOU_API_KEY"), ), - YouSearchSampler( - sampler_name="you_search", + YouSearchSnippetsSampler( + sampler_name="you_search_snippets", api_key=os.getenv("YOU_API_KEY"), ), ExaSampler( diff --git a/src/evals/samplers/applied_samplers/you_search_sampler.py b/src/evals/samplers/applied_samplers/you_search_sampler.py index 560456e..55e753e 100644 --- a/src/evals/samplers/applied_samplers/you_search_sampler.py +++ b/src/evals/samplers/applied_samplers/you_search_sampler.py @@ -7,7 +7,7 @@ from evals.samplers.base_samplers.base_sdk_sampler import BaseSDKSampler -class YouSearchSampler(BaseSDKSampler): +class YouSearchSnippetsSampler(BaseSDKSampler): def __init__( self, sampler_name: str, diff --git a/src/evals/samplers/base_samplers/base_sampler.py b/src/evals/samplers/base_samplers/base_sampler.py index 23eb674..5ddea37 100644 --- a/src/evals/samplers/base_samplers/base_sampler.py +++ b/src/evals/samplers/base_samplers/base_sampler.py @@ -91,7 +91,7 @@ async def __call__(self, query_input, dataset: dict, ground_truth: str = "", ove raw_results = await asyncio.to_thread(self.get_search_results, query) if self.sampler_name == "you_search_livecrawl": internal_response_time_ms = round(raw_results["metadata"]["latency"] * 1000, 2) # Convert to ms - elif self.sampler_name == "you_search": + elif self.sampler_name == "you_search_snippets": internal_response_time_ms = round(raw_results.metadata.latency * 1000, 2) # Convert to ms elif "tavily" in self.sampler_name: internal_response_time_ms = round(raw_results["response_time"] * 1000, 2) # Convert to ms From aa1f96364ee9d93134d6a6c360480f8f600f5cb1 Mon Sep 17 00:00:00 2001 From: eddyn-you Date: Fri, 13 Feb 2026 16:54:06 -0800 Subject: [PATCH 19/23] Add text param to exa --- src/evals/samplers/applied_samplers/exa_sampler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/evals/samplers/applied_samplers/exa_sampler.py b/src/evals/samplers/applied_samplers/exa_sampler.py index 83d8d30..f14ca5b 100644 --- a/src/evals/samplers/applied_samplers/exa_sampler.py +++ b/src/evals/samplers/applied_samplers/exa_sampler.py @@ -18,6 +18,7 @@ def __init__( needs_synthesis: bool = True, text: bool = False, ): + self.text: bool = text super().__init__( sampler_name=sampler_name, api_key=api_key, From 25f8f03283f4cd413930d227dab96e22f98234de Mon Sep 17 00:00:00 2001 From: eddyn-you Date: Fri, 13 Feb 2026 20:14:06 -0800 Subject: [PATCH 20/23] Fix bug on reporting metrics during failure --- src/evals/eval_results_analyzer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/evals/eval_results_analyzer.py b/src/evals/eval_results_analyzer.py index e97f7fb..f39816c 100644 --- a/src/evals/eval_results_analyzer.py +++ b/src/evals/eval_results_analyzer.py @@ -53,8 +53,8 @@ def write_metrics(results_dir: Optional[Path] = None): df_sampler_results = pd.read_csv(sampler_results_file) successful_df = df_sampler_results[df_sampler_results["query"] != "FAILED"] - avg_internal_latency = pd.to_numeric(successful_df["internal_response_time_ms"]).mean() - avg_end_to_end_latency = pd.to_numeric(successful_df["end_to_end_time_ms"]).mean() + avg_internal_latency = pd.to_numeric(successful_df["internal_response_time_ms"], errors="coerce").mean() + avg_end_to_end_latency = pd.to_numeric(successful_df["end_to_end_time_ms"], errors="coerce").mean() correct = len(df_sampler_results[df_sampler_results["evaluation_result"] == "is_correct"]) count_answered = len(successful_df) From faf82123d28bf89280d0503f2ec92b68c3ffbfb9 Mon Sep 17 00:00:00 2001 From: eddyn-you Date: Fri, 13 Feb 2026 20:24:03 -0800 Subject: [PATCH 21/23] Update score correctness --- src/evals/eval_results_analyzer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/evals/eval_results_analyzer.py b/src/evals/eval_results_analyzer.py index f39816c..f5d4a5a 100644 --- a/src/evals/eval_results_analyzer.py +++ b/src/evals/eval_results_analyzer.py @@ -51,7 +51,7 @@ def write_metrics(results_dir: Optional[Path] = None): dataset_name = sampler_results_file.split("dataset_")[1].split("_raw_results")[0] sampler_name = sampler_results_file.split("raw_results_")[-1].split(".")[0] df_sampler_results = pd.read_csv(sampler_results_file) - successful_df = df_sampler_results[df_sampler_results["query"] != "FAILED"] + successful_df = df_sampler_results[df_sampler_results["generated_answer"] != "FAILED"] avg_internal_latency = pd.to_numeric(successful_df["internal_response_time_ms"], errors="coerce").mean() avg_end_to_end_latency = pd.to_numeric(successful_df["end_to_end_time_ms"], errors="coerce").mean() From a221895b9055fd9c2ad2cdb05600ca9a3f4fd0a7 Mon Sep 17 00:00:00 2001 From: eddyn-you Date: Fri, 13 Feb 2026 20:25:22 -0800 Subject: [PATCH 22/23] Sort output --- src/evals/eval_results_analyzer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/evals/eval_results_analyzer.py b/src/evals/eval_results_analyzer.py index f5d4a5a..985bb5e 100644 --- a/src/evals/eval_results_analyzer.py +++ b/src/evals/eval_results_analyzer.py @@ -73,7 +73,7 @@ def write_metrics(results_dir: Optional[Path] = None): }) write_path = results_dir / "analyzed_results.csv" - metric_df = pd.DataFrame(metric_rows) + metric_df = pd.DataFrame(metric_rows).sort_values(["dataset", "provider", "accuracy_score"], ascending=False) metric_df.to_csv(write_path, index=False) print(f"Results were written to {write_path}") print(metric_df) From db4b77b41510957c03ce0fda507ad25e4fa70763 Mon Sep 17 00:00:00 2001 From: eddyn-you Date: Fri, 13 Feb 2026 20:29:08 -0800 Subject: [PATCH 23/23] Sort output --- src/evals/eval_results_analyzer.py | 2 +- src/evals/samplers/applied_samplers/you_livecrawl_sampler.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/evals/eval_results_analyzer.py b/src/evals/eval_results_analyzer.py index 985bb5e..295bb9a 100644 --- a/src/evals/eval_results_analyzer.py +++ b/src/evals/eval_results_analyzer.py @@ -73,7 +73,7 @@ def write_metrics(results_dir: Optional[Path] = None): }) write_path = results_dir / "analyzed_results.csv" - metric_df = pd.DataFrame(metric_rows).sort_values(["dataset", "provider", "accuracy_score"], ascending=False) + metric_df = pd.DataFrame(metric_rows).sort_values(["dataset", "accuracy_score"], ascending=False) metric_df.to_csv(write_path, index=False) print(f"Results were written to {write_path}") print(metric_df) diff --git a/src/evals/samplers/applied_samplers/you_livecrawl_sampler.py b/src/evals/samplers/applied_samplers/you_livecrawl_sampler.py index f177a23..1466da6 100644 --- a/src/evals/samplers/applied_samplers/you_livecrawl_sampler.py +++ b/src/evals/samplers/applied_samplers/you_livecrawl_sampler.py @@ -51,8 +51,8 @@ def _get_payload(self, query: str) -> Dict[str, Any]: "count": 10, "livecrawl": "all", "livecrawl_formats": "markdown", - # These parameters are in beta, and are designed to maximize performance - "num_bytes": 500000 + random.randint(1, 100), + # These parameters are in beta and are designed to maximize performance + "num_bytes": 500000, "crawl_timeout": 1, }