From 3abe94efccdd924fe997b685e68e7cae985a3f6c Mon Sep 17 00:00:00 2001
From: Anastassia Kornilova
Date: Sun, 28 Sep 2025 11:38:20 -0400
Subject: [PATCH 1/7] proposed new leaderboard schema

---
 schema/leaderboard.schema.json | 408 +++++++++++++++++++++++++++++++++
 1 file changed, 408 insertions(+)
 create mode 100644 schema/leaderboard.schema.json

diff --git a/schema/leaderboard.schema.json b/schema/leaderboard.schema.json
new file mode 100644
index 0000000..05e3dc6
--- /dev/null
+++ b/schema/leaderboard.schema.json
@@ -0,0 +1,408 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "version": "0.0.1",
+  "type": "object",
+  "description": "Schema for storing and validating LLMs evaluation data, including model configuration, prompts, instances, Output, and evaluation metrics",
+  "required": [
+    "schema_version",
+    "evaluation_id",
+    "model_info",
+    "evaluation_results"
+  ],
+  "properties": {
+    "schema_version": {
+      "type": "string",
+      "description": "Version of the schema used for this evaluation data"
+    },
+    "evaluation_id": {
+      "type": "string",
+      "description": "Unique identifier for this specific evaluation run"
+    },
+    "model_info": {
+      "type": "object",
+      "description": "Complete model specification including basic information, technical configuration and inference settings",
+      "required": [
+        "name",
+        "provider_name"
+      ],
+      "properties": {
+        "name": {
+          "type": "string",
+          "description": "Model name and version (e.g., 'Llama-2-13b-chat-hf')"
+        },
+        "provider_name": {
+          "type": "string",
+          "description": "Name of the provider for the version of the model used during evaluation."
+        },
+        "family": {
+          "type": [
+            "string"
+          ],
+          "description": "Model family"
+        },
+        "developer": {
+          "type": "string",
+          "description": "Name of organization that provides the model (e.g.
'OpenAI')" + }, + "configuration": { + "type": "object", + "description": "Technical specifications and implementation details of the model - defines how the model is structured and where it's hosted", + "required": [ + "context_window" + ], + "properties": { + "architecture": { + "type": [ + "string", + "null" + ], + "enum": [ + "transformer", + "moe", + "ssm", + null + ], + "description": "Model architecture type" + }, + "parameters": { + "type": [ + "integer", + "null" + ], + "minimum": 1, + "description": "Number of parameters in billions" + }, + "context_window": { + "type": "integer", + "minimum": 1, + "description": "Maximum context window size in tokens" + }, + "is_instruct": { + "type": "boolean", + "description": "Whether the model is instruction-tuned" + }, + "hf_path": { + "type": [ + "string", + "null" + ], + "description": "HuggingFace model path" + }, + "revision": { + "type": [ + "string", + "null" + ], + "description": "Model revision/commit hash" + } + } + }, + "inference_settings": { + "type": "object", + "description": "Runtime settings and parameters for model inference - controls how the model generates outputs and performs during execution", + "required": [ + "quantization" + ], + "properties": { + "quantization": { + "type": "object", + "required": [ + "bit_precision", + "method" + ], + "properties": { + "bit_precision": { + "type": "string", + "enum": [ + "none", + "int8", + "int4", + "float16", + "float32" + ], + "description": "Quantization bit precision" + }, + "method": { + "type": "string", + "enum": [ + "None", + "dynamic", + "static" + ], + "description": "Quantization method" + } + } + }, + "generation_args": { + "type": "object", + "properties": { + "use_vllm": { + "type": "boolean", + "description": "Whether VLLM was used for inference" + }, + "temperature": { + "type": [ + "null", + "number" + ], + "description": "Sampling temperature" + }, + "top_p": { + "type": [ + "null", + "number" + ], + "description": "Nucleus sampling parameter" + }, + "top_k": { + "type": [ + "null", + "number" + ], + "description": "Top-k sampling parameter" + }, + "max_tokens": { + "type": "integer", + "minimum": 1, + "description": "Maximum number of tokens to generate" + }, + "stop_sequences": { + "type": "array", + "description": "Sequences that stop generation", + "items": { + "type": "string" + }, + "default": [] + } + } + } + } + } + } + }, + "evaluation_results": { + "type": "array", + "description": "Array of evaluation results", + "items": { + "type": "object", + "required": [ + "evaluation_name", + "metric_config", + "score_details" + ], + "properties": { + "evaluation_name": { + "type": "string", + "description": "Name of the evaluation" + }, + "metric_config": { + "type": "object", + "description": "Details about the metric", + "required": [ + "lower_is_better" + ], + "properties": { + "evaluation_description": { + "type": "string", + "description": "Description of the evaluation" + }, + "lower_is_better": { + "type": "boolean", + "description": "Whether a lower score is better" + }, + "min_score": { + "type": "number", + "description": "Minimum possible score" + }, + "max_score": { + "type": "number", + "description": "Maximum possible score" + } + } + }, + "score_details": { + "type": "string", + "description": "The score for the evaluation and relted details", + "properties": { + "score": { + "type": "number", + "description": "The score for the evaluation" + }, + "details": { + "type": "string", + "description": "Any additional details about the score" + } + } + 
}, + "generation_config": { + "type": "string", + "description": "Details about how the scores were generated", + "prompt_config": { + "type": "object", + "description": "Configuration of the prompt template and formatting", + "required": [ + "prompt_class" + ], + "properties": { + "prompt_class": { + "type": "string", + "description": "Type of task and its formatting requirements", + "enum": [ + "MultipleChoice", + "OpenEnded", + "Completion" + ] + }, + "dimensions": { + "type": "object", + "description": "Format-specific configuration dimensions", + "required": [ + "choices_order", + "enumerator", + "instruction_phrasing", + "separator", + "shots" + ], + "properties": { + "choices_order": { + "type": "object", + "required": [ + "method", + "description" + ], + "properties": { + "method": { + "type": "string", + "description": "The method to use for ordering choices" + }, + "description": { + "type": "string", + "description": "Detailed explanation of the ordering method" + } + } + }, + "demonstrations": { + "type": "array", + "description": "Array of demonstration examples used in few-shot prompting", + "default": [] + }, + "enumerator": { + "type": "string", + "description": "Style of enumeration for multiple choice options", + "enum": [ + "capitals", + "lowercase", + "numbers", + "roman", + "keyboard", + "greek" + ] + }, + "instruction_phrasing": { + "type": "object", + "required": [ + "name", + "text" + ], + "properties": { + "name": { + "type": "string", + "description": "Name of the instruction template" + }, + "text": { + "type": "string", + "description": "Template text with placeholders for question and choices (or more)" + } + } + }, + "separator": { + "type": "string", + "description": "Character(s) used to separate multiple choice options", + "enum": [ + "\\s", + "\n", + ", ", + "; ", + " | ", + " OR ", + " or " + ] + }, + "shots": { + "type": "integer", + "description": "Number of examples provided in the prompt", + "minimum": 0, + "maximum": 10 + } + } + } + } + }, + "evaluation_method": { + "type": "object", + "description": "Evaluation metrics and ground truth", + "required": [ + "evaluation_method" + ], + "properties": { + "evaluation_method": { + "type": "object", + "description": "Method used to evaluate the answer, including predefined methods and user-defined methods.", + "properties": { + "method_name": { + "type": "string", + "description": "Name of the evaluation method. Can be a predefined method or a user-defined method." + }, + "description": { + "type": "string", + "description": "Detailed explanation of how the evaluation method works. For user-defined methods, this is required." + }, + "parameters": { + "type": "object", + "description": "Optional parameters used by the evaluation method. Allows custom configuration.", + "additionalProperties": true + } + }, + "required": [ + "method_name", + "description" + ], + "if": { + "properties": { + "method_name": { + "enum": [ + "label_only_match", + "content_similarity" + ] + } + } + }, + "then": { + "properties": { + "description": { + "type": "string", + "enum": [ + "Compares only the choice identifier/label to evaluate the response.", + "Finds the most similar answer among the given choices by comparing the textual content" + ] + } + } + }, + "else": { + "properties": { + "description": { + "type": "string", + "description": "Explanation of the custom evaluation method." 
+ } + } + } + } + } + } + } + } + } + + } + } +} From a189e5e93903a0af04037615f409e89fe9f78a49 Mon Sep 17 00:00:00 2001 From: Anastassia Kornilova Date: Sun, 28 Sep 2025 11:55:42 -0400 Subject: [PATCH 2/7] Add config for sample level data --- schema/leaderboard.schema.json | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/schema/leaderboard.schema.json b/schema/leaderboard.schema.json index 05e3dc6..d2a7ae0 100644 --- a/schema/leaderboard.schema.json +++ b/schema/leaderboard.schema.json @@ -222,7 +222,7 @@ }, "score_details": { "type": "string", - "description": "The score for the evaluation and relted details", + "description": "The score for the evaluation and related details", "properties": { "score": { "type": "number", @@ -234,6 +234,27 @@ } } }, + "sample_level_data": { + "type": "array", + "description": "Sample level results for items used in evaluation", + "items": { + "type": "object", + "required": [ + "sample_id", + "score" + ], + "properties": { + "sample_id": { + "type": "string", + "description": "Unique identifier for the sample" + }, + "score": { + "type": "number", + "description": "Score for the sample" + } + } + } + }, "generation_config": { "type": "string", "description": "Details about how the scores were generated", From 2dbf00ebc62d9ec0b26217fb0c48b3bdf756b143 Mon Sep 17 00:00:00 2001 From: Anastassia Kornilova Date: Sun, 28 Sep 2025 12:03:45 -0400 Subject: [PATCH 3/7] fix indent + add score level names --- schema/leaderboard.schema.json | 202 ++++++++++++++++++--------------- 1 file changed, 109 insertions(+), 93 deletions(-) diff --git a/schema/leaderboard.schema.json b/schema/leaderboard.schema.json index d2a7ae0..1c1fdc2 100644 --- a/schema/leaderboard.schema.json +++ b/schema/leaderboard.schema.json @@ -210,6 +210,22 @@ "type": "boolean", "description": "Whether a lower score is better" }, + "score_type": { + "type": "string", + "description": "Type of score", + "enum": [ + "binary", + "continuous", + "levels" + ] + }, + "score_level_names": { + "type": "array", + "description": "Names of the score levels", + "items": { + "type": "string" + } + }, "min_score": { "type": "number", "description": "Minimum possible score" @@ -259,105 +275,105 @@ "type": "string", "description": "Details about how the scores were generated", "prompt_config": { - "type": "object", - "description": "Configuration of the prompt template and formatting", - "required": [ - "prompt_class" - ], - "properties": { - "prompt_class": { - "type": "string", - "description": "Type of task and its formatting requirements", - "enum": [ - "MultipleChoice", - "OpenEnded", - "Completion" - ] - }, - "dimensions": { - "type": "object", - "description": "Format-specific configuration dimensions", - "required": [ - "choices_order", - "enumerator", - "instruction_phrasing", - "separator", - "shots" - ], - "properties": { - "choices_order": { - "type": "object", - "required": [ - "method", - "description" - ], - "properties": { - "method": { - "type": "string", - "description": "The method to use for ordering choices" - }, - "description": { - "type": "string", - "description": "Detailed explanation of the ordering method" + "type": "object", + "description": "Configuration of the prompt template and formatting", + "required": [ + "prompt_class" + ], + "properties": { + "prompt_class": { + "type": "string", + "description": "Type of task and its formatting requirements", + "enum": [ + "MultipleChoice", + "OpenEnded", + "Completion" + ] + }, + "dimensions": { + 
"type": "object", + "description": "Format-specific configuration dimensions", + "required": [ + "choices_order", + "enumerator", + "instruction_phrasing", + "separator", + "shots" + ], + "properties": { + "choices_order": { + "type": "object", + "required": [ + "method", + "description" + ], + "properties": { + "method": { + "type": "string", + "description": "The method to use for ordering choices" + }, + "description": { + "type": "string", + "description": "Detailed explanation of the ordering method" + } } - } - }, - "demonstrations": { - "type": "array", - "description": "Array of demonstration examples used in few-shot prompting", - "default": [] - }, - "enumerator": { - "type": "string", - "description": "Style of enumeration for multiple choice options", - "enum": [ - "capitals", - "lowercase", - "numbers", - "roman", - "keyboard", - "greek" - ] - }, - "instruction_phrasing": { - "type": "object", - "required": [ - "name", - "text" - ], - "properties": { - "name": { - "type": "string", - "description": "Name of the instruction template" - }, - "text": { - "type": "string", - "description": "Template text with placeholders for question and choices (or more)" + }, + "demonstrations": { + "type": "array", + "description": "Array of demonstration examples used in few-shot prompting", + "default": [] + }, + "enumerator": { + "type": "string", + "description": "Style of enumeration for multiple choice options", + "enum": [ + "capitals", + "lowercase", + "numbers", + "roman", + "keyboard", + "greek" + ] + }, + "instruction_phrasing": { + "type": "object", + "required": [ + "name", + "text" + ], + "properties": { + "name": { + "type": "string", + "description": "Name of the instruction template" + }, + "text": { + "type": "string", + "description": "Template text with placeholders for question and choices (or more)" + } } + }, + "separator": { + "type": "string", + "description": "Character(s) used to separate multiple choice options", + "enum": [ + "\\s", + "\n", + ", ", + "; ", + " | ", + " OR ", + " or " + ] + }, + "shots": { + "type": "integer", + "description": "Number of examples provided in the prompt", + "minimum": 0, + "maximum": 10 } - }, - "separator": { - "type": "string", - "description": "Character(s) used to separate multiple choice options", - "enum": [ - "\\s", - "\n", - ", ", - "; ", - " | ", - " OR ", - " or " - ] - }, - "shots": { - "type": "integer", - "description": "Number of examples provided in the prompt", - "minimum": 0, - "maximum": 10 } } } - } }, "evaluation_method": { "type": "object", From 8ceca24d8260b98d48b701834362b9ef2900050a Mon Sep 17 00:00:00 2001 From: Anastassia Kornilova Date: Mon, 29 Sep 2025 15:38:27 -0400 Subject: [PATCH 4/7] remove some unused keys --- schema/leaderboard.schema.json | 59 ---------------------------------- 1 file changed, 59 deletions(-) diff --git a/schema/leaderboard.schema.json b/schema/leaderboard.schema.json index 1c1fdc2..f3443b7 100644 --- a/schema/leaderboard.schema.json +++ b/schema/leaderboard.schema.json @@ -34,69 +34,10 @@ "type": "string", "description": "Name of the provider for the version of the model used during evaluation." }, - "family": { - "type": [ - "string" - ], - "description": "Model family" - }, "developer": { "type": "string", "description": "Name of organization that provides the model (e.g. 
'OpenAI')" }, - "configuration": { - "type": "object", - "description": "Technical specifications and implementation details of the model - defines how the model is structured and where it's hosted", - "required": [ - "context_window" - ], - "properties": { - "architecture": { - "type": [ - "string", - "null" - ], - "enum": [ - "transformer", - "moe", - "ssm", - null - ], - "description": "Model architecture type" - }, - "parameters": { - "type": [ - "integer", - "null" - ], - "minimum": 1, - "description": "Number of parameters in billions" - }, - "context_window": { - "type": "integer", - "minimum": 1, - "description": "Maximum context window size in tokens" - }, - "is_instruct": { - "type": "boolean", - "description": "Whether the model is instruction-tuned" - }, - "hf_path": { - "type": [ - "string", - "null" - ], - "description": "HuggingFace model path" - }, - "revision": { - "type": [ - "string", - "null" - ], - "description": "Model revision/commit hash" - } - } - }, "inference_settings": { "type": "object", "description": "Runtime settings and parameters for model inference - controls how the model generates outputs and performs during execution", From 64b190ea03f2cc3c5ce4f8b237c53d0671b14022 Mon Sep 17 00:00:00 2001 From: Anastassia Kornilova Date: Mon, 29 Sep 2025 15:49:05 -0400 Subject: [PATCH 5/7] Additional cleanup --- schema/leaderboard.schema.json | 52 ++++++---------------------------- 1 file changed, 8 insertions(+), 44 deletions(-) diff --git a/schema/leaderboard.schema.json b/schema/leaderboard.schema.json index f3443b7..f3f9c16 100644 --- a/schema/leaderboard.schema.json +++ b/schema/leaderboard.schema.json @@ -41,46 +41,14 @@ "inference_settings": { "type": "object", "description": "Runtime settings and parameters for model inference - controls how the model generates outputs and performs during execution", - "required": [ - "quantization" - ], "properties": { - "quantization": { - "type": "object", - "required": [ - "bit_precision", - "method" - ], - "properties": { - "bit_precision": { - "type": "string", - "enum": [ - "none", - "int8", - "int4", - "float16", - "float32" - ], - "description": "Quantization bit precision" - }, - "method": { - "type": "string", - "enum": [ - "None", - "dynamic", - "static" - ], - "description": "Quantization method" - } - } + "quantization_method": { + "type": "string", + "description": "Quantization method used for the model (e.g GPTQ)" }, "generation_args": { "type": "object", "properties": { - "use_vllm": { - "type": "boolean", - "description": "Whether VLLM was used for inference" - }, "temperature": { "type": [ "null", @@ -106,16 +74,9 @@ "type": "integer", "minimum": 1, "description": "Maximum number of tokens to generate" - }, - "stop_sequences": { - "type": "array", - "description": "Sequences that stop generation", - "items": { - "type": "string" - }, - "default": [] } - } + }, + "additionalProperties": true } } } @@ -180,6 +141,9 @@ "score_details": { "type": "string", "description": "The score for the evaluation and related details", + "required": [ + "score" + ], "properties": { "score": { "type": "number", From ccd05dddf1d378d84ed9122b69808d4882a7050d Mon Sep 17 00:00:00 2001 From: Anastassia Kornilova Date: Mon, 29 Sep 2025 15:52:36 -0400 Subject: [PATCH 6/7] add Python schema --- schema/leaderboard.schema.json | 6 +- schema/leaderboard_eval_types.py | 110 +++++++++++++++++++++++++++++++ 2 files changed, 115 insertions(+), 1 deletion(-) create mode 100644 schema/leaderboard_eval_types.py diff --git 
a/schema/leaderboard.schema.json b/schema/leaderboard.schema.json index f3f9c16..6fd8a54 100644 --- a/schema/leaderboard.schema.json +++ b/schema/leaderboard.schema.json @@ -23,13 +23,17 @@ "description": "Complete model specification including basic information, technical configuration and inference settings", "required": [ "name", - "provider_name" + "source_url" ], "properties": { "name": { "type": "string", "description": "Model name and version (e.g., 'Llama-2-13b-chat-hf')" }, + "source_url": { + "type": "string", + "description": "URL for the source of the evaluation data" + }, "provider_name": { "type": "string", "description": "Name of the provider for the version of the model used during evaluation." diff --git a/schema/leaderboard_eval_types.py b/schema/leaderboard_eval_types.py new file mode 100644 index 0000000..6281d72 --- /dev/null +++ b/schema/leaderboard_eval_types.py @@ -0,0 +1,110 @@ +# generated by datamodel-codegen: +# filename: leaderboard.schema.json +# timestamp: 2025-09-29T19:52:18+00:00 + +from __future__ import annotations + +from enum import Enum +from typing import List, Optional + +from pydantic import BaseModel, ConfigDict, Field, conint + + +class GenerationArgs(BaseModel): + model_config = ConfigDict( + extra='allow', + ) + temperature: Optional[float] = Field(None, description='Sampling temperature') + top_p: Optional[float] = Field(None, description='Nucleus sampling parameter') + top_k: Optional[float] = Field(None, description='Top-k sampling parameter') + max_tokens: Optional[conint(ge=1)] = Field( + None, description='Maximum number of tokens to generate' + ) + + +class InferenceSettings(BaseModel): + quantization_method: Optional[str] = Field( + None, description='Quantization method used for the model (e.g GPTQ)' + ) + generation_args: Optional[GenerationArgs] = None + + +class ModelInfo(BaseModel): + name: str = Field( + ..., description="Model name and version (e.g., 'Llama-2-13b-chat-hf')" + ) + source_url: str = Field( + ..., description='URL for the source of the evaluation data' + ) + provider_name: Optional[str] = Field( + None, + description='Name of the provider for the version of the model used during evaluation.', + ) + developer: Optional[str] = Field( + None, description="Name of organization that provides the model (e.g. 
'OpenAI')" + ) + inference_settings: Optional[InferenceSettings] = Field( + None, + description='Runtime settings and parameters for model inference - controls how the model generates outputs and performs during execution', + ) + + +class ScoreType(Enum): + binary = 'binary' + continuous = 'continuous' + levels = 'levels' + + +class MetricConfig(BaseModel): + evaluation_description: Optional[str] = Field( + None, description='Description of the evaluation' + ) + lower_is_better: bool = Field(..., description='Whether a lower score is better') + score_type: Optional[ScoreType] = Field(None, description='Type of score') + score_level_names: Optional[List[str]] = Field( + None, description='Names of the score levels' + ) + min_score: Optional[float] = Field(None, description='Minimum possible score') + max_score: Optional[float] = Field(None, description='Maximum possible score') + + +class ScoreDetails(BaseModel): + score: float = Field(..., description='The score for the evaluation') + details: Optional[str] = Field( + None, description='Any additional details about the score' + ) + + +class SampleLevelDatum(BaseModel): + sample_id: str = Field(..., description='Unique identifier for the sample') + score: float = Field(..., description='Score for the sample') + + +class EvaluationResult(BaseModel): + evaluation_name: str = Field(..., description='Name of the evaluation') + metric_config: MetricConfig = Field(..., description='Details about the metric') + score_details: ScoreDetails = Field( + ..., description='The score for the evaluation and related details' + ) + sample_level_data: Optional[List[SampleLevelDatum]] = Field( + None, description='Sample level results for items used in evaluation' + ) + generation_config: Optional[str] = Field( + None, description='Details about how the scores were generated' + ) + + +class LeaderboardEvaluationResult(BaseModel): + schema_version: str = Field( + ..., description='Version of the schema used for this evaluation data' + ) + evaluation_id: str = Field( + ..., description='Unique identifier for this specific evaluation run' + ) + model_info: ModelInfo = Field( + ..., + description='Complete model specification including basic information, technical configuration and inference settings', + ) + evaluation_results: List[EvaluationResult] = Field( + ..., description='Array of evaluation results' + ) From c6f56ca5211e499c5b988b58c7a675b2e1c763ed Mon Sep 17 00:00:00 2001 From: Anastassia Kornilova Date: Wed, 1 Oct 2025 13:57:35 -0400 Subject: [PATCH 7/7] updated schema --- schema/leaderboard.schema.json | 263 +++++-------------------------- schema/leaderboard_eval_types.py | 48 ++---- 2 files changed, 52 insertions(+), 259 deletions(-) diff --git a/schema/leaderboard.schema.json b/schema/leaderboard.schema.json index 6fd8a54..e02bb10 100644 --- a/schema/leaderboard.schema.json +++ b/schema/leaderboard.schema.json @@ -28,7 +28,7 @@ "properties": { "name": { "type": "string", - "description": "Model name and version (e.g., 'Llama-2-13b-chat-hf')" + "description": "Model name in HuggingFace format (e.g. meta-llama/Llama-3.1-8B-Instruct)" }, "source_url": { "type": "string", @@ -36,53 +36,15 @@ }, "provider_name": { "type": "string", - "description": "Name of the provider for the version of the model used during evaluation." + "description": "Name of the provider of the evaluation results." }, "developer": { "type": "string", "description": "Name of organization that provides the model (e.g. 
'OpenAI')" }, - "inference_settings": { - "type": "object", - "description": "Runtime settings and parameters for model inference - controls how the model generates outputs and performs during execution", - "properties": { - "quantization_method": { - "type": "string", - "description": "Quantization method used for the model (e.g GPTQ)" - }, - "generation_args": { - "type": "object", - "properties": { - "temperature": { - "type": [ - "null", - "number" - ], - "description": "Sampling temperature" - }, - "top_p": { - "type": [ - "null", - "number" - ], - "description": "Nucleus sampling parameter" - }, - "top_k": { - "type": [ - "null", - "number" - ], - "description": "Top-k sampling parameter" - }, - "max_tokens": { - "type": "integer", - "minimum": 1, - "description": "Maximum number of tokens to generate" - } - }, - "additionalProperties": true - } - } + "inference_platform": { + "type": "string", + "description": "Description of platform used to run the evaluations (e.g. local machine, Bedrock)" } } }, @@ -159,191 +121,48 @@ } } }, - "sample_level_data": { - "type": "array", - "description": "Sample level results for items used in evaluation", - "items": { - "type": "object", - "required": [ - "sample_id", - "score" - ], - "properties": { - "sample_id": { - "type": "string", - "description": "Unique identifier for the sample" - }, - "score": { - "type": "number", - "description": "Score for the sample" - } - } - } + "detailed_evaluation_results_url": { + "type": "string", + "description": "Link to detailed evaluation data" }, "generation_config": { - "type": "string", - "description": "Details about how the scores were generated", - "prompt_config": { - "type": "object", - "description": "Configuration of the prompt template and formatting", - "required": [ - "prompt_class" - ], - "properties": { - "prompt_class": { - "type": "string", - "description": "Type of task and its formatting requirements", - "enum": [ - "MultipleChoice", - "OpenEnded", - "Completion" - ] - }, - "dimensions": { - "type": "object", - "description": "Format-specific configuration dimensions", - "required": [ - "choices_order", - "enumerator", - "instruction_phrasing", - "separator", - "shots" - ], - "properties": { - "choices_order": { - "type": "object", - "required": [ - "method", - "description" - ], - "properties": { - "method": { - "type": "string", - "description": "The method to use for ordering choices" - }, - "description": { - "type": "string", - "description": "Detailed explanation of the ordering method" - } - } - }, - "demonstrations": { - "type": "array", - "description": "Array of demonstration examples used in few-shot prompting", - "default": [] - }, - "enumerator": { - "type": "string", - "description": "Style of enumeration for multiple choice options", - "enum": [ - "capitals", - "lowercase", - "numbers", - "roman", - "keyboard", - "greek" - ] - }, - "instruction_phrasing": { - "type": "object", - "required": [ - "name", - "text" - ], - "properties": { - "name": { - "type": "string", - "description": "Name of the instruction template" - }, - "text": { - "type": "string", - "description": "Template text with placeholders for question and choices (or more)" - } - } - }, - "separator": { - "type": "string", - "description": "Character(s) used to separate multiple choice options", - "enum": [ - "\\s", - "\n", - ", ", - "; ", - " | ", - " OR ", - " or " - ] - }, - "shots": { - "type": "integer", - "description": "Number of examples provided in the prompt", - "minimum": 0, - "maximum": 10 - } - } - 
} - } - }, - "evaluation_method": { - "type": "object", - "description": "Evaluation metrics and ground truth", - "required": [ - "evaluation_method" - ], - "properties": { - "evaluation_method": { - "type": "object", - "description": "Method used to evaluate the answer, including predefined methods and user-defined methods.", - "properties": { - "method_name": { - "type": "string", - "description": "Name of the evaluation method. Can be a predefined method or a user-defined method." - }, - "description": { - "type": "string", - "description": "Detailed explanation of how the evaluation method works. For user-defined methods, this is required." - }, - "parameters": { - "type": "object", - "description": "Optional parameters used by the evaluation method. Allows custom configuration.", - "additionalProperties": true - } + "type": "object", + "generation_args": { + "type": "object", + "description": "Parameters used to generate results - properties may vary by model type", + "properties": { + "temperature": { + "type": [ + "null", + "number" + ], + "description": "Sampling temperature" }, - "required": [ - "method_name", - "description" - ], - "if": { - "properties": { - "method_name": { - "enum": [ - "label_only_match", - "content_similarity" - ] - } - } + "top_p": { + "type": [ + "null", + "number" + ], + "description": "Nucleus sampling parameter" }, - "then": { - "properties": { - "description": { - "type": "string", - "enum": [ - "Compares only the choice identifier/label to evaluate the response.", - "Finds the most similar answer among the given choices by comparing the textual content" - ] - } - } + "top_k": { + "type": [ + "null", + "number" + ], + "description": "Top-k sampling parameter" }, - "else": { - "properties": { - "description": { - "type": "string", - "description": "Explanation of the custom evaluation method." - } - } + "max_tokens": { + "type": "integer", + "minimum": 1, + "description": "Maximum number of tokens to generate" } - } - } + }, + "additionalProperties": true + }, + "additional_details": { + "type": "string", + "description": "Additional details about how the results for this metric were generated." 
} } } diff --git a/schema/leaderboard_eval_types.py b/schema/leaderboard_eval_types.py index 6281d72..6607c43 100644 --- a/schema/leaderboard_eval_types.py +++ b/schema/leaderboard_eval_types.py @@ -1,51 +1,32 @@ # generated by datamodel-codegen: # filename: leaderboard.schema.json -# timestamp: 2025-09-29T19:52:18+00:00 +# timestamp: 2025-10-01T17:57:26+00:00 from __future__ import annotations from enum import Enum -from typing import List, Optional +from typing import Any, Dict, List, Optional -from pydantic import BaseModel, ConfigDict, Field, conint - - -class GenerationArgs(BaseModel): - model_config = ConfigDict( - extra='allow', - ) - temperature: Optional[float] = Field(None, description='Sampling temperature') - top_p: Optional[float] = Field(None, description='Nucleus sampling parameter') - top_k: Optional[float] = Field(None, description='Top-k sampling parameter') - max_tokens: Optional[conint(ge=1)] = Field( - None, description='Maximum number of tokens to generate' - ) - - -class InferenceSettings(BaseModel): - quantization_method: Optional[str] = Field( - None, description='Quantization method used for the model (e.g GPTQ)' - ) - generation_args: Optional[GenerationArgs] = None +from pydantic import BaseModel, Field class ModelInfo(BaseModel): name: str = Field( - ..., description="Model name and version (e.g., 'Llama-2-13b-chat-hf')" + ..., + description='Model name in HuggingFace format (e.g. meta-llama/Llama-3.1-8B-Instruct)', ) source_url: str = Field( ..., description='URL for the source of the evaluation data' ) provider_name: Optional[str] = Field( - None, - description='Name of the provider for the version of the model used during evaluation.', + None, description='Name of the provider of the evaluation results.' ) developer: Optional[str] = Field( None, description="Name of organization that provides the model (e.g. 'OpenAI')" ) - inference_settings: Optional[InferenceSettings] = Field( + inference_platform: Optional[str] = Field( None, - description='Runtime settings and parameters for model inference - controls how the model generates outputs and performs during execution', + description='Description of platform used to run the evaluations (e.g. local machine, Bedrock)', ) @@ -75,23 +56,16 @@ class ScoreDetails(BaseModel): ) -class SampleLevelDatum(BaseModel): - sample_id: str = Field(..., description='Unique identifier for the sample') - score: float = Field(..., description='Score for the sample') - - class EvaluationResult(BaseModel): evaluation_name: str = Field(..., description='Name of the evaluation') metric_config: MetricConfig = Field(..., description='Details about the metric') score_details: ScoreDetails = Field( ..., description='The score for the evaluation and related details' ) - sample_level_data: Optional[List[SampleLevelDatum]] = Field( - None, description='Sample level results for items used in evaluation' - ) - generation_config: Optional[str] = Field( - None, description='Details about how the scores were generated' + detailed_evaluation_results_url: Optional[str] = Field( + None, description='Link to detailed evaluation data' ) + generation_config: Optional[Dict[str, Any]] = None class LeaderboardEvaluationResult(BaseModel):
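
For reference, below is a minimal usage sketch of the record shape this series ends up with (PATCH 7/7), written against the generated Pydantic models in schema/leaderboard_eval_types.py. It assumes Pydantic v2 and that it is run from the repository root; every concrete value (evaluation id, model name, URLs, scores) is hypothetical and only illustrates which fields are required and that generation_config is now a free-form object.

# Minimal sketch: build one leaderboard record with the generated models and
# print the JSON shape described by leaderboard.schema.json. All concrete
# values are hypothetical; only the field names come from the schema.
from schema.leaderboard_eval_types import (
    EvaluationResult,
    LeaderboardEvaluationResult,
    MetricConfig,
    ModelInfo,
    ScoreDetails,
    ScoreType,
)

record = LeaderboardEvaluationResult(
    schema_version="0.0.1",
    evaluation_id="run-2025-10-01-001",                  # hypothetical run id
    model_info=ModelInfo(
        name="meta-llama/Llama-3.1-8B-Instruct",         # HuggingFace-format name
        source_url="https://example.com/evals/run-001",  # hypothetical source
        provider_name="example-provider",
        developer="Meta",
        inference_platform="local machine",
    ),
    evaluation_results=[
        EvaluationResult(
            evaluation_name="MMLU",
            metric_config=MetricConfig(
                evaluation_description="Accuracy on multiple-choice questions",
                lower_is_better=False,
                score_type=ScoreType.continuous,
                min_score=0.0,
                max_score=1.0,
            ),
            score_details=ScoreDetails(score=0.68, details="5-shot"),
            detailed_evaluation_results_url="https://example.com/evals/run-001/mmlu",
            # generation_config is an open object; the keys below mirror the
            # generation_args / additional_details structure sketched in the schema.
            generation_config={
                "generation_args": {"temperature": 0.0, "max_tokens": 1024},
                "additional_details": "Greedy decoding over the full test split.",
            },
        )
    ],
)

print(record.model_dump_json(indent=2, exclude_none=True))  # Pydantic v2 API

Here exclude_none keeps optional fields that were left unset out of the serialized record, which matches their optional treatment in the JSON schema.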