173 changes: 173 additions & 0 deletions schema/leaderboard.schema.json
@@ -0,0 +1,173 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"version": "0.0.1",
"type": "object",
"description": "Schema for storing and validating LLMs evaluation data, including model configuration, prompts, instances, Output, and evaluation metrics",
"required": [
"schema_version",
"evaluation_id",
"model_info",
"evaluation_results"
],
"properties": {
"schema_version": {
"type": "string",
"description": "Version of the schema used for this evaluation data"
},
"evaluation_id": {
"type": "string",
"description": "Unique identifier for this specific evaluation run"
},
"model_info": {
"type": "object",
"description": "Complete model specification including basic information, technical configuration and inference settings",
"required": [
"name",
"source_url"
],
"properties": {
"name": {
"type": "string",
"description": "Model name in HuggingFace format (e.g. meta-llama/Llama-3.1-8B-Instruct)"
},
"source_url": {
"type": "string",
"description": "URL for the source of the evaluation data"
},
"provider_name": {
"type": "string",
"description": "Name of the provider of the evaluation results."
},
"developer": {
"type": "string",
"description": "Name of organization that provides the model (e.g. 'OpenAI')"
},
"inference_platform": {
"type": "string",
"description": "Description of platform used to run the evaluations (e.g. local machine, Bedrock)"
}
}
},
"evaluation_results": {
"type": "array",
"description": "Array of evaluation results",
"items": {
"type": "object",
"required": [
"evaluation_name",
"metric_config",
"score_details"
],
"properties": {
"evaluation_name": {
"type": "string",
"description": "Name of the evaluation"
},
"metric_config": {
"type": "object",
"description": "Details about the metric",
"required": [
"lower_is_better"
],
"properties": {
"evaluation_description": {
"type": "string",
"description": "Description of the evaluation"
},
"lower_is_better": {
"type": "boolean",
"description": "Whether a lower score is better"
},
"score_type": {
"type": "string",
"description": "Type of score",
"enum": [
"binary",
"continuous",
"levels"
]
},
"score_level_names": {
"type": "array",
"description": "Names of the score levels",
"items": {
"type": "string"
}
},
"min_score": {
"type": "number",
"description": "Minimum possible score"
},
"max_score": {
"type": "number",
"description": "Maximum possible score"
}
}
},
"score_details": {
"type": "string",
"description": "The score for the evaluation and related details",
"required": [
"score"
],
"properties": {
"score": {
"type": "number",
"description": "The score for the evaluation"
},
"details": {
"type": "string",
"description": "Any additional details about the score"
}
}
},
"detailed_evaluation_results_url": {
"type": "string",
"description": "Link to detailed evaluation data"
},
"generation_config": {
"type": "object",
"generation_args": {
"type": "object",
"description": "Parameters used to generate results - properties may vary by model type",
"properties": {
"temperature": {
"type": [
"null",
"number"
],
"description": "Sampling temperature"
},
"top_p": {
"type": [
"null",
"number"
],
"description": "Nucleus sampling parameter"
},
"top_k": {
"type": [
"null",
"number"
],
"description": "Top-k sampling parameter"
},
"max_tokens": {
"type": "integer",
"minimum": 1,
"description": "Maximum number of tokens to generate"
}
},
"additionalProperties": true
},
"additional_details": {
"type": "string",
"description": "Additional details about how the results for this metric were generated."
}
}
}
}
}

}
}
}
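For reference, a minimal validation sketch against this schema (not part of the diff), assuming the third-party jsonschema package is installed and the file is read from schema/leaderboard.schema.json as added above; every field value, the evaluation name, and the URL in the example record are illustrative placeholders, not data from this PR.

# Validation sketch: checks an illustrative leaderboard record against the
# schema added in this PR using the third-party `jsonschema` package.
# All field values below are made-up placeholders.
import json
from jsonschema import validate

with open("schema/leaderboard.schema.json") as f:
    schema = json.load(f)

example = {
    "schema_version": "0.0.1",
    "evaluation_id": "2025-10-01-llama-3.1-8b-instruct",  # illustrative ID
    "model_info": {
        "name": "meta-llama/Llama-3.1-8B-Instruct",
        "source_url": "https://example.com/eval-run",  # placeholder URL
    },
    "evaluation_results": [
        {
            "evaluation_name": "example_benchmark",  # illustrative name
            "metric_config": {
                "lower_is_better": False,
                "score_type": "continuous",
                "min_score": 0,
                "max_score": 1,
            },
            "score_details": {"score": 0.87},
        }
    ],
}

# Raises jsonschema.ValidationError if the record does not conform.
validate(instance=example, schema=schema)
print("example record is valid")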
84 changes: 84 additions & 0 deletions schema/leaderboard_eval_types.py
@@ -0,0 +1,84 @@
# generated by datamodel-codegen:
# filename: leaderboard.schema.json
# timestamp: 2025-10-01T17:57:26+00:00

from __future__ import annotations

from enum import Enum
from typing import Any, Dict, List, Optional

from pydantic import BaseModel, Field


class ModelInfo(BaseModel):
name: str = Field(
...,
description='Model name in HuggingFace format (e.g. meta-llama/Llama-3.1-8B-Instruct)',
)
source_url: str = Field(
..., description='URL for the source of the evaluation data'
)
provider_name: Optional[str] = Field(
None, description='Name of the provider of the evaluation results.'
)
developer: Optional[str] = Field(
None, description="Name of organization that provides the model (e.g. 'OpenAI')"
)
inference_platform: Optional[str] = Field(
None,
description='Description of platform used to run the evaluations (e.g. local machine, Bedrock)',
)


class ScoreType(Enum):
binary = 'binary'
continuous = 'continuous'
levels = 'levels'


class MetricConfig(BaseModel):
evaluation_description: Optional[str] = Field(
None, description='Description of the evaluation'
)
lower_is_better: bool = Field(..., description='Whether a lower score is better')
score_type: Optional[ScoreType] = Field(None, description='Type of score')
score_level_names: Optional[List[str]] = Field(
None, description='Names of the score levels'
)
min_score: Optional[float] = Field(None, description='Minimum possible score')
max_score: Optional[float] = Field(None, description='Maximum possible score')


class ScoreDetails(BaseModel):
score: float = Field(..., description='The score for the evaluation')
details: Optional[str] = Field(
None, description='Any additional details about the score'
)


class EvaluationResult(BaseModel):
evaluation_name: str = Field(..., description='Name of the evaluation')
metric_config: MetricConfig = Field(..., description='Details about the metric')
score_details: ScoreDetails = Field(
..., description='The score for the evaluation and related details'
)
detailed_evaluation_results_url: Optional[str] = Field(
None, description='Link to detailed evaluation data'
)
generation_config: Optional[Dict[str, Any]] = None


class LeaderboardEvaluationResult(BaseModel):
schema_version: str = Field(
..., description='Version of the schema used for this evaluation data'
)
evaluation_id: str = Field(
..., description='Unique identifier for this specific evaluation run'
)
model_info: ModelInfo = Field(
...,
description='Complete model specification including basic information, technical configuration and inference settings',
)
evaluation_results: List[EvaluationResult] = Field(
..., description='Array of evaluation results'
)
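For reference, a usage sketch for the generated models above (not part of the generated module), assuming the file is importable as leaderboard_eval_types and pydantic v2 is installed (on v1, record.json() replaces record.model_dump_json()); the field values mirror the illustrative example earlier and are placeholders.

# Usage sketch: builds the same illustrative record with the pydantic models
# above and serializes it. Assumes the module is on the import path as
# leaderboard_eval_types; all values are placeholders.
from leaderboard_eval_types import (
    EvaluationResult,
    LeaderboardEvaluationResult,
    MetricConfig,
    ModelInfo,
    ScoreDetails,
    ScoreType,
)

record = LeaderboardEvaluationResult(
    schema_version="0.0.1",
    evaluation_id="2025-10-01-llama-3.1-8b-instruct",  # illustrative ID
    model_info=ModelInfo(
        name="meta-llama/Llama-3.1-8B-Instruct",
        source_url="https://example.com/eval-run",  # placeholder URL
    ),
    evaluation_results=[
        EvaluationResult(
            evaluation_name="example_benchmark",  # illustrative name
            metric_config=MetricConfig(
                lower_is_better=False,
                score_type=ScoreType.continuous,
                min_score=0,
                max_score=1,
            ),
            score_details=ScoreDetails(score=0.87),
        )
    ],
)

print(record.model_dump_json(indent=2))  # pydantic v2; use record.json() on v1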