173 changes: 173 additions & 0 deletions schema/leaderboard.schema.json
@@ -0,0 +1,173 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"version": "0.0.1",
"type": "object",
"description": "Schema for storing and validating LLMs evaluation data, including model configuration, prompts, instances, Output, and evaluation metrics",
"required": [
"schema_version",
"evaluation_id",
"model_info",
"evaluation_results"
],
"properties": {
"schema_version": {
"type": "string",
"description": "Version of the schema used for this evaluation data"
},
"evaluation_id": {
"type": "string",
"description": "Unique identifier for this specific evaluation run"
},
"model_info": {
"type": "object",
"description": "Complete model specification including basic information, technical configuration and inference settings",
"required": [
"name",
"source_url"
],
"properties": {
"name": {
"type": "string",
"description": "Model name in HuggingFace format (e.g. meta-llama/Llama-3.1-8B-Instruct)"
},
"source_url": {
"type": "string",
"description": "URL for the source of the evaluation data"
},
"provider_name": {
"type": "string",
"description": "Name of the provider of the evaluation results."
},
"developer": {
"type": "string",
"description": "Name of organization that provides the model (e.g. 'OpenAI')"
},
"inference_platform": {
"type": "string",
"description": "Description of platform used to run the evaluations (e.g. local machine, Bedrock)"
}
}
},
"evaluation_results": {
"type": "array",
"description": "Array of evaluation results",
"items": {
"type": "object",
"required": [
"evaluation_name",
"metric_config",
"score_details"
],
"properties": {
"evaluation_name": {
"type": "string",
"description": "Name of the evaluation"
},
"metric_config": {
"type": "object",
"description": "Details about the metric",
"required": [
"lower_is_better"
],
"properties": {
"evaluation_description": {
"type": "string",
"description": "Description of the evaluation"
},
"lower_is_better": {
"type": "boolean",
"description": "Whether a lower score is better"
},
"score_type": {
"type": "string",
"description": "Type of score",
"enum": [
"binary",
"continuous",
"levels"
]
},
"score_level_names": {
"type": "array",
"description": "Names of the score levels",
"items": {
"type": "string"
}
},
"min_score": {
"type": "number",
"description": "Minimum possible score"
},
"max_score": {
"type": "number",
"description": "Maximum possible score"
}
}
},
"score_details": {
"type": "string",
"description": "The score for the evaluation and related details",
"required": [
"score"
],
"properties": {
"score": {
"type": "number",
"description": "The score for the evaluation"
},
"details": {
"type": "string",
"description": "Any additional details about the score"
}
}
},
"detailed_evaluation_results_url": {
"type": "string",
"description": "Link to detailed evaluation data"
},
"generation_config": {
"type": "object",
"generation_args": {
"type": "object",
"description": "Parameters used to generate results - properties may vary by model type",
"properties": {
"temperature": {
"type": [
"null",
"number"
],
"description": "Sampling temperature"
},
"top_p": {
"type": [
"null",
"number"
],
"description": "Nucleus sampling parameter"
},
"top_k": {
"type": [
"null",
"number"
],
"description": "Top-k sampling parameter"
},
"max_tokens": {
"type": "integer",
"minimum": 1,
"description": "Maximum number of tokens to generate"
}
},
"additionalProperties": true
},
"additional_details": {
"type": "string",
"description": "Additional details about how the results for this metric were generated."
}
}
}
}
}

}
}
}
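For reference, a minimal validation sketch against this schema (not part of the diff), assuming the third-party jsonschema package is installed and the file is read from schema/leaderboard.schema.json as added above; every field value, the evaluation name, and the URL in the example record are illustrative placeholders, not data from this PR.

# Validation sketch: checks an illustrative leaderboard record against the
# schema added in this PR using the third-party `jsonschema` package.
# All field values below are made-up placeholders.
import json
from jsonschema import validate

with open("schema/leaderboard.schema.json") as f:
    schema = json.load(f)

example = {
    "schema_version": "0.0.1",
    "evaluation_id": "2025-10-01-llama-3.1-8b-instruct",  # illustrative ID
    "model_info": {
        "name": "meta-llama/Llama-3.1-8B-Instruct",
        "source_url": "https://example.com/eval-run",  # placeholder URL
    },
    "evaluation_results": [
        {
            "evaluation_name": "example_benchmark",  # illustrative name
            "metric_config": {
                "lower_is_better": False,
                "score_type": "continuous",
                "min_score": 0,
                "max_score": 1,
            },
            "score_details": {"score": 0.87},
        }
    ],
}

# Raises jsonschema.ValidationError if the record does not conform.
validate(instance=example, schema=schema)
print("example record is valid")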
84 changes: 84 additions & 0 deletions schema/leaderboard_eval_types.py
@@ -0,0 +1,84 @@
# generated by datamodel-codegen:
# filename: leaderboard.schema.json
# timestamp: 2025-10-01T17:57:26+00:00

from __future__ import annotations

from enum import Enum
from typing import Any, Dict, List, Optional

from pydantic import BaseModel, Field


class ModelInfo(BaseModel):
name: str = Field(
...,
description='Model name in HuggingFace format (e.g. meta-llama/Llama-3.1-8B-Instruct)',
)
source_url: str = Field(
..., description='URL for the source of the evaluation data'
)
provider_name: Optional[str] = Field(
None, description='Name of the provider of the evaluation results.'
)
developer: Optional[str] = Field(
None, description="Name of organization that provides the model (e.g. 'OpenAI')"
)
inference_platform: Optional[str] = Field(
None,
description='Description of platform used to run the evaluations (e.g. local machine, Bedrock)',
)


class ScoreType(Enum):
binary = 'binary'
continuous = 'continuous'
levels = 'levels'


class MetricConfig(BaseModel):
evaluation_description: Optional[str] = Field(
None, description='Description of the evaluation'
)
lower_is_better: bool = Field(..., description='Whether a lower score is better')
score_type: Optional[ScoreType] = Field(None, description='Type of score')
score_level_names: Optional[List[str]] = Field(
None, description='Names of the score levels'
)
min_score: Optional[float] = Field(None, description='Minimum possible score')
max_score: Optional[float] = Field(None, description='Maximum possible score')


class ScoreDetails(BaseModel):
score: float = Field(..., description='The score for the evaluation')
details: Optional[str] = Field(
None, description='Any additional details about the score'
)


class EvaluationResult(BaseModel):
evaluation_name: str = Field(..., description='Name of the evaluation')
metric_config: MetricConfig = Field(..., description='Details about the metric')
score_details: ScoreDetails = Field(
..., description='The score for the evaluation and related details'
)
detailed_evaluation_results_url: Optional[str] = Field(
None, description='Link to detailed evaluation data'
)
generation_config: Optional[Dict[str, Any]] = None


class LeaderboardEvaluationResult(BaseModel):
schema_version: str = Field(
..., description='Version of the schema used for this evaluation data'
)
evaluation_id: str = Field(
..., description='Unique identifier for this specific evaluation run'
)
model_info: ModelInfo = Field(
...,
description='Complete model specification including basic information, technical configuration and inference settings',
)
evaluation_results: List[EvaluationResult] = Field(
..., description='Array of evaluation results'
)
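For reference, a usage sketch for the generated models above (not part of the generated module), assuming the file is importable as leaderboard_eval_types and pydantic v2 is installed (on v1, record.json() replaces record.model_dump_json()); the field values mirror the illustrative example earlier and are placeholders.

# Usage sketch: builds the same illustrative record with the pydantic models
# above and serializes it. Assumes the module is on the import path as
# leaderboard_eval_types; all values are placeholders.
from leaderboard_eval_types import (
    EvaluationResult,
    LeaderboardEvaluationResult,
    MetricConfig,
    ModelInfo,
    ScoreDetails,
    ScoreType,
)

record = LeaderboardEvaluationResult(
    schema_version="0.0.1",
    evaluation_id="2025-10-01-llama-3.1-8b-instruct",  # illustrative ID
    model_info=ModelInfo(
        name="meta-llama/Llama-3.1-8B-Instruct",
        source_url="https://example.com/eval-run",  # placeholder URL
    ),
    evaluation_results=[
        EvaluationResult(
            evaluation_name="example_benchmark",  # illustrative name
            metric_config=MetricConfig(
                lower_is_better=False,
                score_type=ScoreType.continuous,
                min_score=0,
                max_score=1,
            ),
            score_details=ScoreDetails(score=0.87),
        )
    ],
)

print(record.model_dump_json(indent=2))  # pydantic v2; use record.json() on v1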