From 3abe94efccdd924fe997b685e68e7cae985a3f6c Mon Sep 17 00:00:00 2001
From: Anastassia Kornilova
Date: Sun, 28 Sep 2025 11:38:20 -0400
Subject: [PATCH 1/7] proposed new leaderboard schema

---
 schema/leaderboard.schema.json | 408 +++++++++++++++++++++++++++++++++
 1 file changed, 408 insertions(+)
 create mode 100644 schema/leaderboard.schema.json

diff --git a/schema/leaderboard.schema.json b/schema/leaderboard.schema.json
new file mode 100644
index 0000000..05e3dc6
--- /dev/null
+++ b/schema/leaderboard.schema.json
@@ -0,0 +1,408 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "version": "0.0.1",
+  "type": "object",
+  "description": "Schema for storing and validating LLMs evaluation data, including model configuration, prompts, instances, Output, and evaluation metrics",
+  "required": [
+    "schema_version",
+    "evaluation_id",
+    "model_info",
+    "evaluation_results"
+  ],
+  "properties": {
+    "schema_version": {
+      "type": "string",
+      "description": "Version of the schema used for this evaluation data"
+    },
+    "evaluation_id": {
+      "type": "string",
+      "description": "Unique identifier for this specific evaluation run"
+    },
+    "model_info": {
+      "type": "object",
+      "description": "Complete model specification including basic information, technical configuration and inference settings",
+      "required": [
+        "name",
+        "provider_name"
+      ],
+      "properties": {
+        "name": {
+          "type": "string",
+          "description": "Model name and version (e.g., 'Llama-2-13b-chat-hf')"
+        },
+        "provider_name": {
+          "type": "string",
+          "description": "Name of the provider for the version of the model used during evaluation."
+        },
+        "family": {
+          "type": [
+            "string"
+          ],
+          "description": "Model family"
+        },
+        "developer": {
+          "type": "string",
+          "description": "Name of organization that provides the model (e.g.
'OpenAI')" + }, + "configuration": { + "type": "object", + "description": "Technical specifications and implementation details of the model - defines how the model is structured and where it's hosted", + "required": [ + "context_window" + ], + "properties": { + "architecture": { + "type": [ + "string", + "null" + ], + "enum": [ + "transformer", + "moe", + "ssm", + null + ], + "description": "Model architecture type" + }, + "parameters": { + "type": [ + "integer", + "null" + ], + "minimum": 1, + "description": "Number of parameters in billions" + }, + "context_window": { + "type": "integer", + "minimum": 1, + "description": "Maximum context window size in tokens" + }, + "is_instruct": { + "type": "boolean", + "description": "Whether the model is instruction-tuned" + }, + "hf_path": { + "type": [ + "string", + "null" + ], + "description": "HuggingFace model path" + }, + "revision": { + "type": [ + "string", + "null" + ], + "description": "Model revision/commit hash" + } + } + }, + "inference_settings": { + "type": "object", + "description": "Runtime settings and parameters for model inference - controls how the model generates outputs and performs during execution", + "required": [ + "quantization" + ], + "properties": { + "quantization": { + "type": "object", + "required": [ + "bit_precision", + "method" + ], + "properties": { + "bit_precision": { + "type": "string", + "enum": [ + "none", + "int8", + "int4", + "float16", + "float32" + ], + "description": "Quantization bit precision" + }, + "method": { + "type": "string", + "enum": [ + "None", + "dynamic", + "static" + ], + "description": "Quantization method" + } + } + }, + "generation_args": { + "type": "object", + "properties": { + "use_vllm": { + "type": "boolean", + "description": "Whether VLLM was used for inference" + }, + "temperature": { + "type": [ + "null", + "number" + ], + "description": "Sampling temperature" + }, + "top_p": { + "type": [ + "null", + "number" + ], + "description": "Nucleus sampling parameter" + }, + "top_k": { + "type": [ + "null", + "number" + ], + "description": "Top-k sampling parameter" + }, + "max_tokens": { + "type": "integer", + "minimum": 1, + "description": "Maximum number of tokens to generate" + }, + "stop_sequences": { + "type": "array", + "description": "Sequences that stop generation", + "items": { + "type": "string" + }, + "default": [] + } + } + } + } + } + } + }, + "evaluation_results": { + "type": "array", + "description": "Array of evaluation results", + "items": { + "type": "object", + "required": [ + "evaluation_name", + "metric_config", + "score_details" + ], + "properties": { + "evaluation_name": { + "type": "string", + "description": "Name of the evaluation" + }, + "metric_config": { + "type": "object", + "description": "Details about the metric", + "required": [ + "lower_is_better" + ], + "properties": { + "evaluation_description": { + "type": "string", + "description": "Description of the evaluation" + }, + "lower_is_better": { + "type": "boolean", + "description": "Whether a lower score is better" + }, + "min_score": { + "type": "number", + "description": "Minimum possible score" + }, + "max_score": { + "type": "number", + "description": "Maximum possible score" + } + } + }, + "score_details": { + "type": "string", + "description": "The score for the evaluation and relted details", + "properties": { + "score": { + "type": "number", + "description": "The score for the evaluation" + }, + "details": { + "type": "string", + "description": "Any additional details about the score" + } + } + 
}, + "generation_config": { + "type": "string", + "description": "Details about how the scores were generated", + "prompt_config": { + "type": "object", + "description": "Configuration of the prompt template and formatting", + "required": [ + "prompt_class" + ], + "properties": { + "prompt_class": { + "type": "string", + "description": "Type of task and its formatting requirements", + "enum": [ + "MultipleChoice", + "OpenEnded", + "Completion" + ] + }, + "dimensions": { + "type": "object", + "description": "Format-specific configuration dimensions", + "required": [ + "choices_order", + "enumerator", + "instruction_phrasing", + "separator", + "shots" + ], + "properties": { + "choices_order": { + "type": "object", + "required": [ + "method", + "description" + ], + "properties": { + "method": { + "type": "string", + "description": "The method to use for ordering choices" + }, + "description": { + "type": "string", + "description": "Detailed explanation of the ordering method" + } + } + }, + "demonstrations": { + "type": "array", + "description": "Array of demonstration examples used in few-shot prompting", + "default": [] + }, + "enumerator": { + "type": "string", + "description": "Style of enumeration for multiple choice options", + "enum": [ + "capitals", + "lowercase", + "numbers", + "roman", + "keyboard", + "greek" + ] + }, + "instruction_phrasing": { + "type": "object", + "required": [ + "name", + "text" + ], + "properties": { + "name": { + "type": "string", + "description": "Name of the instruction template" + }, + "text": { + "type": "string", + "description": "Template text with placeholders for question and choices (or more)" + } + } + }, + "separator": { + "type": "string", + "description": "Character(s) used to separate multiple choice options", + "enum": [ + "\\s", + "\n", + ", ", + "; ", + " | ", + " OR ", + " or " + ] + }, + "shots": { + "type": "integer", + "description": "Number of examples provided in the prompt", + "minimum": 0, + "maximum": 10 + } + } + } + } + }, + "evaluation_method": { + "type": "object", + "description": "Evaluation metrics and ground truth", + "required": [ + "evaluation_method" + ], + "properties": { + "evaluation_method": { + "type": "object", + "description": "Method used to evaluate the answer, including predefined methods and user-defined methods.", + "properties": { + "method_name": { + "type": "string", + "description": "Name of the evaluation method. Can be a predefined method or a user-defined method." + }, + "description": { + "type": "string", + "description": "Detailed explanation of how the evaluation method works. For user-defined methods, this is required." + }, + "parameters": { + "type": "object", + "description": "Optional parameters used by the evaluation method. Allows custom configuration.", + "additionalProperties": true + } + }, + "required": [ + "method_name", + "description" + ], + "if": { + "properties": { + "method_name": { + "enum": [ + "label_only_match", + "content_similarity" + ] + } + } + }, + "then": { + "properties": { + "description": { + "type": "string", + "enum": [ + "Compares only the choice identifier/label to evaluate the response.", + "Finds the most similar answer among the given choices by comparing the textual content" + ] + } + } + }, + "else": { + "properties": { + "description": { + "type": "string", + "description": "Explanation of the custom evaluation method." 
+ } + } + } + } + } + } + } + } + } + + } + } +} From a189e5e93903a0af04037615f409e89fe9f78a49 Mon Sep 17 00:00:00 2001 From: Anastassia Kornilova Date: Sun, 28 Sep 2025 11:55:42 -0400 Subject: [PATCH 2/7] Add config for sample level data --- schema/leaderboard.schema.json | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/schema/leaderboard.schema.json b/schema/leaderboard.schema.json index 05e3dc6..d2a7ae0 100644 --- a/schema/leaderboard.schema.json +++ b/schema/leaderboard.schema.json @@ -222,7 +222,7 @@ }, "score_details": { "type": "string", - "description": "The score for the evaluation and relted details", + "description": "The score for the evaluation and related details", "properties": { "score": { "type": "number", @@ -234,6 +234,27 @@ } } }, + "sample_level_data": { + "type": "array", + "description": "Sample level results for items used in evaluation", + "items": { + "type": "object", + "required": [ + "sample_id", + "score" + ], + "properties": { + "sample_id": { + "type": "string", + "description": "Unique identifier for the sample" + }, + "score": { + "type": "number", + "description": "Score for the sample" + } + } + } + }, "generation_config": { "type": "string", "description": "Details about how the scores were generated", From 2dbf00ebc62d9ec0b26217fb0c48b3bdf756b143 Mon Sep 17 00:00:00 2001 From: Anastassia Kornilova Date: Sun, 28 Sep 2025 12:03:45 -0400 Subject: [PATCH 3/7] fix indent + add score level names --- schema/leaderboard.schema.json | 202 ++++++++++++++++++--------------- 1 file changed, 109 insertions(+), 93 deletions(-) diff --git a/schema/leaderboard.schema.json b/schema/leaderboard.schema.json index d2a7ae0..1c1fdc2 100644 --- a/schema/leaderboard.schema.json +++ b/schema/leaderboard.schema.json @@ -210,6 +210,22 @@ "type": "boolean", "description": "Whether a lower score is better" }, + "score_type": { + "type": "string", + "description": "Type of score", + "enum": [ + "binary", + "continuous", + "levels" + ] + }, + "score_level_names": { + "type": "array", + "description": "Names of the score levels", + "items": { + "type": "string" + } + }, "min_score": { "type": "number", "description": "Minimum possible score" @@ -259,105 +275,105 @@ "type": "string", "description": "Details about how the scores were generated", "prompt_config": { - "type": "object", - "description": "Configuration of the prompt template and formatting", - "required": [ - "prompt_class" - ], - "properties": { - "prompt_class": { - "type": "string", - "description": "Type of task and its formatting requirements", - "enum": [ - "MultipleChoice", - "OpenEnded", - "Completion" - ] - }, - "dimensions": { - "type": "object", - "description": "Format-specific configuration dimensions", - "required": [ - "choices_order", - "enumerator", - "instruction_phrasing", - "separator", - "shots" - ], - "properties": { - "choices_order": { - "type": "object", - "required": [ - "method", - "description" - ], - "properties": { - "method": { - "type": "string", - "description": "The method to use for ordering choices" - }, - "description": { - "type": "string", - "description": "Detailed explanation of the ordering method" + "type": "object", + "description": "Configuration of the prompt template and formatting", + "required": [ + "prompt_class" + ], + "properties": { + "prompt_class": { + "type": "string", + "description": "Type of task and its formatting requirements", + "enum": [ + "MultipleChoice", + "OpenEnded", + "Completion" + ] + }, + "dimensions": { + 
"type": "object", + "description": "Format-specific configuration dimensions", + "required": [ + "choices_order", + "enumerator", + "instruction_phrasing", + "separator", + "shots" + ], + "properties": { + "choices_order": { + "type": "object", + "required": [ + "method", + "description" + ], + "properties": { + "method": { + "type": "string", + "description": "The method to use for ordering choices" + }, + "description": { + "type": "string", + "description": "Detailed explanation of the ordering method" + } } - } - }, - "demonstrations": { - "type": "array", - "description": "Array of demonstration examples used in few-shot prompting", - "default": [] - }, - "enumerator": { - "type": "string", - "description": "Style of enumeration for multiple choice options", - "enum": [ - "capitals", - "lowercase", - "numbers", - "roman", - "keyboard", - "greek" - ] - }, - "instruction_phrasing": { - "type": "object", - "required": [ - "name", - "text" - ], - "properties": { - "name": { - "type": "string", - "description": "Name of the instruction template" - }, - "text": { - "type": "string", - "description": "Template text with placeholders for question and choices (or more)" + }, + "demonstrations": { + "type": "array", + "description": "Array of demonstration examples used in few-shot prompting", + "default": [] + }, + "enumerator": { + "type": "string", + "description": "Style of enumeration for multiple choice options", + "enum": [ + "capitals", + "lowercase", + "numbers", + "roman", + "keyboard", + "greek" + ] + }, + "instruction_phrasing": { + "type": "object", + "required": [ + "name", + "text" + ], + "properties": { + "name": { + "type": "string", + "description": "Name of the instruction template" + }, + "text": { + "type": "string", + "description": "Template text with placeholders for question and choices (or more)" + } } + }, + "separator": { + "type": "string", + "description": "Character(s) used to separate multiple choice options", + "enum": [ + "\\s", + "\n", + ", ", + "; ", + " | ", + " OR ", + " or " + ] + }, + "shots": { + "type": "integer", + "description": "Number of examples provided in the prompt", + "minimum": 0, + "maximum": 10 } - }, - "separator": { - "type": "string", - "description": "Character(s) used to separate multiple choice options", - "enum": [ - "\\s", - "\n", - ", ", - "; ", - " | ", - " OR ", - " or " - ] - }, - "shots": { - "type": "integer", - "description": "Number of examples provided in the prompt", - "minimum": 0, - "maximum": 10 } } } - } }, "evaluation_method": { "type": "object", From 8ceca24d8260b98d48b701834362b9ef2900050a Mon Sep 17 00:00:00 2001 From: Anastassia Kornilova Date: Mon, 29 Sep 2025 15:38:27 -0400 Subject: [PATCH 4/7] remove some unused keys --- schema/leaderboard.schema.json | 59 ---------------------------------- 1 file changed, 59 deletions(-) diff --git a/schema/leaderboard.schema.json b/schema/leaderboard.schema.json index 1c1fdc2..f3443b7 100644 --- a/schema/leaderboard.schema.json +++ b/schema/leaderboard.schema.json @@ -34,69 +34,10 @@ "type": "string", "description": "Name of the provider for the version of the model used during evaluation." }, - "family": { - "type": [ - "string" - ], - "description": "Model family" - }, "developer": { "type": "string", "description": "Name of organization that provides the model (e.g. 
'OpenAI')" }, - "configuration": { - "type": "object", - "description": "Technical specifications and implementation details of the model - defines how the model is structured and where it's hosted", - "required": [ - "context_window" - ], - "properties": { - "architecture": { - "type": [ - "string", - "null" - ], - "enum": [ - "transformer", - "moe", - "ssm", - null - ], - "description": "Model architecture type" - }, - "parameters": { - "type": [ - "integer", - "null" - ], - "minimum": 1, - "description": "Number of parameters in billions" - }, - "context_window": { - "type": "integer", - "minimum": 1, - "description": "Maximum context window size in tokens" - }, - "is_instruct": { - "type": "boolean", - "description": "Whether the model is instruction-tuned" - }, - "hf_path": { - "type": [ - "string", - "null" - ], - "description": "HuggingFace model path" - }, - "revision": { - "type": [ - "string", - "null" - ], - "description": "Model revision/commit hash" - } - } - }, "inference_settings": { "type": "object", "description": "Runtime settings and parameters for model inference - controls how the model generates outputs and performs during execution", From 64b190ea03f2cc3c5ce4f8b237c53d0671b14022 Mon Sep 17 00:00:00 2001 From: Anastassia Kornilova Date: Mon, 29 Sep 2025 15:49:05 -0400 Subject: [PATCH 5/7] Additional cleanup --- schema/leaderboard.schema.json | 52 ++++++---------------------------- 1 file changed, 8 insertions(+), 44 deletions(-) diff --git a/schema/leaderboard.schema.json b/schema/leaderboard.schema.json index f3443b7..f3f9c16 100644 --- a/schema/leaderboard.schema.json +++ b/schema/leaderboard.schema.json @@ -41,46 +41,14 @@ "inference_settings": { "type": "object", "description": "Runtime settings and parameters for model inference - controls how the model generates outputs and performs during execution", - "required": [ - "quantization" - ], "properties": { - "quantization": { - "type": "object", - "required": [ - "bit_precision", - "method" - ], - "properties": { - "bit_precision": { - "type": "string", - "enum": [ - "none", - "int8", - "int4", - "float16", - "float32" - ], - "description": "Quantization bit precision" - }, - "method": { - "type": "string", - "enum": [ - "None", - "dynamic", - "static" - ], - "description": "Quantization method" - } - } + "quantization_method": { + "type": "string", + "description": "Quantization method used for the model (e.g GPTQ)" }, "generation_args": { "type": "object", "properties": { - "use_vllm": { - "type": "boolean", - "description": "Whether VLLM was used for inference" - }, "temperature": { "type": [ "null", @@ -106,16 +74,9 @@ "type": "integer", "minimum": 1, "description": "Maximum number of tokens to generate" - }, - "stop_sequences": { - "type": "array", - "description": "Sequences that stop generation", - "items": { - "type": "string" - }, - "default": [] } - } + }, + "additionalProperties": true } } } @@ -180,6 +141,9 @@ "score_details": { "type": "string", "description": "The score for the evaluation and related details", + "required": [ + "score" + ], "properties": { "score": { "type": "number", From ccd05dddf1d378d84ed9122b69808d4882a7050d Mon Sep 17 00:00:00 2001 From: Anastassia Kornilova Date: Mon, 29 Sep 2025 15:52:36 -0400 Subject: [PATCH 6/7] add Python schema --- schema/leaderboard.schema.json | 6 +- schema/leaderboard_eval_types.py | 110 +++++++++++++++++++++++++++++++ 2 files changed, 115 insertions(+), 1 deletion(-) create mode 100644 schema/leaderboard_eval_types.py diff --git 
a/schema/leaderboard.schema.json b/schema/leaderboard.schema.json index f3f9c16..6fd8a54 100644 --- a/schema/leaderboard.schema.json +++ b/schema/leaderboard.schema.json @@ -23,13 +23,17 @@ "description": "Complete model specification including basic information, technical configuration and inference settings", "required": [ "name", - "provider_name" + "source_url" ], "properties": { "name": { "type": "string", "description": "Model name and version (e.g., 'Llama-2-13b-chat-hf')" }, + "source_url": { + "type": "string", + "description": "URL for the source of the evaluation data" + }, "provider_name": { "type": "string", "description": "Name of the provider for the version of the model used during evaluation." diff --git a/schema/leaderboard_eval_types.py b/schema/leaderboard_eval_types.py new file mode 100644 index 0000000..6281d72 --- /dev/null +++ b/schema/leaderboard_eval_types.py @@ -0,0 +1,110 @@ +# generated by datamodel-codegen: +# filename: leaderboard.schema.json +# timestamp: 2025-09-29T19:52:18+00:00 + +from __future__ import annotations + +from enum import Enum +from typing import List, Optional + +from pydantic import BaseModel, ConfigDict, Field, conint + + +class GenerationArgs(BaseModel): + model_config = ConfigDict( + extra='allow', + ) + temperature: Optional[float] = Field(None, description='Sampling temperature') + top_p: Optional[float] = Field(None, description='Nucleus sampling parameter') + top_k: Optional[float] = Field(None, description='Top-k sampling parameter') + max_tokens: Optional[conint(ge=1)] = Field( + None, description='Maximum number of tokens to generate' + ) + + +class InferenceSettings(BaseModel): + quantization_method: Optional[str] = Field( + None, description='Quantization method used for the model (e.g GPTQ)' + ) + generation_args: Optional[GenerationArgs] = None + + +class ModelInfo(BaseModel): + name: str = Field( + ..., description="Model name and version (e.g., 'Llama-2-13b-chat-hf')" + ) + source_url: str = Field( + ..., description='URL for the source of the evaluation data' + ) + provider_name: Optional[str] = Field( + None, + description='Name of the provider for the version of the model used during evaluation.', + ) + developer: Optional[str] = Field( + None, description="Name of organization that provides the model (e.g. 
'OpenAI')" + ) + inference_settings: Optional[InferenceSettings] = Field( + None, + description='Runtime settings and parameters for model inference - controls how the model generates outputs and performs during execution', + ) + + +class ScoreType(Enum): + binary = 'binary' + continuous = 'continuous' + levels = 'levels' + + +class MetricConfig(BaseModel): + evaluation_description: Optional[str] = Field( + None, description='Description of the evaluation' + ) + lower_is_better: bool = Field(..., description='Whether a lower score is better') + score_type: Optional[ScoreType] = Field(None, description='Type of score') + score_level_names: Optional[List[str]] = Field( + None, description='Names of the score levels' + ) + min_score: Optional[float] = Field(None, description='Minimum possible score') + max_score: Optional[float] = Field(None, description='Maximum possible score') + + +class ScoreDetails(BaseModel): + score: float = Field(..., description='The score for the evaluation') + details: Optional[str] = Field( + None, description='Any additional details about the score' + ) + + +class SampleLevelDatum(BaseModel): + sample_id: str = Field(..., description='Unique identifier for the sample') + score: float = Field(..., description='Score for the sample') + + +class EvaluationResult(BaseModel): + evaluation_name: str = Field(..., description='Name of the evaluation') + metric_config: MetricConfig = Field(..., description='Details about the metric') + score_details: ScoreDetails = Field( + ..., description='The score for the evaluation and related details' + ) + sample_level_data: Optional[List[SampleLevelDatum]] = Field( + None, description='Sample level results for items used in evaluation' + ) + generation_config: Optional[str] = Field( + None, description='Details about how the scores were generated' + ) + + +class LeaderboardEvaluationResult(BaseModel): + schema_version: str = Field( + ..., description='Version of the schema used for this evaluation data' + ) + evaluation_id: str = Field( + ..., description='Unique identifier for this specific evaluation run' + ) + model_info: ModelInfo = Field( + ..., + description='Complete model specification including basic information, technical configuration and inference settings', + ) + evaluation_results: List[EvaluationResult] = Field( + ..., description='Array of evaluation results' + ) From c6f56ca5211e499c5b988b58c7a675b2e1c763ed Mon Sep 17 00:00:00 2001 From: Anastassia Kornilova Date: Wed, 1 Oct 2025 13:57:35 -0400 Subject: [PATCH 7/7] updated schema --- schema/leaderboard.schema.json | 263 +++++-------------------------- schema/leaderboard_eval_types.py | 48 ++---- 2 files changed, 52 insertions(+), 259 deletions(-) diff --git a/schema/leaderboard.schema.json b/schema/leaderboard.schema.json index 6fd8a54..e02bb10 100644 --- a/schema/leaderboard.schema.json +++ b/schema/leaderboard.schema.json @@ -28,7 +28,7 @@ "properties": { "name": { "type": "string", - "description": "Model name and version (e.g., 'Llama-2-13b-chat-hf')" + "description": "Model name in HuggingFace format (e.g. meta-llama/Llama-3.1-8B-Instruct)" }, "source_url": { "type": "string", @@ -36,53 +36,15 @@ }, "provider_name": { "type": "string", - "description": "Name of the provider for the version of the model used during evaluation." + "description": "Name of the provider of the evaluation results." }, "developer": { "type": "string", "description": "Name of organization that provides the model (e.g. 
'OpenAI')" }, - "inference_settings": { - "type": "object", - "description": "Runtime settings and parameters for model inference - controls how the model generates outputs and performs during execution", - "properties": { - "quantization_method": { - "type": "string", - "description": "Quantization method used for the model (e.g GPTQ)" - }, - "generation_args": { - "type": "object", - "properties": { - "temperature": { - "type": [ - "null", - "number" - ], - "description": "Sampling temperature" - }, - "top_p": { - "type": [ - "null", - "number" - ], - "description": "Nucleus sampling parameter" - }, - "top_k": { - "type": [ - "null", - "number" - ], - "description": "Top-k sampling parameter" - }, - "max_tokens": { - "type": "integer", - "minimum": 1, - "description": "Maximum number of tokens to generate" - } - }, - "additionalProperties": true - } - } + "inference_platform": { + "type": "string", + "description": "Description of platform used to run the evaluations (e.g. local machine, Bedrock)" } } }, @@ -159,191 +121,48 @@ } } }, - "sample_level_data": { - "type": "array", - "description": "Sample level results for items used in evaluation", - "items": { - "type": "object", - "required": [ - "sample_id", - "score" - ], - "properties": { - "sample_id": { - "type": "string", - "description": "Unique identifier for the sample" - }, - "score": { - "type": "number", - "description": "Score for the sample" - } - } - } + "detailed_evaluation_results_url": { + "type": "string", + "description": "Link to detailed evaluation data" }, "generation_config": { - "type": "string", - "description": "Details about how the scores were generated", - "prompt_config": { - "type": "object", - "description": "Configuration of the prompt template and formatting", - "required": [ - "prompt_class" - ], - "properties": { - "prompt_class": { - "type": "string", - "description": "Type of task and its formatting requirements", - "enum": [ - "MultipleChoice", - "OpenEnded", - "Completion" - ] - }, - "dimensions": { - "type": "object", - "description": "Format-specific configuration dimensions", - "required": [ - "choices_order", - "enumerator", - "instruction_phrasing", - "separator", - "shots" - ], - "properties": { - "choices_order": { - "type": "object", - "required": [ - "method", - "description" - ], - "properties": { - "method": { - "type": "string", - "description": "The method to use for ordering choices" - }, - "description": { - "type": "string", - "description": "Detailed explanation of the ordering method" - } - } - }, - "demonstrations": { - "type": "array", - "description": "Array of demonstration examples used in few-shot prompting", - "default": [] - }, - "enumerator": { - "type": "string", - "description": "Style of enumeration for multiple choice options", - "enum": [ - "capitals", - "lowercase", - "numbers", - "roman", - "keyboard", - "greek" - ] - }, - "instruction_phrasing": { - "type": "object", - "required": [ - "name", - "text" - ], - "properties": { - "name": { - "type": "string", - "description": "Name of the instruction template" - }, - "text": { - "type": "string", - "description": "Template text with placeholders for question and choices (or more)" - } - } - }, - "separator": { - "type": "string", - "description": "Character(s) used to separate multiple choice options", - "enum": [ - "\\s", - "\n", - ", ", - "; ", - " | ", - " OR ", - " or " - ] - }, - "shots": { - "type": "integer", - "description": "Number of examples provided in the prompt", - "minimum": 0, - "maximum": 10 - } - } - 
} - } - }, - "evaluation_method": { - "type": "object", - "description": "Evaluation metrics and ground truth", - "required": [ - "evaluation_method" - ], - "properties": { - "evaluation_method": { - "type": "object", - "description": "Method used to evaluate the answer, including predefined methods and user-defined methods.", - "properties": { - "method_name": { - "type": "string", - "description": "Name of the evaluation method. Can be a predefined method or a user-defined method." - }, - "description": { - "type": "string", - "description": "Detailed explanation of how the evaluation method works. For user-defined methods, this is required." - }, - "parameters": { - "type": "object", - "description": "Optional parameters used by the evaluation method. Allows custom configuration.", - "additionalProperties": true - } + "type": "object", + "generation_args": { + "type": "object", + "description": "Parameters used to generate results - properties may vary by model type", + "properties": { + "temperature": { + "type": [ + "null", + "number" + ], + "description": "Sampling temperature" }, - "required": [ - "method_name", - "description" - ], - "if": { - "properties": { - "method_name": { - "enum": [ - "label_only_match", - "content_similarity" - ] - } - } + "top_p": { + "type": [ + "null", + "number" + ], + "description": "Nucleus sampling parameter" }, - "then": { - "properties": { - "description": { - "type": "string", - "enum": [ - "Compares only the choice identifier/label to evaluate the response.", - "Finds the most similar answer among the given choices by comparing the textual content" - ] - } - } + "top_k": { + "type": [ + "null", + "number" + ], + "description": "Top-k sampling parameter" }, - "else": { - "properties": { - "description": { - "type": "string", - "description": "Explanation of the custom evaluation method." - } - } + "max_tokens": { + "type": "integer", + "minimum": 1, + "description": "Maximum number of tokens to generate" } - } - } + }, + "additionalProperties": true + }, + "additional_details": { + "type": "string", + "description": "Additional details about how the results for this metric were generated." 
} } } diff --git a/schema/leaderboard_eval_types.py b/schema/leaderboard_eval_types.py index 6281d72..6607c43 100644 --- a/schema/leaderboard_eval_types.py +++ b/schema/leaderboard_eval_types.py @@ -1,51 +1,32 @@ # generated by datamodel-codegen: # filename: leaderboard.schema.json -# timestamp: 2025-09-29T19:52:18+00:00 +# timestamp: 2025-10-01T17:57:26+00:00 from __future__ import annotations from enum import Enum -from typing import List, Optional +from typing import Any, Dict, List, Optional -from pydantic import BaseModel, ConfigDict, Field, conint - - -class GenerationArgs(BaseModel): - model_config = ConfigDict( - extra='allow', - ) - temperature: Optional[float] = Field(None, description='Sampling temperature') - top_p: Optional[float] = Field(None, description='Nucleus sampling parameter') - top_k: Optional[float] = Field(None, description='Top-k sampling parameter') - max_tokens: Optional[conint(ge=1)] = Field( - None, description='Maximum number of tokens to generate' - ) - - -class InferenceSettings(BaseModel): - quantization_method: Optional[str] = Field( - None, description='Quantization method used for the model (e.g GPTQ)' - ) - generation_args: Optional[GenerationArgs] = None +from pydantic import BaseModel, Field class ModelInfo(BaseModel): name: str = Field( - ..., description="Model name and version (e.g., 'Llama-2-13b-chat-hf')" + ..., + description='Model name in HuggingFace format (e.g. meta-llama/Llama-3.1-8B-Instruct)', ) source_url: str = Field( ..., description='URL for the source of the evaluation data' ) provider_name: Optional[str] = Field( - None, - description='Name of the provider for the version of the model used during evaluation.', + None, description='Name of the provider of the evaluation results.' ) developer: Optional[str] = Field( None, description="Name of organization that provides the model (e.g. 'OpenAI')" ) - inference_settings: Optional[InferenceSettings] = Field( + inference_platform: Optional[str] = Field( None, - description='Runtime settings and parameters for model inference - controls how the model generates outputs and performs during execution', + description='Description of platform used to run the evaluations (e.g. local machine, Bedrock)', ) @@ -75,23 +56,16 @@ class ScoreDetails(BaseModel): ) -class SampleLevelDatum(BaseModel): - sample_id: str = Field(..., description='Unique identifier for the sample') - score: float = Field(..., description='Score for the sample') - - class EvaluationResult(BaseModel): evaluation_name: str = Field(..., description='Name of the evaluation') metric_config: MetricConfig = Field(..., description='Details about the metric') score_details: ScoreDetails = Field( ..., description='The score for the evaluation and related details' ) - sample_level_data: Optional[List[SampleLevelDatum]] = Field( - None, description='Sample level results for items used in evaluation' - ) - generation_config: Optional[str] = Field( - None, description='Details about how the scores were generated' + detailed_evaluation_results_url: Optional[str] = Field( + None, description='Link to detailed evaluation data' ) + generation_config: Optional[Dict[str, Any]] = None class LeaderboardEvaluationResult(BaseModel):
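
For reference, below is a minimal usage sketch of the record shape this series ends up with (PATCH 7/7), written against the generated Pydantic models in schema/leaderboard_eval_types.py. It assumes Pydantic v2 and that it is run from the repository root; every concrete value (evaluation id, model name, URLs, scores) is hypothetical and only illustrates which fields are required and that generation_config is now a free-form object.

# Minimal sketch: build one leaderboard record with the generated models and
# print the JSON shape described by leaderboard.schema.json. All concrete
# values are hypothetical; only the field names come from the schema.
from schema.leaderboard_eval_types import (
    EvaluationResult,
    LeaderboardEvaluationResult,
    MetricConfig,
    ModelInfo,
    ScoreDetails,
    ScoreType,
)

record = LeaderboardEvaluationResult(
    schema_version="0.0.1",
    evaluation_id="run-2025-10-01-001",                  # hypothetical run id
    model_info=ModelInfo(
        name="meta-llama/Llama-3.1-8B-Instruct",         # HuggingFace-format name
        source_url="https://example.com/evals/run-001",  # hypothetical source
        provider_name="example-provider",
        developer="Meta",
        inference_platform="local machine",
    ),
    evaluation_results=[
        EvaluationResult(
            evaluation_name="MMLU",
            metric_config=MetricConfig(
                evaluation_description="Accuracy on multiple-choice questions",
                lower_is_better=False,
                score_type=ScoreType.continuous,
                min_score=0.0,
                max_score=1.0,
            ),
            score_details=ScoreDetails(score=0.68, details="5-shot"),
            detailed_evaluation_results_url="https://example.com/evals/run-001/mmlu",
            # generation_config is an open object; the keys below mirror the
            # generation_args / additional_details structure sketched in the schema.
            generation_config={
                "generation_args": {"temperature": 0.0, "max_tokens": 1024},
                "additional_details": "Greedy decoding over the full test split.",
            },
        )
    ],
)

print(record.model_dump_json(indent=2, exclude_none=True))  # Pydantic v2 API

Here exclude_none keeps optional fields that were left unset out of the serialized record, which matches their optional treatment in the JSON schema.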