From a2f96de4594e1e6cbff6d6c6f100c06b41c8f1df Mon Sep 17 00:00:00 2001
From: Damian Stachura
Date: Thu, 4 Sep 2025 15:31:40 +0200
Subject: [PATCH 1/4] Update Quantization related to
 https://github.com/evaleval/evalHub/issues/15

---
 pyproject.toml          |  1 +
 schema/README.md        |  4 ++++
 schema/eval.schema.json | 21 +++++++++++++++++++--
 schema/eval_types.py    | 18 ++++++++++++++++--
 uv.lock                 | 12 ++++++++++++
 5 files changed, 52 insertions(+), 4 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index dc5edf3..263d672 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,6 +7,7 @@ requires-python = ">=3.12"
 dependencies = [
     "crfm-helm>=0.5.6",
     "dacite>=1.9.2",
+    "datamodel-code-generator>=0.31.2",
     "numpy>=2.3.1",
     "openai>=1.93.0",
     "pandas>=2.3.0",
diff --git a/schema/README.md b/schema/README.md
index 2bc9a33..9e3dd3c 100644
--- a/schema/README.md
+++ b/schema/README.md
@@ -7,7 +7,11 @@ To generate or update Python types from the JSON schema, you can run the followi
 ```bash
 datamodel-codegen --input eval.schema.json --output eval_types.py --class-name EvaluationResult --output-model-type pydantic_v2.BaseModel
 ```
+or, from the main directory:
+```bash
+uv run datamodel-codegen --input schema/eval.schema.json --input-file-type jsonschema --output schema/eval_types.py --class-name EvaluationResult --output-model-type pydantic_v2.BaseModel
+```
 
 ## Example Data
 
 Please refer to [this file](./eval.example.json) for a minimal data example adhering to the evaluation format schema.
diff --git a/schema/eval.schema.json b/schema/eval.schema.json
index 3b21d0a..f51fc87 100644
--- a/schema/eval.schema.json
+++ b/schema/eval.schema.json
@@ -126,7 +126,8 @@
         "type": "object",
         "required": [
           "bit_precision",
-          "method"
+          "method",
+          "type"
         ],
         "properties": {
           "bit_precision": {
@@ -140,13 +141,29 @@
             ],
             "description": "Quantization bit precision"
           },
-          "method": {
+          "type": {
             "type": "string",
             "enum": [
               "None",
               "dynamic",
               "static"
             ],
+            "description": "Quantization type"
+          },
+          "method": {
+            "type": "string",
+            "enum": [
+              "None",
+              "AWQ",
+              "PTQ",
+              "GPTQ",
+              "GGUF",
+              "QAT",
+              "BitsAndBytes",
+              "SmoothQuant",
+              "HQQ",
+              "Quanto"
+            ],
             "description": "Quantization method"
           }
         }
diff --git a/schema/eval_types.py b/schema/eval_types.py
index 98e4eed..686aa25 100644
--- a/schema/eval_types.py
+++ b/schema/eval_types.py
@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 #   filename:  eval.schema.json
-#   timestamp: 2025-07-06T13:59:25+00:00
+#   timestamp: 2025-09-04T13:29:15+00:00
 
 from __future__ import annotations
 
@@ -62,14 +62,28 @@ class BitPrecision(Enum):
     float32 = 'float32'
 
 
-class Method(Enum):
+class Type(Enum):
     None_ = 'None'
     dynamic = 'dynamic'
     static = 'static'
 
 
+class Method(Enum):
+    None_ = 'None'
+    AWQ = 'AWQ'
+    PTQ = 'PTQ'
+    GPTQ = 'GPTQ'
+    GGUF = 'GGUF'
+    QAT = 'QAT'
+    BitsAndBytes = 'BitsAndBytes'
+    SmoothQuant = 'SmoothQuant'
+    HQQ = 'HQQ'
+    Quanto = 'Quanto'
+
+
 class Quantization(BaseModel):
     bit_precision: BitPrecision = Field(..., description='Quantization bit precision')
+    type: Type = Field(..., description='Quantization type')
     method: Method = Field(..., description='Quantization method')
diff --git a/uv.lock b/uv.lock
index 130c14e..7fb48cf 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1,4 +1,5 @@
 version = 1
+revision = 1
 requires-python = ">=3.12"
 
 [[package]]
@@ -503,6 +504,7 @@ source = { virtual = "." }
 dependencies = [
     { name = "crfm-helm" },
     { name = "dacite" },
+    { name = "datamodel-code-generator" },
     { name = "numpy" },
     { name = "openai" },
     { name = "pandas" },
@@ -518,12 +520,14 @@ dev = [
     { name = "pylint" },
     { name = "pytest" },
     { name = "ruff" },
+    { name = "wget" },
 ]
 
 [package.metadata]
 requires-dist = [
     { name = "crfm-helm", specifier = ">=0.5.6" },
     { name = "dacite", specifier = ">=1.9.2" },
+    { name = "datamodel-code-generator", specifier = ">=0.31.2" },
     { name = "datamodel-code-generator", marker = "extra == 'dev'", specifier = ">=0.31.2" },
     { name = "numpy", specifier = ">=2.3.1" },
     { name = "openai", specifier = ">=1.93.0" },
@@ -535,7 +539,9 @@ requires-dist = [
     { name = "requests", specifier = ">=2.32.4" },
     { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.12.2" },
     { name = "transformers", specifier = ">=4.53.1" },
+    { name = "wget", marker = "extra == 'dev'", specifier = ">=3.2" },
 ]
+provides-extras = ["dev"]
 
 [[package]]
 name = "filelock"
@@ -2449,6 +2455,12 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/2a/87/abd57374044e1f627f0a905ac33c1a7daab35a3a815abfea4e1bafd3fdb1/weasel-0.4.1-py3-none-any.whl", hash = "sha256:24140a090ea1ac512a2b2f479cc64192fd1d527a7f3627671268d08ed5ac418c", size = 50270 },
 ]
 
+[[package]]
+name = "wget"
+version = "3.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip", hash = "sha256:35e630eca2aa50ce998b9b1a127bb26b30dfee573702782aa982f875e3f16061", size = 10857 }
+
 [[package]]
 name = "wrapt"
 version = "1.17.2"

From 77e0b5906c8353080d03359e86a94407d5e2f41b Mon Sep 17 00:00:00 2001
From: Damian Stachura
Date: Thu, 4 Sep 2025 15:37:00 +0200
Subject: [PATCH 2/4] Remove Configuration class wrt
 https://github.com/evaleval/evalHub/pull/10 as probably not necessary

---
 schema/eval.schema.json | 53 -----------------------------------------
 1 file changed, 53 deletions(-)

diff --git a/schema/eval.schema.json b/schema/eval.schema.json
index f51fc87..5703bea 100644
--- a/schema/eval.schema.json
+++ b/schema/eval.schema.json
@@ -62,59 +62,6 @@
         }
       }
     },
-    "configuration": {
-      "type": "object",
-      "description": "Technical specifications and implementation details of the model - defines how the model is structured and where it's hosted",
-      "required": [
-        "context_window"
-      ],
-      "properties": {
-        "architecture": {
-          "type": [
-            "string",
-            "null"
-          ],
-          "enum": [
-            "transformer",
-            "moe",
-            "ssm",
-            null
-          ],
-          "description": "Model architecture type"
-        },
-        "parameters": {
-          "type": [
-            "integer",
-            "null"
-          ],
-          "minimum": 1,
-          "description": "Number of parameters in billions"
-        },
-        "context_window": {
-          "type": "integer",
-          "minimum": 1,
-          "description": "Maximum context window size in tokens"
-        },
-        "is_instruct": {
-          "type": "boolean",
-          "description": "Whether the model is instruction-tuned"
-        },
-        "hf_path": {
-          "type": [
-            "string",
-            "null"
-          ],
-          "description": "HuggingFace model path"
-        },
-        "revision": {
-          "type": [
-            "string",
-            "null"
-          ],
-          "description": "Model revision/commit hash"
-        }
-      }
-    },
     "inference_settings": {
       "type": "object",
       "description": "Runtime settings and parameters for model inference - controls how the model generates outputs and performs during execution",

From 8292dd94eb45c1ec64e1c880ddf54a164cf25ad6 Mon Sep 17 00:00:00 2001
From: Damian Stachura
Date: Thu, 4 Sep 2025 16:33:45 +0200
Subject: [PATCH 3/4] Extension for generation_args wrt
 https://github.com/evaleval/evalHub/issues/13

---
 schema/eval.schema.json | 50 ++++++++++++++++++++++++++++++++++++++----
 schema/eval_types.py    | 53 ++++++++++++++++-------------------------
 2 files changed, 67 insertions(+), 36 deletions(-)

diff --git a/schema/eval.schema.json b/schema/eval.schema.json
index 5703bea..13fe0c3 100644
--- a/schema/eval.schema.json
+++ b/schema/eval.schema.json
@@ -118,10 +118,6 @@
       "generation_args": {
         "type": "object",
         "properties": {
-          "use_vllm": {
-            "type": "boolean",
-            "description": "Whether VLLM was used for inference"
-          },
           "temperature": {
             "type": [
               "null",
@@ -155,6 +151,52 @@
               "type": "string"
             },
             "default": []
+          },
+          "seed": {
+            "type": [
+              "null",
+              "number"
+            ],
+            "description": "Random seed parameter"
+          },
+          "frequency_penalty": {
+            "type": [
+              "null",
+              "number"
+            ],
+            "description": "Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model’s likelihood to repeat the same line verbatim."
+          },
+          "presence_penalty": {
+            "type": [
+              "null",
+              "number"
+            ],
+            "description": "Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model’s likelihood to talk about new topics."
+          },
+          "logprobs": {
+            "type": [
+              "null",
+              "number"
+            ],
+            "description": "Whether to return log probabilities of the output tokens"
+          },
+          "top_logprobs": {
+            "type": "integer",
+            "description": "Number of most likely tokens (0-20) to return at each token position"
+          },
+          "logit_bias": {
+            "type": [
+              "null",
+              "object"
+            ],
+            "description": "Maps token IDs (keys formatted as floats) to an associated integer bias value",
+            "additionalProperties": {
+              "type": "integer"
+            },
+            "propertyNames": {
+              "pattern": "^-?\\d+(\\.\\d+)?$",
+              "description": "Keys must be valid floats"
+            }
           }
         }
       }
diff --git a/schema/eval_types.py b/schema/eval_types.py
index 686aa25..db38368 100644
--- a/schema/eval_types.py
+++ b/schema/eval_types.py
@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 #   filename:  eval.schema.json
-#   timestamp: 2025-09-04T13:29:15+00:00
+#   timestamp: 2025-09-04T14:32:50+00:00
 
 from __future__ import annotations
 
@@ -30,30 +30,6 @@ class ModelInfo(BaseModel):
     family: Optional[Family] = Field(None, description='Model family')
 
 
-class Architecture(Enum):
-    transformer = 'transformer'
-    moe = 'moe'
-    ssm = 'ssm'
-    NoneType_None = None
-
-
-class Configuration(BaseModel):
-    architecture: Optional[Architecture] = Field(
-        None, description='Model architecture type'
-    )
-    parameters: Optional[conint(ge=1)] = Field(
-        None, description='Number of parameters in billions'
-    )
-    context_window: conint(ge=1) = Field(
-        ..., description='Maximum context window size in tokens'
-    )
-    is_instruct: Optional[bool] = Field(
-        None, description='Whether the model is instruction-tuned'
-    )
-    hf_path: Optional[str] = Field(None, description='HuggingFace model path')
-    revision: Optional[str] = Field(None, description='Model revision/commit hash')
-
-
 class BitPrecision(Enum):
     none = 'none'
     int8 = 'int8'
@@ -88,9 +64,6 @@ class Quantization(BaseModel):
 
 
 class GenerationArgs(BaseModel):
-    use_vllm: Optional[bool] = Field(
-        None, description='Whether VLLM was used for inference'
-    )
     temperature: Optional[float] = Field(None, description='Sampling temperature')
     top_p: Optional[float] = Field(None, description='Nucleus sampling parameter')
     top_k: Optional[float] = Field(None, description='Top-k sampling parameter')
@@ -100,6 +73,26 @@
     stop_sequences: Optional[List[str]] = Field(
         [], description='Sequences that stop generation'
     )
+    seed: Optional[float] = Field(None, description='Random seed parameter')
+    frequency_penalty: Optional[float] = Field(
+        None,
+        description='Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model’s likelihood to repeat the same line verbatim.',
+    )
+    presence_penalty: Optional[float] = Field(
+        None,
+        description='Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model’s likelihood to talk about new topics.',
+    )
+    logprobs: Optional[float] = Field(
+        None, description='Whether to return log probabilities of the output tokens'
+    )
+    top_logprobs: Optional[int] = Field(
+        None,
+        description='Number of most likely tokens (0-20) to return at each token position',
+    )
+    logit_bias: Optional[Dict[str, Any]] = Field(
+        None,
+        description='Maps token IDs (keys formatted as floats) to an associated integer bias value',
+    )
 
 
 class InferenceSettings(BaseModel):
@@ -112,10 +105,6 @@ class Model(BaseModel):
     model_info: ModelInfo = Field(
         ...,
         description='Basic identifying information about the model - represents the core identity and naming of the model without technical details',
     )
-    configuration: Configuration = Field(
-        ...,
-        description="Technical specifications and implementation details of the model - defines how the model is structured and where it's hosted",
-    )
     inference_settings: InferenceSettings = Field(
         ...,
         description='Runtime settings and parameters for model inference - controls how the model generates outputs and performs during execution',

From f3705b9ca46ebf6952ac82e55fec72534216a2b0 Mon Sep 17 00:00:00 2001
From: Damian Stachura
Date: Thu, 4 Sep 2025 16:59:05 +0200
Subject: [PATCH 4/4] Add support for full logprobs for all steps wrt
 https://github.com/evaleval/evalHub/issues/21

---
 schema/eval.schema.json | 40 ++++++++++++++++++++++++++++++++++++++++
 schema/eval_types.py    | 13 ++++++++++++-
 2 files changed, 52 insertions(+), 1 deletion(-)

diff --git a/schema/eval.schema.json b/schema/eval.schema.json
index 13fe0c3..72953b8 100644
--- a/schema/eval.schema.json
+++ b/schema/eval.schema.json
@@ -577,6 +577,46 @@
             }
           ]
         }
+      },
+      "full_logprobs": {
+        "additionalProperties": false,
+        "items": {
+          "oneOf": [
+            {
+              "type": [
+                "null"
+              ]
+            },
+            {
+              "type": "array",
+              "items": {
+                "type": "array",
+                "items": {
+                  "type": "object",
+                  "required": [
+                    "token_id",
+                    "logprob",
+                    "decoded_token"
+                  ],
+                  "properties": {
+                    "token_id": {
+                      "type": "number",
+                      "description": "ID of the token whose logprob is kept"
+                    },
+                    "logprob": {
+                      "type": "number",
+                      "description": "Log probability of the token"
+                    },
+                    "decoded_token": {
+                      "type": "string",
+                      "description": "The decoded string representation of the token"
+                    }
+                  }
+                }
+              }
+            }
+          ]
+        }
       }
     }
   },
diff --git a/schema/eval_types.py b/schema/eval_types.py
index db38368..ecc0e21 100644
--- a/schema/eval_types.py
+++ b/schema/eval_types.py
@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 #   filename:  eval.schema.json
-#   timestamp: 2025-09-04T14:32:50+00:00
+#   timestamp: 2025-09-04T14:56:48+00:00
 
 from __future__ import annotations
 
@@ -261,6 +261,16 @@ class GeneratedTokensLogprob(BaseModel):
     )
 
 
+class FullLogprob(BaseModel):
+    token_id: float = Field(
+        ..., description='ID of the token whose logprob is kept'
+    )
+    logprob: float = Field(..., description='Log probability of the token')
+    decoded_token: str = Field(
+        ..., description='The decoded string representation of the token'
+    )
+
+
 class Output(BaseModel):
     response: str = Field(..., description="The model's complete text response")
     cumulative_logprob: Optional[float] = Field(
@@ -270,6 +280,7 @@
     generated_tokens_logprobs: Optional[
         List[Union[Optional[str], List[GeneratedTokensLogprob]]]
     ] = None
+    full_logprobs: Optional[List[Optional[List[List[FullLogprob]]]]] = None
 
 
 class EvaluationMethod(BaseModel):
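
As a quick sanity check of the models regenerated by this series, here is a minimal sketch. It assumes the updated schema/eval_types.py is importable as `eval_types`; all field values are illustrative only and are not taken from the schema's example data:

```python
# Smoke test for the fields added in this series (PATCH 1, 3 and 4).
# Assumes the regenerated schema/eval_types.py is on the import path.
from eval_types import FullLogprob, GenerationArgs, Quantization

# PATCH 3: new sampling and penalty parameters on GenerationArgs.
args = GenerationArgs(
    temperature=0.7,
    seed=42,
    frequency_penalty=0.5,
    presence_penalty=-0.5,
    top_logprobs=5,
    logit_bias={"50256": -100},  # key matches the float-formatted pattern
)

# PATCH 1: quantization now carries both a `type` and a `method`.
quant = Quantization(bit_precision="int8", type="static", method="GPTQ")

# PATCH 4: one per-token entry of a full_logprobs matrix.
entry = FullLogprob(token_id=50256, logprob=-0.12, decoded_token=" the")

print(args.model_dump_json(exclude_none=True))
print(quant.model_dump_json())
print(entry.model_dump_json())
```

All of the new GenerationArgs fields default to None in the generated model, so payloads that omit them remain valid.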