1 change: 1 addition & 0 deletions pyproject.toml
@@ -7,6 +7,7 @@ requires-python = ">=3.12"
dependencies = [
"crfm-helm>=0.5.6",
"dacite>=1.9.2",
"datamodel-code-generator>=0.31.2",
"numpy>=2.3.1",
"openai>=1.93.0",
"pandas>=2.3.0",
6 changes: 6 additions & 0 deletions schema/README.md
@@ -7,7 +7,13 @@ To generate or update Python types from the JSON schema, you can run the followi
```bash
datamodel-codegen --input eval.schema.json --output eval_types.py --class-name EvaluationResult --output-model-type pydantic_v2.BaseModel
```
or, from the repository root:
```bash
uv run datamodel-codegen --input schema/eval.schema.json --input-file-type jsonschema --output schema/eval_types.py --class-name EvaluationResult --output-model-type pydantic_v2.BaseModel
```

## Example Data

Please refer to [this file](./eval.example.json) for a minimal data example adhering to the evaluation format schema.
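Once the types are regenerated, the example can be checked against them. A minimal sketch, run from the repository root and assuming `schema` is importable as a package (e.g. via `uv run python`):

```python
import json

from schema.eval_types import EvaluationResult

# Load the minimal example and validate it against the generated pydantic v2 model;
# a ValidationError is raised if the file drifts from the schema.
with open("schema/eval.example.json") as f:
    data = json.load(f)

result = EvaluationResult.model_validate(data)
print(result.model_dump_json(indent=2))
```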
164 changes: 105 additions & 59 deletions schema/eval.schema.json
@@ -62,59 +62,6 @@
}
}
},
"configuration": {
"type": "object",
"description": "Technical specifications and implementation details of the model - defines how the model is structured and where it's hosted",
"required": [
"context_window"
],
"properties": {
"architecture": {
"type": [
"string",
"null"
],
"enum": [
"transformer",
"moe",
"ssm",
null
],
"description": "Model architecture type"
},
"parameters": {
"type": [
"integer",
"null"
],
"minimum": 1,
"description": "Number of parameters in billions"
},
"context_window": {
"type": "integer",
"minimum": 1,
"description": "Maximum context window size in tokens"
},
"is_instruct": {
"type": "boolean",
"description": "Whether the model is instruction-tuned"
},
"hf_path": {
"type": [
"string",
"null"
],
"description": "HuggingFace model path"
},
"revision": {
"type": [
"string",
"null"
],
"description": "Model revision/commit hash"
}
}
},
"inference_settings": {
"type": "object",
"description": "Runtime settings and parameters for model inference - controls how the model generates outputs and performs during execution",
@@ -126,7 +73,8 @@
"type": "object",
"required": [
"bit_precision",
"method"
"method",
"type"
],
"properties": {
"bit_precision": {
@@ -140,24 +88,36 @@
],
"description": "Quantization bit precision"
},
"method": {
"type": {
"type": "string",
"enum": [
"None",
"dynamic",
"static"
],
"description": "Quantization type"
},
"method": {
"type": "string",
"enum": [
"None",
"AWQ",
"PTQ",
"GPTQ",
"GGUF",
"QAT",
"BitsAndBytes",
"SmoothQuant",
"HQQ",
"Quanto"
],
"description": "Quantization method"
}
}
},
"generation_args": {
"type": "object",
"properties": {
"use_vllm": {
"type": "boolean",
"description": "Whether VLLM was used for inference"
},
"temperature": {
"type": [
"null",
@@ -191,6 +151,52 @@
"type": "string"
},
"default": []
},
"seed": {
"type": [
"null",
"number"
],
"description": "Random seed parameter"
},
"frequency_penalty": {
"type": [
"null",
"number"
],
"description": "Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model’s likelihood to repeat the same line verbatim"
},
"presence_penalty": {
"type": [
"null",
"number"
],
"description": "Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model’s likelihood to talk about new topics."
},
"logprobs": {
"type": [
"null",
"number"
],
"description": "Return log probabilities of the output tokens or not"
},
"top_logprobs": {
"type": "integer",
"description": "Number of most likely tokens (0-20) to return at each token position"
},
"logit_bias": {
"type": [
"null",
"object"
],
"description": "Map token Ids (float keys) to an associated bias value (integer)",
"additionalProperties": {
"type": "integer"
},
"propertyNames": {
"pattern": "^-?\\d+(\\.\\d+)?$",
"description": "Keys must be valid floats"
}
}
}
}
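For orientation, a sketch of an `inference_settings` instance exercising the fields added in the hunks above; the values are illustrative, not taken from the PR, and the enclosing document must still satisfy the rest of the schema:

```python
# Illustrative instance of the updated sub-schema (values are made up).
inference_settings = {
    "quantization": {
        "bit_precision": "int8",  # existing enum value
        "type": "dynamic",        # newly required: "None" | "dynamic" | "static"
        "method": "GPTQ",         # expanded enum: AWQ, PTQ, GPTQ, GGUF, QAT, ...
    },
    "generation_args": {
        "temperature": 0.7,
        "seed": 42,                     # new: nullable number
        "frequency_penalty": 0.5,       # new: -2.0 to 2.0
        "presence_penalty": 0.0,        # new: -2.0 to 2.0
        "logprobs": 1,                  # new: nullable number
        "top_logprobs": 5,              # new: 0-20
        "logit_bias": {"50256": -100},  # new: numeric-string keys -> integer bias
    },
}
```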
@@ -571,6 +577,46 @@
}
]
}
},
"full_logprobs": {
"additionalProperties": false,
"items": {
"oneOf": [
{
"type": [
"null"
]
},
{
"type": "array",
"items": {
"type": "array",
"items": {
"type": "object",
"required": [
"token_id",
"logprob",
"decoded_token"
],
"properties": {
"token_id": {
"type": "number",
"description": "Id of token for which we keep its logprob"
},
"logprob": {
"type": "number",
"description": "Log probability of the token"
},
"decoded_token": {
"type": "string",
"description": "The decoded string representation of the token"
}
}
}
}
}
]
}
}
}
},
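Similarly, a sketch of data matching the new `full_logprobs` shape: each element of the outer list is either null or a per-position list of candidate-token records (values below are made up):

```python
# Hypothetical full_logprobs payload conforming to the hunk above.
full_logprobs = [
    None,  # logprobs were not recorded for this element
    [
        [  # position 0: candidate tokens with their log probabilities
            {"token_id": 464, "logprob": -0.12, "decoded_token": "The"},
            {"token_id": 32, "logprob": -2.31, "decoded_token": "A"},
        ],
        [  # position 1
            {"token_id": 3290, "logprob": -0.45, "decoded_token": " dog"},
        ],
    ],
]
```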
80 changes: 47 additions & 33 deletions schema/eval_types.py
@@ -1,6 +1,6 @@
# generated by datamodel-codegen:
# filename: eval.schema.json
# timestamp: 2025-07-06T13:59:25+00:00
# timestamp: 2025-09-04T14:56:48+00:00

from __future__ import annotations

@@ -30,30 +30,6 @@ class ModelInfo(BaseModel):
family: Optional[Family] = Field(None, description='Model family')


class Architecture(Enum):
transformer = 'transformer'
moe = 'moe'
ssm = 'ssm'
NoneType_None = None


class Configuration(BaseModel):
architecture: Optional[Architecture] = Field(
None, description='Model architecture type'
)
parameters: Optional[conint(ge=1)] = Field(
None, description='Number of parameters in billions'
)
context_window: conint(ge=1) = Field(
..., description='Maximum context window size in tokens'
)
is_instruct: Optional[bool] = Field(
None, description='Whether the model is instruction-tuned'
)
hf_path: Optional[str] = Field(None, description='HuggingFace model path')
revision: Optional[str] = Field(None, description='Model revision/commit hash')


class BitPrecision(Enum):
none = 'none'
int8 = 'int8'
@@ -62,21 +38,32 @@ class BitPrecision(Enum):
float32 = 'float32'


class Method(Enum):
class Type(Enum):
None_ = 'None'
dynamic = 'dynamic'
static = 'static'


class Method(Enum):
None_ = 'None'
AWQ = 'AWQ'
PTQ = 'PTQ'
GPTQ = 'GPTQ'
GGUF = 'GGUF'
QAT = 'QAT'
BitsAndBytes = 'BitsAndBytes'
SmoothQuant = 'SmoothQuant'
HQQ = 'HQQ'
Quanto = 'Quanto'


class Quantization(BaseModel):
bit_precision: BitPrecision = Field(..., description='Quantization bit precision')
type: Type = Field(..., description='Quantization type')
method: Method = Field(..., description='Quantization method')


class GenerationArgs(BaseModel):
use_vllm: Optional[bool] = Field(
None, description='Whether VLLM was used for inference'
)
temperature: Optional[float] = Field(None, description='Sampling temperature')
top_p: Optional[float] = Field(None, description='Nucleus sampling parameter')
top_k: Optional[float] = Field(None, description='Top-k sampling parameter')
@@ -86,6 +73,26 @@ class GenerationArgs(BaseModel):
stop_sequences: Optional[List[str]] = Field(
[], description='Sequences that stop generation'
)
seed: Optional[float] = Field(None, description='Random seed parameter')
frequency_penalty: Optional[float] = Field(
None,
description='Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model’s likelihood to repeat the same line verbatim.',
)
presence_penalty: Optional[float] = Field(
None,
description='Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model’s likelihood to talk about new topics.',
)
logprobs: Optional[float] = Field(
None, description='Whether to return log probabilities of the output tokens'
)
top_logprobs: Optional[int] = Field(
None,
description='Number of most likely tokens (0-20) to return at each token position',
)
logit_bias: Optional[Dict[str, Any]] = Field(
None,
description='Maps token IDs (numeric-string keys) to an associated bias value (integer)',
)


class InferenceSettings(BaseModel):
Expand All @@ -98,10 +105,6 @@ class Model(BaseModel):
...,
description='Basic identifying information about the model - represents the core identity and naming of the model without technical details',
)
configuration: Configuration = Field(
...,
description="Technical specifications and implementation details of the model - defines how the model is structured and where it's hosted",
)
inference_settings: InferenceSettings = Field(
...,
description='Runtime settings and parameters for model inference - controls how the model generates outputs and performs during execution',
@@ -258,6 +261,16 @@ class GeneratedTokensLogprob(BaseModel):
)


class FullLogprob(BaseModel):
token_id: float = Field(
..., description='ID of the token whose logprob is stored'
)
logprob: float = Field(..., description='Log probability of the token')
decoded_token: str = Field(
..., description='The decoded string representation of the token'
)


class Output(BaseModel):
response: str = Field(..., description="The model's complete text response")
cumulative_logprob: Optional[float] = Field(
Expand All @@ -267,6 +280,7 @@ class Output(BaseModel):
generated_tokens_logprobs: Optional[
List[Union[Optional[str], List[GeneratedTokensLogprob]]]
] = None
full_logprobs: Optional[List[Optional[List[List[FullLogprob]]]]] = None


class EvaluationMethod(BaseModel):
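To sanity-check the regenerated models, a minimal sketch assuming the module is importable as `schema.eval_types`:

```python
from schema.eval_types import (
    BitPrecision,
    FullLogprob,
    GenerationArgs,
    Method,
    Quantization,
    Type,
)

# "type" is now required alongside bit_precision and method.
quant = Quantization(
    bit_precision=BitPrecision.int8,
    type=Type.dynamic,
    method=Method.GPTQ,
)

# All generation arguments remain optional; only a few are set here.
gen = GenerationArgs(
    temperature=0.7,
    seed=42,
    frequency_penalty=0.5,
    logit_bias={"50256": -100},
)

token = FullLogprob(token_id=464, logprob=-0.12, decoded_token="The")
print(quant.model_dump(), gen.seed, token.decoded_token)
```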