1 change: 1 addition & 0 deletions pyproject.toml
@@ -7,6 +7,7 @@ requires-python = ">=3.12"
dependencies = [
"crfm-helm>=0.5.6",
"dacite>=1.9.2",
"datamodel-code-generator>=0.31.2",
"numpy>=2.3.1",
"openai>=1.93.0",
"pandas>=2.3.0",
6 changes: 6 additions & 0 deletions schema/README.md
@@ -7,7 +7,13 @@ To generate or update Python types from the JSON schema, you can run the followi
```bash
datamodel-codegen --input eval.schema.json --output eval_types.py --class-name EvaluationResult --output-model-type pydantic_v2.BaseModel
```
or, from the repository root:
```bash
uv run datamodel-codegen --input schema/eval.schema.json --input-file-type jsonschema --output schema/eval_types.py --class-name EvaluationResult --output-model-type pydantic_v2.BaseModel
```

## Example Data

Please refer to [this file](./eval.example.json) for a minimal data example adhering to the evaluation format schema.
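Once the types are regenerated, the example can be checked against them. A minimal sketch, run from the repository root and assuming `schema` is importable as a package (e.g. via `uv run python`):

```python
import json

from schema.eval_types import EvaluationResult

# Load the minimal example and validate it against the generated pydantic v2 model;
# a ValidationError is raised if the file drifts from the schema.
with open("schema/eval.example.json") as f:
    data = json.load(f)

result = EvaluationResult.model_validate(data)
print(result.model_dump_json(indent=2))
```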
164 changes: 105 additions & 59 deletions schema/eval.schema.json
@@ -62,59 +62,6 @@
}
}
},
"configuration": {
"type": "object",
"description": "Technical specifications and implementation details of the model - defines how the model is structured and where it's hosted",
"required": [
"context_window"
],
"properties": {
"architecture": {
"type": [
"string",
"null"
],
"enum": [
"transformer",
"moe",
"ssm",
null
],
"description": "Model architecture type"
},
"parameters": {
"type": [
"integer",
"null"
],
"minimum": 1,
"description": "Number of parameters in billions"
},
"context_window": {
"type": "integer",
"minimum": 1,
"description": "Maximum context window size in tokens"
},
"is_instruct": {
"type": "boolean",
"description": "Whether the model is instruction-tuned"
},
"hf_path": {
"type": [
"string",
"null"
],
"description": "HuggingFace model path"
},
"revision": {
"type": [
"string",
"null"
],
"description": "Model revision/commit hash"
}
}
},
"inference_settings": {
"type": "object",
"description": "Runtime settings and parameters for model inference - controls how the model generates outputs and performs during execution",
@@ -126,7 +73,8 @@
"type": "object",
"required": [
"bit_precision",
"method"
"method",
"type"
],
"properties": {
"bit_precision": {
@@ -140,24 +88,36 @@
],
"description": "Quantization bit precision"
},
"method": {
"type": {
"type": "string",
"enum": [
"None",
"dynamic",
"static"
],
"description": "Quantization type"
},
"method": {
"type": "string",
"enum": [
"None",
"AWQ",
"PTQ",
"GPTQ",
"GGUF",
"QAT",
"BitsAndBytes",
"SmoothQuant",
"HQQ",
"Quanto"
],
"description": "Quantization method"
}
}
},
"generation_args": {
"type": "object",
"properties": {
"use_vllm": {
"type": "boolean",
"description": "Whether VLLM was used for inference"
},
"temperature": {
"type": [
"null",
@@ -191,6 +151,52 @@
"type": "string"
},
"default": []
},
"seed": {
"type": [
"null",
"number"
],
"description": "Random seed parameter"
},
"frequency_penalty": {
"type": [
"null",
"number"
],
"description": "Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model’s likelihood to repeat the same line verbatim"
},
"presence_penalty": {
"type": [
"null",
"number"
],
"description": "Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model’s likelihood to talk about new topics."
},
"logprobs": {
"type": [
"null",
"number"
],
"description": "Return log probabilities of the output tokens or not"
},
"top_logprobs": {
"type": "integer",
"description": "Number of most likely tokens (0-20) to return at each token position"
},
"logit_bias": {
"type": [
"null",
"object"
],
"description": "Map token Ids (float keys) to an associated bias value (integer)",
"additionalProperties": {
"type": "integer"
},
"propertyNames": {
"pattern": "^-?\\d+(\\.\\d+)?$",
"description": "Keys must be valid floats"
}
}
}
}
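For orientation, a sketch of an `inference_settings` instance exercising the fields added in the hunks above; the values are illustrative, not taken from the PR, and the enclosing document must still satisfy the rest of the schema:

```python
# Illustrative instance of the updated sub-schema (values are made up).
inference_settings = {
    "quantization": {
        "bit_precision": "int8",  # existing enum value
        "type": "dynamic",        # newly required: "None" | "dynamic" | "static"
        "method": "GPTQ",         # expanded enum: AWQ, PTQ, GPTQ, GGUF, QAT, ...
    },
    "generation_args": {
        "temperature": 0.7,
        "seed": 42,                     # new: nullable number
        "frequency_penalty": 0.5,       # new: -2.0 to 2.0
        "presence_penalty": 0.0,        # new: -2.0 to 2.0
        "logprobs": 1,                  # new: nullable number
        "top_logprobs": 5,              # new: 0-20
        "logit_bias": {"50256": -100},  # new: numeric-string keys -> integer bias
    },
}
```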
@@ -571,6 +577,46 @@
}
]
}
},
"full_logprobs": {
"additionalProperties": false,
"items": {
"oneOf": [
{
"type": [
"null"
]
},
{
"type": "array",
"items": {
"type": "array",
"items": {
"type": "object",
"required": [
"token_id",
"logprob",
"decoded_token"
],
"properties": {
"token_id": {
"type": "number",
"description": "Id of token for which we keep its logprob"
},
"logprob": {
"type": "number",
"description": "Log probability of the token"
},
"decoded_token": {
"type": "string",
"description": "The decoded string representation of the token"
}
}
}
}
}
]
}
}
}
},
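Similarly, a sketch of data matching the new `full_logprobs` shape: each element of the outer list is either null or a per-position list of candidate-token records (values below are made up):

```python
# Hypothetical full_logprobs payload conforming to the hunk above.
full_logprobs = [
    None,  # logprobs were not recorded for this element
    [
        [  # position 0: candidate tokens with their log probabilities
            {"token_id": 464, "logprob": -0.12, "decoded_token": "The"},
            {"token_id": 32, "logprob": -2.31, "decoded_token": "A"},
        ],
        [  # position 1
            {"token_id": 3290, "logprob": -0.45, "decoded_token": " dog"},
        ],
    ],
]
```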
80 changes: 47 additions & 33 deletions schema/eval_types.py
@@ -1,6 +1,6 @@
# generated by datamodel-codegen:
# filename: eval.schema.json
# timestamp: 2025-07-06T13:59:25+00:00
# timestamp: 2025-09-04T14:56:48+00:00

from __future__ import annotations

@@ -30,30 +30,6 @@ class ModelInfo(BaseModel):
family: Optional[Family] = Field(None, description='Model family')


class Architecture(Enum):
transformer = 'transformer'
moe = 'moe'
ssm = 'ssm'
NoneType_None = None


class Configuration(BaseModel):
architecture: Optional[Architecture] = Field(
None, description='Model architecture type'
)
parameters: Optional[conint(ge=1)] = Field(
None, description='Number of parameters in billions'
)
context_window: conint(ge=1) = Field(
..., description='Maximum context window size in tokens'
)
is_instruct: Optional[bool] = Field(
None, description='Whether the model is instruction-tuned'
)
hf_path: Optional[str] = Field(None, description='HuggingFace model path')
revision: Optional[str] = Field(None, description='Model revision/commit hash')


class BitPrecision(Enum):
none = 'none'
int8 = 'int8'
@@ -62,21 +38,32 @@ class BitPrecision(Enum):
float32 = 'float32'


class Method(Enum):
class Type(Enum):
None_ = 'None'
dynamic = 'dynamic'
static = 'static'


class Method(Enum):
None_ = 'None'
AWQ = 'AWQ'
PTQ = 'PTQ'
GPTQ = 'GPTQ'
GGUF = 'GGUF'
QAT = 'QAT'
BitsAndBytes = 'BitsAndBytes'
SmoothQuant = 'SmoothQuant'
HQQ = 'HQQ'
Quanto = 'Quanto'


class Quantization(BaseModel):
bit_precision: BitPrecision = Field(..., description='Quantization bit precision')
type: Type = Field(..., description='Quantization type')
method: Method = Field(..., description='Quantization method')


class GenerationArgs(BaseModel):
use_vllm: Optional[bool] = Field(
None, description='Whether VLLM was used for inference'
)
temperature: Optional[float] = Field(None, description='Sampling temperature')
top_p: Optional[float] = Field(None, description='Nucleus sampling parameter')
top_k: Optional[float] = Field(None, description='Top-k sampling parameter')
@@ -86,6 +73,26 @@ class GenerationArgs(BaseModel):
stop_sequences: Optional[List[str]] = Field(
[], description='Sequences that stop generation'
)
seed: Optional[float] = Field(None, description='Random seed parameter')
frequency_penalty: Optional[float] = Field(
None,
description='Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model’s likelihood to repeat the same line verbatim.',
)
presence_penalty: Optional[float] = Field(
None,
description='Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model’s likelihood to talk about new topics.',
)
logprobs: Optional[float] = Field(
None, description='Whether to return log probabilities of the output tokens'
)
top_logprobs: Optional[int] = Field(
None,
description='Number of most likely tokens (0-20) to return at each token position',
)
logit_bias: Optional[Dict[str, Any]] = Field(
None,
description='Maps token IDs (numeric-string keys) to an associated bias value (integer)',
)


class InferenceSettings(BaseModel):
Expand All @@ -98,10 +105,6 @@ class Model(BaseModel):
...,
description='Basic identifying information about the model - represents the core identity and naming of the model without technical details',
)
configuration: Configuration = Field(
...,
description="Technical specifications and implementation details of the model - defines how the model is structured and where it's hosted",
)
inference_settings: InferenceSettings = Field(
...,
description='Runtime settings and parameters for model inference - controls how the model generates outputs and performs during execution',
@@ -258,6 +261,16 @@ class GeneratedTokensLogprob(BaseModel):
)


class FullLogprob(BaseModel):
token_id: float = Field(
..., description='ID of the token whose logprob is stored'
)
logprob: float = Field(..., description='Log probability of the token')
decoded_token: str = Field(
..., description='The decoded string representation of the token'
)


class Output(BaseModel):
response: str = Field(..., description="The model's complete text response")
cumulative_logprob: Optional[float] = Field(
Expand All @@ -267,6 +280,7 @@ class Output(BaseModel):
generated_tokens_logprobs: Optional[
List[Union[Optional[str], List[GeneratedTokensLogprob]]]
] = None
full_logprobs: Optional[List[Optional[List[List[FullLogprob]]]]] = None


class EvaluationMethod(BaseModel):
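To sanity-check the regenerated models, a minimal sketch assuming the module is importable as `schema.eval_types`:

```python
from schema.eval_types import (
    BitPrecision,
    FullLogprob,
    GenerationArgs,
    Method,
    Quantization,
    Type,
)

# "type" is now required alongside bit_precision and method.
quant = Quantization(
    bit_precision=BitPrecision.int8,
    type=Type.dynamic,
    method=Method.GPTQ,
)

# All generation arguments remain optional; only a few are set here.
gen = GenerationArgs(
    temperature=0.7,
    seed=42,
    frequency_penalty=0.5,
    logit_bias={"50256": -100},
)

token = FullLogprob(token_id=464, logprob=-0.12, decoded_token="The")
print(quant.model_dump(), gen.seed, token.decoded_token)
```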