From a2f96de4594e1e6cbff6d6c6f100c06b41c8f1df Mon Sep 17 00:00:00 2001
From: Damian Stachura
Date: Thu, 4 Sep 2025 15:31:40 +0200
Subject: [PATCH 1/4] Update Quantization related to
 https://github.com/evaleval/evalHub/issues/15

---
 pyproject.toml          |  1 +
 schema/README.md        |  4 ++++
 schema/eval.schema.json | 21 +++++++++++++++++++--
 schema/eval_types.py    | 18 ++++++++++++++++--
 uv.lock                 | 12 ++++++++++++
 5 files changed, 52 insertions(+), 4 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index dc5edf3..263d672 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,6 +7,7 @@ requires-python = ">=3.12"
 dependencies = [
     "crfm-helm>=0.5.6",
     "dacite>=1.9.2",
+    "datamodel-code-generator>=0.31.2",
     "numpy>=2.3.1",
     "openai>=1.93.0",
     "pandas>=2.3.0",
diff --git a/schema/README.md b/schema/README.md
index 2bc9a33..9e3dd3c 100644
--- a/schema/README.md
+++ b/schema/README.md
@@ -7,7 +7,11 @@ To generate or update Python types from the JSON schema, you can run the followi
 ```bash
 datamodel-codegen --input eval.schema.json --output eval_types.py --class-name EvaluationResult --output-model-type pydantic_v2.BaseModel
 ```
+or, from the main directory:
+```bash
+uv run datamodel-codegen --input schema/eval.schema.json --input-file-type jsonschema --output schema/eval_types.py --class-name EvaluationResult --output-model-type pydantic_v2.BaseModel
+```
 
 ## Example Data
 
 Please refer to [this file](./eval.example.json) for a minimal data example adhering to the evaluation format schema.
diff --git a/schema/eval.schema.json b/schema/eval.schema.json
index 3b21d0a..f51fc87 100644
--- a/schema/eval.schema.json
+++ b/schema/eval.schema.json
@@ -126,7 +126,8 @@
         "type": "object",
         "required": [
           "bit_precision",
-          "method"
+          "method",
+          "type"
         ],
         "properties": {
           "bit_precision": {
@@ -140,13 +141,29 @@
             ],
             "description": "Quantization bit precision"
           },
-          "method": {
+          "type": {
             "type": "string",
             "enum": [
               "None",
               "dynamic",
               "static"
             ],
+            "description": "Quantization type"
+          },
+          "method": {
+            "type": "string",
+            "enum": [
+              "None",
+              "AWQ",
+              "PTQ",
+              "GPTQ",
+              "GGUF",
+              "QAT",
+              "BitsAndBytes",
+              "SmoothQuant",
+              "HQQ",
+              "Quanto"
+            ],
             "description": "Quantization method"
           }
         }
diff --git a/schema/eval_types.py b/schema/eval_types.py
index 98e4eed..686aa25 100644
--- a/schema/eval_types.py
+++ b/schema/eval_types.py
@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 #   filename:  eval.schema.json
-#   timestamp: 2025-07-06T13:59:25+00:00
+#   timestamp: 2025-09-04T13:29:15+00:00
 
 from __future__ import annotations
 
@@ -62,14 +62,28 @@ class BitPrecision(Enum):
     float32 = 'float32'
 
 
-class Method(Enum):
+class Type(Enum):
     None_ = 'None'
     dynamic = 'dynamic'
     static = 'static'
 
 
+class Method(Enum):
+    None_ = 'None'
+    AWQ = 'AWQ'
+    PTQ = 'PTQ'
+    GPTQ = 'GPTQ'
+    GGUF = 'GGUF'
+    QAT = 'QAT'
+    BitsAndBytes = 'BitsAndBytes'
+    SmoothQuant = 'SmoothQuant'
+    HQQ = 'HQQ'
+    Quanto = 'Quanto'
+
+
 class Quantization(BaseModel):
     bit_precision: BitPrecision = Field(..., description='Quantization bit precision')
+    type: Type = Field(..., description='Quantization type')
     method: Method = Field(..., description='Quantization method')
diff --git a/uv.lock b/uv.lock
index 130c14e..7fb48cf 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1,4 +1,5 @@
 version = 1
+revision = 1
 requires-python = ">=3.12"
 
 [[package]]
@@ -503,6 +504,7 @@ source = { virtual = "." }
 dependencies = [
     { name = "crfm-helm" },
     { name = "dacite" },
+    { name = "datamodel-code-generator" },
     { name = "numpy" },
     { name = "openai" },
     { name = "pandas" },
@@ -518,12 +520,14 @@ dev = [
     { name = "pylint" },
     { name = "pytest" },
     { name = "ruff" },
+    { name = "wget" },
 ]
 
 [package.metadata]
 requires-dist = [
     { name = "crfm-helm", specifier = ">=0.5.6" },
     { name = "dacite", specifier = ">=1.9.2" },
+    { name = "datamodel-code-generator", specifier = ">=0.31.2" },
     { name = "datamodel-code-generator", marker = "extra == 'dev'", specifier = ">=0.31.2" },
     { name = "numpy", specifier = ">=2.3.1" },
     { name = "openai", specifier = ">=1.93.0" },
@@ -535,7 +539,9 @@ requires-dist = [
     { name = "requests", specifier = ">=2.32.4" },
     { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.12.2" },
     { name = "transformers", specifier = ">=4.53.1" },
+    { name = "wget", marker = "extra == 'dev'", specifier = ">=3.2" },
 ]
+provides-extras = ["dev"]
 
 [[package]]
 name = "filelock"
@@ -2449,6 +2455,12 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/2a/87/abd57374044e1f627f0a905ac33c1a7daab35a3a815abfea4e1bafd3fdb1/weasel-0.4.1-py3-none-any.whl", hash = "sha256:24140a090ea1ac512a2b2f479cc64192fd1d527a7f3627671268d08ed5ac418c", size = 50270 },
 ]
 
+[[package]]
+name = "wget"
+version = "3.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip", hash = "sha256:35e630eca2aa50ce998b9b1a127bb26b30dfee573702782aa982f875e3f16061", size = 10857 }
+
 [[package]]
 name = "wrapt"
 version = "1.17.2"

From 77e0b5906c8353080d03359e86a94407d5e2f41b Mon Sep 17 00:00:00 2001
From: Damian Stachura
Date: Thu, 4 Sep 2025 15:37:00 +0200
Subject: [PATCH 2/4] Remove Configuration class wrt
 https://github.com/evaleval/evalHub/pull/10 as probably not necessary

---
 schema/eval.schema.json | 53 -----------------------------------------
 1 file changed, 53 deletions(-)

diff --git a/schema/eval.schema.json b/schema/eval.schema.json
index f51fc87..5703bea 100644
--- a/schema/eval.schema.json
+++ b/schema/eval.schema.json
@@ -62,59 +62,6 @@
         }
       }
     },
-    "configuration": {
-      "type": "object",
-      "description": "Technical specifications and implementation details of the model - defines how the model is structured and where it's hosted",
-      "required": [
-        "context_window"
-      ],
-      "properties": {
-        "architecture": {
-          "type": [
-            "string",
-            "null"
-          ],
-          "enum": [
-            "transformer",
-            "moe",
-            "ssm",
-            null
-          ],
-          "description": "Model architecture type"
-        },
-        "parameters": {
-          "type": [
-            "integer",
-            "null"
-          ],
-          "minimum": 1,
-          "description": "Number of parameters in billions"
-        },
-        "context_window": {
-          "type": "integer",
-          "minimum": 1,
-          "description": "Maximum context window size in tokens"
-        },
-        "is_instruct": {
-          "type": "boolean",
-          "description": "Whether the model is instruction-tuned"
-        },
-        "hf_path": {
-          "type": [
-            "string",
-            "null"
-          ],
-          "description": "HuggingFace model path"
-        },
-        "revision": {
-          "type": [
-            "string",
-            "null"
-          ],
-          "description": "Model revision/commit hash"
-        }
-      }
-    },
     "inference_settings": {
       "type": "object",
       "description": "Runtime settings and parameters for model inference - controls how the model generates outputs and performs during execution",

From 8292dd94eb45c1ec64e1c880ddf54a164cf25ad6 Mon Sep 17 00:00:00 2001
From: Damian Stachura
Date: Thu, 4 Sep 2025 16:33:45 +0200
Subject: [PATCH 3/4] Extension for generation_args wrt
 https://github.com/evaleval/evalHub/issues/13

---
 schema/eval.schema.json | 50 ++++++++++++++++++++++++++++++++++++++----
 schema/eval_types.py    | 53 ++++++++++++++++-------------------------
 2 files changed, 67 insertions(+), 36 deletions(-)

diff --git a/schema/eval.schema.json b/schema/eval.schema.json
index 5703bea..13fe0c3 100644
--- a/schema/eval.schema.json
+++ b/schema/eval.schema.json
@@ -118,10 +118,6 @@
       "generation_args": {
         "type": "object",
         "properties": {
-          "use_vllm": {
-            "type": "boolean",
-            "description": "Whether VLLM was used for inference"
-          },
           "temperature": {
             "type": [
               "null",
@@ -155,6 +151,52 @@
               "type": "string"
             },
             "default": []
+          },
+          "seed": {
+            "type": [
+              "null",
+              "number"
+            ],
+            "description": "Random seed parameter"
+          },
+          "frequency_penalty": {
+            "type": [
+              "null",
+              "number"
+            ],
+            "description": "Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model’s likelihood to repeat the same line verbatim."
+          },
+          "presence_penalty": {
+            "type": [
+              "null",
+              "number"
+            ],
+            "description": "Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model’s likelihood to talk about new topics."
+          },
+          "logprobs": {
+            "type": [
+              "null",
+              "number"
+            ],
+            "description": "Whether to return log probabilities of the output tokens"
+          },
+          "top_logprobs": {
+            "type": "integer",
+            "description": "Number of most likely tokens (0-20) to return at each token position"
+          },
+          "logit_bias": {
+            "type": [
+              "null",
+              "object"
+            ],
+            "description": "Maps token IDs (keys formatted as floats) to an associated integer bias value",
+            "additionalProperties": {
+              "type": "integer"
+            },
+            "propertyNames": {
+              "pattern": "^-?\\d+(\\.\\d+)?$",
+              "description": "Keys must be valid floats"
+            }
           }
         }
       }
diff --git a/schema/eval_types.py b/schema/eval_types.py
index 686aa25..db38368 100644
--- a/schema/eval_types.py
+++ b/schema/eval_types.py
@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 #   filename:  eval.schema.json
-#   timestamp: 2025-09-04T13:29:15+00:00
+#   timestamp: 2025-09-04T14:32:50+00:00
 
 from __future__ import annotations
 
@@ -30,30 +30,6 @@ class ModelInfo(BaseModel):
     family: Optional[Family] = Field(None, description='Model family')
 
 
-class Architecture(Enum):
-    transformer = 'transformer'
-    moe = 'moe'
-    ssm = 'ssm'
-    NoneType_None = None
-
-
-class Configuration(BaseModel):
-    architecture: Optional[Architecture] = Field(
-        None, description='Model architecture type'
-    )
-    parameters: Optional[conint(ge=1)] = Field(
-        None, description='Number of parameters in billions'
-    )
-    context_window: conint(ge=1) = Field(
-        ..., description='Maximum context window size in tokens'
-    )
-    is_instruct: Optional[bool] = Field(
-        None, description='Whether the model is instruction-tuned'
-    )
-    hf_path: Optional[str] = Field(None, description='HuggingFace model path')
-    revision: Optional[str] = Field(None, description='Model revision/commit hash')
-
-
 class BitPrecision(Enum):
     none = 'none'
     int8 = 'int8'
@@ -88,9 +64,6 @@ class Quantization(BaseModel):
 
 
 class GenerationArgs(BaseModel):
-    use_vllm: Optional[bool] = Field(
-        None, description='Whether VLLM was used for inference'
-    )
     temperature: Optional[float] = Field(None, description='Sampling temperature')
     top_p: Optional[float] = Field(None, description='Nucleus sampling parameter')
     top_k: Optional[float] = Field(None, description='Top-k sampling parameter')
@@ -100,6 +73,26 @@
     stop_sequences: Optional[List[str]] = Field(
         [], description='Sequences that stop generation'
     )
+    seed: Optional[float] = Field(None, description='Random seed parameter')
+    frequency_penalty: Optional[float] = Field(
+        None,
+        description='Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model’s likelihood to repeat the same line verbatim.',
+    )
+    presence_penalty: Optional[float] = Field(
+        None,
+        description='Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model’s likelihood to talk about new topics.',
+    )
+    logprobs: Optional[float] = Field(
+        None, description='Whether to return log probabilities of the output tokens'
+    )
+    top_logprobs: Optional[int] = Field(
+        None,
+        description='Number of most likely tokens (0-20) to return at each token position',
+    )
+    logit_bias: Optional[Dict[str, Any]] = Field(
+        None,
+        description='Maps token IDs (keys formatted as floats) to an associated integer bias value',
+    )
 
 
 class InferenceSettings(BaseModel):
@@ -112,10 +105,6 @@ class Model(BaseModel):
     model_info: ModelInfo = Field(
         ...,
         description='Basic identifying information about the model - represents the core identity and naming of the model without technical details',
     )
-    configuration: Configuration = Field(
-        ...,
-        description="Technical specifications and implementation details of the model - defines how the model is structured and where it's hosted",
-    )
     inference_settings: InferenceSettings = Field(
         ...,
         description='Runtime settings and parameters for model inference - controls how the model generates outputs and performs during execution',

From f3705b9ca46ebf6952ac82e55fec72534216a2b0 Mon Sep 17 00:00:00 2001
From: Damian Stachura
Date: Thu, 4 Sep 2025 16:59:05 +0200
Subject: [PATCH 4/4] Add support for full logprobs for all steps wrt
 https://github.com/evaleval/evalHub/issues/21

---
 schema/eval.schema.json | 40 ++++++++++++++++++++++++++++++++++++++++
 schema/eval_types.py    | 13 ++++++++++++-
 2 files changed, 52 insertions(+), 1 deletion(-)

diff --git a/schema/eval.schema.json b/schema/eval.schema.json
index 13fe0c3..72953b8 100644
--- a/schema/eval.schema.json
+++ b/schema/eval.schema.json
@@ -577,6 +577,46 @@
             }
           ]
         }
+      },
+      "full_logprobs": {
+        "additionalProperties": false,
+        "items": {
+          "oneOf": [
+            {
+              "type": [
+                "null"
+              ]
+            },
+            {
+              "type": "array",
+              "items": {
+                "type": "array",
+                "items": {
+                  "type": "object",
+                  "required": [
+                    "token_id",
+                    "logprob",
+                    "decoded_token"
+                  ],
+                  "properties": {
+                    "token_id": {
+                      "type": "number",
+                      "description": "ID of the token whose logprob is kept"
+                    },
+                    "logprob": {
+                      "type": "number",
+                      "description": "Log probability of the token"
+                    },
+                    "decoded_token": {
+                      "type": "string",
+                      "description": "The decoded string representation of the token"
+                    }
+                  }
+                }
+              }
+            }
+          ]
+        }
       }
     }
   },
diff --git a/schema/eval_types.py b/schema/eval_types.py
index db38368..ecc0e21 100644
--- a/schema/eval_types.py
+++ b/schema/eval_types.py
@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 #   filename:  eval.schema.json
-#   timestamp: 2025-09-04T14:32:50+00:00
+#   timestamp: 2025-09-04T14:56:48+00:00
 
 from __future__ import annotations
 
@@ -261,6 +261,16 @@ class GeneratedTokensLogprob(BaseModel):
     )
 
 
+class FullLogprob(BaseModel):
+    token_id: float = Field(
+        ..., description='ID of the token whose logprob is kept'
+    )
+    logprob: float = Field(..., description='Log probability of the token')
+    decoded_token: str = Field(
+        ..., description='The decoded string representation of the token'
+    )
+
+
 class Output(BaseModel):
     response: str = Field(..., description="The model's complete text response")
     cumulative_logprob: Optional[float] = Field(
@@ -270,6 +280,7 @@
     generated_tokens_logprobs: Optional[
         List[Union[Optional[str], List[GeneratedTokensLogprob]]]
     ] = None
+    full_logprobs: Optional[List[Optional[List[List[FullLogprob]]]]] = None
 
 
 class EvaluationMethod(BaseModel):
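
As a quick sanity check of the models regenerated by this series, here is a minimal sketch. It assumes the updated schema/eval_types.py is importable as `eval_types`; all field values are illustrative only and are not taken from the schema's example data:

```python
# Smoke test for the fields added in this series (PATCH 1, 3 and 4).
# Assumes the regenerated schema/eval_types.py is on the import path.
from eval_types import FullLogprob, GenerationArgs, Quantization

# PATCH 3: new sampling and penalty parameters on GenerationArgs.
args = GenerationArgs(
    temperature=0.7,
    seed=42,
    frequency_penalty=0.5,
    presence_penalty=-0.5,
    top_logprobs=5,
    logit_bias={"50256": -100},  # key matches the float-formatted pattern
)

# PATCH 1: quantization now carries both a `type` and a `method`.
quant = Quantization(bit_precision="int8", type="static", method="GPTQ")

# PATCH 4: one per-token entry of a full_logprobs matrix.
entry = FullLogprob(token_id=50256, logprob=-0.12, decoded_token=" the")

print(args.model_dump_json(exclude_none=True))
print(quant.model_dump_json())
print(entry.model_dump_json())
```

All of the new GenerationArgs fields default to None in the generated model, so payloads that omit them remain valid.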