From 581ebca7fdeef23e9420e705229a76e0e6750ae7 Mon Sep 17 00:00:00 2001 From: Sara Robinson Date: Fri, 30 Jan 2026 12:31:35 -0800 Subject: [PATCH] feat: Add UnifiedMetric support to Vertex Tuning evaluation config PiperOrigin-RevId: 863347730 --- google/genai/_transformers.py | 13 + google/genai/tests/tunings/test_tune.py | 50 +++ google/genai/types.py | 540 ++++++++++++++++++------ 3 files changed, 471 insertions(+), 132 deletions(-) diff --git a/google/genai/_transformers.py b/google/genai/_transformers.py index 91a32a881..2f8fd3d78 100644 --- a/google/genai/_transformers.py +++ b/google/genai/_transformers.py @@ -1295,6 +1295,19 @@ def t_metrics( metrics_payload = [] for metric in metrics: + + if isinstance(metric, dict) and types.UnifiedMetric.model_validate(metric): + metric = types.UnifiedMetric.model_validate(metric) + + if isinstance(metric, types.UnifiedMetric): + unified_metric_payload: dict[str, Any] = metric.model_dump() + unified_metric_payload['aggregation_metrics'] = [ + 'AVERAGE', + 'STANDARD_DEVIATION', + ] + metrics_payload.append(unified_metric_payload) + continue + metric_payload_item: dict[str, Any] = {} metric_payload_item['aggregation_metrics'] = [ 'AVERAGE', diff --git a/google/genai/tests/tunings/test_tune.py b/google/genai/tests/tunings/test_tune.py index f5cc38fe4..f741a2f6b 100755 --- a/google/genai/tests/tunings/test_tune.py +++ b/google/genai/tests/tunings/test_tune.py @@ -344,6 +344,56 @@ def test_eval_config_with_metrics(client): assert tuning_job.state == genai_types.JobState.JOB_STATE_PENDING +@pytest.mark.skipif( + "config.getoption('--private')", + reason="Skipping in pre-public tests" +) +def test_eval_config_with_unified_metrics(client): + """Tests tuning with eval config metrics.""" + if client._api_client.vertexai: + evaluation_config=genai_types.EvaluationConfig( + metrics=[ + genai_types.Metric( + name="prompt-relevance", + prompt_template="How well does the response address the prompt?: PROMPT: {request}\n RESPONSE: {response}\n", + return_raw_output=True, + judge_model_system_instruction="You are a cat. 
Make all evaluations from this perspective.", + ), + genai_types.Metric(name='bleu'), + genai_types.Metric(name='rouge_1'), + genai_types.UnifiedMetric( + bleu_spec=genai_types.BleuSpec( + use_effective_order=True + ) + ), + ], + output_config=genai_types.OutputConfig( + gcs_destination=genai_types.GcsDestination( + output_uri_prefix="gs://sararob_test/" + ) + ), + autorater_config=genai_types.AutoraterConfig( + sampling_count=1, + autorater_model="test-model", + ), + ) + tuning_job = client.tunings.tune( + base_model="gemini-2.5-flash", + training_dataset=genai_types.TuningDataset(gcs_uri="gs://cloud-samples-data/ai-platform/generative_ai/gemini-2_0/text/sft_train_data.jsonl"), + config=genai_types.CreateTuningJobConfig( + tuned_model_display_name="tuning job with eval config", + epoch_count=1, + learning_rate_multiplier=1.0, + adapter_size="ADAPTER_SIZE_ONE", + validation_dataset=genai_types.TuningValidationDataset( + gcs_uri="gs://cloud-samples-data/ai-platform/generative_ai/gemini-2_0/text/sft_validation_data.jsonl" + ), + evaluation_config=evaluation_config, + ), + ) + assert tuning_job.state == genai_types.JobState.JOB_STATE_PENDING + + @pytest.mark.skipif( "config.getoption('--private')", reason="Skipping in pre-public tests" diff --git a/google/genai/types.py b/google/genai/types.py index 73e6ef6bc..48610c88e 100644 --- a/google/genai/types.py +++ b/google/genai/types.py @@ -609,6 +609,19 @@ class TuningTask(_common.CaseInSensitiveEnum): """Tuning task for reference to video.""" +class RubricContentType(_common.CaseInSensitiveEnum): + """Represents the rubric content type.""" + + RUBRIC_CONTENT_TYPE_UNSPECIFIED = 'RUBRIC_CONTENT_TYPE_UNSPECIFIED' + """Rubric content type is unspecified.""" + PROPERTY = 'PROPERTY' + """Generate rubrics based on properties.""" + NL_QUESTION_ANSWER = 'NL_QUESTION_ANSWER' + """Generate rubrics in an NL question answer format.""" + PYTHON_CODE_ASSERTION = 'PYTHON_CODE_ASSERTION' + """Generate rubrics in a unit test format.""" + + class PartMediaResolutionLevel(_common.CaseInSensitiveEnum): """The tokenization quality used for given media.""" @@ -1004,6 +1017,21 @@ class LiveMusicPlaybackControl(_common.CaseInSensitiveEnum): Retains the current prompts and config.""" +class ComputationBasedMetricType(_common.CaseInSensitiveEnum): + """Represents the type of the computation based metric.""" + + COMPUTATION_BASED_METRIC_TYPE_UNSPECIFIED = ( + 'COMPUTATION_BASED_METRIC_TYPE_UNSPECIFIED' + ) + """Computation based metric type is unspecified.""" + EXACT_MATCH = 'EXACT_MATCH' + """Exact match metric.""" + BLEU = 'BLEU' + """BLEU metric.""" + ROUGE = 'ROUGE' + """ROUGE metric.""" + + class PartMediaResolution(_common.BaseModel): """Media resolution for the input media.""" @@ -5681,6 +5709,10 @@ def _rename_citation_sources(cls, data: Any) -> Any: return data +MetricUnion = Union['Metric', 'UnifiedMetric'] +MetricUnionDict = Union['MetricDict', 'UnifiedMetricDict'] + + class CitationMetadataDict(TypedDict, total=False): """Citation information when the model quotes another source.""" @@ -10772,140 +10804,10 @@ class AutoraterConfigDict(TypedDict, total=False): AutoraterConfigOrDict = Union[AutoraterConfig, AutoraterConfigDict] -class Metric(_common.BaseModel): - """The metric used for evaluation.""" - - name: Optional[str] = Field( - default=None, description="""The name of the metric.""" - ) - custom_function: Optional[Callable[..., Any]] = Field( - default=None, - description="""The custom function that defines the end-to-end logic for metric computation.""", 
- ) - prompt_template: Optional[str] = Field( - default=None, description="""The prompt template for the metric.""" - ) - judge_model_system_instruction: Optional[str] = Field( - default=None, - description="""The system instruction for the judge model.""", - ) - return_raw_output: Optional[bool] = Field( - default=None, - description="""Whether to return the raw output from the judge model.""", - ) - parse_and_reduce_fn: Optional[Callable[..., Any]] = Field( - default=None, - description="""The parse and reduce function for the judge model.""", - ) - aggregate_summary_fn: Optional[Callable[..., Any]] = Field( - default=None, - description="""The aggregate summary function for the judge model.""", - ) - - # Allow extra fields to support metric-specific config fields. - model_config = ConfigDict(extra='allow') - - _is_predefined: bool = PrivateAttr(default=False) - """A boolean indicating whether the metric is predefined.""" - - _config_source: Optional[str] = PrivateAttr(default=None) - """An optional string indicating the source of the metric configuration.""" - - _version: Optional[str] = PrivateAttr(default=None) - """An optional string indicating the version of the metric.""" - - @model_validator(mode='after') # type: ignore[arg-type] - def validate_name(self) -> 'Metric': - if not self.name: - raise ValueError('Metric name cannot be empty.') - self.name = self.name.lower() - return self - - def to_yaml_file(self, file_path: str, version: Optional[str] = None) -> None: - """Dumps the metric object to a YAML file. - - Args: - file_path: The path to the YAML file. - version: Optional version string to include in the YAML output. - - Raises: - ImportError: If the pyyaml library is not installed. - """ - try: - import yaml - except ImportError: - raise ImportError( - 'YAML serialization requires the pyyaml library. Please install' - " it using 'pip install google-cloud-aiplatform[evaluation]'." 
- ) - - fields_to_exclude_callables = set() - for field_name, field_info in self.model_fields.items(): - annotation = field_info.annotation - origin = typing.get_origin(annotation) - - is_field_callable_type = False - if annotation is Callable or origin is Callable: # type: ignore[comparison-overlap] - is_field_callable_type = True - elif origin is Union: - args = typing.get_args(annotation) - if any( - arg is Callable or typing.get_origin(arg) is Callable - for arg in args - ): - is_field_callable_type = True - - if is_field_callable_type: - fields_to_exclude_callables.add(field_name) - - data_to_dump = self.model_dump( - exclude_unset=True, - exclude_none=True, - mode='json', - exclude=fields_to_exclude_callables - if fields_to_exclude_callables - else None, - ) - - if version: - data_to_dump['version'] = version - - with open(file_path, 'w', encoding='utf-8') as f: - yaml.dump(data_to_dump, f, sort_keys=False, allow_unicode=True) - - -class MetricDict(TypedDict, total=False): - """The metric used for evaluation.""" - - name: Optional[str] - """The name of the metric.""" - - custom_function: Optional[Callable[..., Any]] - """The custom function that defines the end-to-end logic for metric computation.""" - - prompt_template: Optional[str] - """The prompt template for the metric.""" - - judge_model_system_instruction: Optional[str] - """The system instruction for the judge model.""" - - return_raw_output: Optional[bool] - """Whether to return the raw output from the judge model.""" - - parse_and_reduce_fn: Optional[Callable[..., Any]] - """The parse and reduce function for the judge model.""" - - aggregate_summary_fn: Optional[Callable[..., Any]] - """The aggregate summary function for the judge model.""" - - -MetricOrDict = Union[Metric, MetricDict] - - class EvaluationConfig(_common.BaseModel): """Evaluation config for tuning.""" - metrics: Optional[list[Metric]] = Field( + metrics: Optional[list[MetricUnion]] = Field( default=None, description="""The metrics used for evaluation.""" ) output_config: Optional[OutputConfig] = Field( @@ -10919,7 +10821,7 @@ class EvaluationConfig(_common.BaseModel): class EvaluationConfigDict(TypedDict, total=False): """Evaluation config for tuning.""" - metrics: Optional[list[MetricDict]] + metrics: Optional[list[MetricUnionDict]] """The metrics used for evaluation.""" output_config: Optional[OutputConfigDict] @@ -18504,6 +18406,136 @@ class BleuSpecDict(TypedDict, total=False): BleuSpecOrDict = Union[BleuSpec, BleuSpecDict] +class Metric(_common.BaseModel): + """The metric used for evaluation.""" + + name: Optional[str] = Field( + default=None, description="""The name of the metric.""" + ) + custom_function: Optional[Callable[..., Any]] = Field( + default=None, + description="""The custom function that defines the end-to-end logic for metric computation.""", + ) + prompt_template: Optional[str] = Field( + default=None, description="""The prompt template for the metric.""" + ) + judge_model_system_instruction: Optional[str] = Field( + default=None, + description="""The system instruction for the judge model.""", + ) + return_raw_output: Optional[bool] = Field( + default=None, + description="""Whether to return the raw output from the judge model.""", + ) + parse_and_reduce_fn: Optional[Callable[..., Any]] = Field( + default=None, + description="""The parse and reduce function for the judge model.""", + ) + aggregate_summary_fn: Optional[Callable[..., Any]] = Field( + default=None, + description="""The aggregate summary function for the judge model.""", + 
) + + # Allow extra fields to support metric-specific config fields. + model_config = ConfigDict(extra='allow') + + _is_predefined: bool = PrivateAttr(default=False) + """A boolean indicating whether the metric is predefined.""" + + _config_source: Optional[str] = PrivateAttr(default=None) + """An optional string indicating the source of the metric configuration.""" + + _version: Optional[str] = PrivateAttr(default=None) + """An optional string indicating the version of the metric.""" + + @model_validator(mode='after') # type: ignore[arg-type] + def validate_name(self) -> 'Metric': + if not self.name: + raise ValueError('Metric name cannot be empty.') + self.name = self.name.lower() + return self + + def to_yaml_file(self, file_path: str, version: Optional[str] = None) -> None: + """Dumps the metric object to a YAML file. + + Args: + file_path: The path to the YAML file. + version: Optional version string to include in the YAML output. + + Raises: + ImportError: If the pyyaml library is not installed. + """ + try: + import yaml + except ImportError: + raise ImportError( + 'YAML serialization requires the pyyaml library. Please install' + " it using 'pip install google-cloud-aiplatform[evaluation]'." + ) + + fields_to_exclude_callables = set() + for field_name, field_info in self.model_fields.items(): + annotation = field_info.annotation + origin = typing.get_origin(annotation) + + is_field_callable_type = False + if annotation is Callable or origin is Callable: # type: ignore[comparison-overlap] + is_field_callable_type = True + elif origin is Union: + args = typing.get_args(annotation) + if any( + arg is Callable or typing.get_origin(arg) is Callable + for arg in args + ): + is_field_callable_type = True + + if is_field_callable_type: + fields_to_exclude_callables.add(field_name) + + data_to_dump = self.model_dump( + exclude_unset=True, + exclude_none=True, + mode='json', + exclude=fields_to_exclude_callables + if fields_to_exclude_callables + else None, + ) + + if version: + data_to_dump['version'] = version + + with open(file_path, 'w', encoding='utf-8') as f: + yaml.dump(data_to_dump, f, sort_keys=False, allow_unicode=True) + + +class MetricDict(TypedDict, total=False): + """The metric used for evaluation.""" + + name: Optional[str] + """The name of the metric.""" + + custom_function: Optional[Callable[..., Any]] + """The custom function that defines the end-to-end logic for metric computation.""" + + prompt_template: Optional[str] + """The prompt template for the metric.""" + + judge_model_system_instruction: Optional[str] + """The system instruction for the judge model.""" + + return_raw_output: Optional[bool] + """Whether to return the raw output from the judge model.""" + + parse_and_reduce_fn: Optional[Callable[..., Any]] + """The parse and reduce function for the judge model.""" + + aggregate_summary_fn: Optional[Callable[..., Any]] + """The aggregate summary function for the judge model.""" + + +MetricOrDict = Union[Metric, MetricDict] + + class PairwiseMetricSpec(_common.BaseModel): """Spec for pairwise metric.""" @@ -18617,6 +18649,250 @@ class RougeSpecDict(TypedDict, total=False): RougeSpecOrDict = Union[RougeSpec, RougeSpecDict] +class PredefinedMetricSpec(_common.BaseModel): + """Spec for predefined metric.""" + + metric_spec_name: Optional[str] = Field( + default=None, + description="""The name of a pre-defined metric, such as "instruction_following_v1" or + "text_quality_v1".""", + ) + metric_spec_parameters: Optional[dict[str, Any]] = Field( + default=None, + 
description="""The parameters needed to run the pre-defined metric.""", + ) + + +class PredefinedMetricSpecDict(TypedDict, total=False): + """Spec for predefined metric.""" + + metric_spec_name: Optional[str] + """The name of a pre-defined metric, such as "instruction_following_v1" or + "text_quality_v1".""" + + metric_spec_parameters: Optional[dict[str, Any]] + """The parameters needed to run the pre-defined metric.""" + + +PredefinedMetricSpecOrDict = Union[ + PredefinedMetricSpec, PredefinedMetricSpecDict +] + + +class RubricGenerationSpec(_common.BaseModel): + """Specification for how rubrics should be generated.""" + + prompt_template: Optional[str] = Field( + default=None, + description="""Template for the prompt used to generate rubrics.""", + ) + rubric_content_type: Optional[RubricContentType] = Field( + default=None, + description="""The type of rubric content to be generated.""", + ) + rubric_type_ontology: Optional[list[str]] = Field( + default=None, + description="""An optional, pre-defined list of allowed types for generated rubrics. + If this field is provided, it implies `include_rubric_type` should be true, + and the generated rubric types should be chosen from this ontology.""", + ) + + +class RubricGenerationSpecDict(TypedDict, total=False): + """Specification for how rubrics should be generated.""" + + prompt_template: Optional[str] + """Template for the prompt used to generate rubrics.""" + + rubric_content_type: Optional[RubricContentType] + """The type of rubric content to be generated.""" + + rubric_type_ontology: Optional[list[str]] + """An optional, pre-defined list of allowed types for generated rubrics. + If this field is provided, it implies `include_rubric_type` should be true, + and the generated rubric types should be chosen from this ontology.""" + + +RubricGenerationSpecOrDict = Union[ + RubricGenerationSpec, RubricGenerationSpecDict +] + + +class LLMBasedMetricSpec(_common.BaseModel): + """Specification for an LLM based metric.""" + + metric_prompt_template: Optional[str] = Field( + default=None, + description="""Template for the prompt sent to the judge model.""", + ) + system_instruction: Optional[str] = Field( + default=None, description="""System instruction for the judge model.""" + ) + judge_autorater_config: Optional[AutoraterConfig] = Field( + default=None, + description="""Optional configuration for the judge LLM (Autorater).""", + ) + rubric_group_key: Optional[str] = Field( + default=None, + description="""Use a pre-defined group of rubrics associated with the input. + Refers to a key in the rubric_groups map of EvaluationInstance.""", + ) + predefined_rubric_generation_spec: Optional[PredefinedMetricSpec] = Field( + default=None, + description="""Dynamically generate rubrics using a predefined spec.""", + ) + rubric_generation_spec: Optional[RubricGenerationSpec] = Field( + default=None, + description="""Dynamically generate rubrics using this specification.""", + ) + additional_config: Optional[dict[str, Any]] = Field( + default=None, + description="""Optional. 
Additional configuration for the metric.""",
+  )
+
+
+class LLMBasedMetricSpecDict(TypedDict, total=False):
+  """Specification for an LLM based metric."""
+
+  metric_prompt_template: Optional[str]
+  """Template for the prompt sent to the judge model."""
+
+  system_instruction: Optional[str]
+  """System instruction for the judge model."""
+
+  judge_autorater_config: Optional[AutoraterConfigDict]
+  """Optional configuration for the judge LLM (Autorater)."""
+
+  rubric_group_key: Optional[str]
+  """Use a pre-defined group of rubrics associated with the input.
+      Refers to a key in the rubric_groups map of EvaluationInstance."""
+
+  predefined_rubric_generation_spec: Optional[PredefinedMetricSpecDict]
+  """Dynamically generate rubrics using a predefined spec."""
+
+  rubric_generation_spec: Optional[RubricGenerationSpecDict]
+  """Dynamically generate rubrics using this specification."""
+
+  additional_config: Optional[dict[str, Any]]
+  """Optional. Additional configuration for the metric."""
+
+
+LLMBasedMetricSpecOrDict = Union[LLMBasedMetricSpec, LLMBasedMetricSpecDict]
+
+
+class CustomCodeExecutionSpec(_common.BaseModel):
+  """Specifies a metric that is computed by running user-defined Python functions remotely."""
+
+  evaluation_function: Optional[str] = Field(
+      default=None,
+      description="""A string representing a user-defined function for evaluation.
+      The user is expected to define the following function, e.g.:
+      def evaluate(instance: dict[str, Any]) -> float:
+      Please include this function signature in the code snippet.
+      Instance is the evaluation instance; any fields populated in the instance
+      are available to the function as instance[field_name].""",
+  )
+
+
+class CustomCodeExecutionSpecDict(TypedDict, total=False):
+  """Specifies a metric that is computed by running user-defined Python functions remotely."""
+
+  evaluation_function: Optional[str]
+  """A string representing a user-defined function for evaluation.
+      The user is expected to define the following function, e.g.:
+      def evaluate(instance: dict[str, Any]) -> float:
+      Please include this function signature in the code snippet.
+      Instance is the evaluation instance; any fields populated in the instance
+      are available to the function as instance[field_name]."""
+
+
+CustomCodeExecutionSpecOrDict = Union[
+    CustomCodeExecutionSpec, CustomCodeExecutionSpecDict
+]
+
+
+class ComputationBasedMetricSpec(_common.BaseModel):
+  """Specification for a computation based metric."""
+
+  type: Optional[ComputationBasedMetricType] = Field(
+      default=None, description="""The type of the computation based metric."""
+  )
+  parameters: Optional[dict[str, Any]] = Field(
+      default=None,
+      description="""A map of parameters for the metric. ROUGE example: {"rouge_type": "rougeL", "split_summaries": True, "use_stemmer": True}. BLEU example: {"use_effective_order": True}.""",
+  )
+
+
+class ComputationBasedMetricSpecDict(TypedDict, total=False):
+  """Specification for a computation based metric."""
+
+  type: Optional[ComputationBasedMetricType]
+  """The type of the computation based metric."""
+
+  parameters: Optional[dict[str, Any]]
+  """A map of parameters for the metric. ROUGE example: {"rouge_type": "rougeL", "split_summaries": True, "use_stemmer": True}. 
BLEU example: {"use_effective_order": True}.""" + + +ComputationBasedMetricSpecOrDict = Union[ + ComputationBasedMetricSpec, ComputationBasedMetricSpecDict +] + + +class UnifiedMetric(_common.BaseModel): + """The unified metric used for evaluation.""" + + bleu_spec: Optional[BleuSpec] = Field( + default=None, description="""The Bleu metric spec.""" + ) + rouge_spec: Optional[RougeSpec] = Field( + default=None, description="""The rouge metric spec.""" + ) + pointwise_metric_spec: Optional[PointwiseMetricSpec] = Field( + default=None, description="""The pointwise metric spec.""" + ) + llm_based_metric_spec: Optional[LLMBasedMetricSpec] = Field( + default=None, description="""The spec for an LLM based metric.""" + ) + custom_code_execution_spec: Optional[CustomCodeExecutionSpec] = Field( + default=None, + description="""The spec for a custom code execution metric.""", + ) + predefined_metric_spec: Optional[PredefinedMetricSpec] = Field( + default=None, description="""The spec for a pre-defined metric.""" + ) + computation_based_metric_spec: Optional[ComputationBasedMetricSpec] = Field( + default=None, description="""The spec for a computation based metric.""" + ) + + +class UnifiedMetricDict(TypedDict, total=False): + """The unified metric used for evaluation.""" + + bleu_spec: Optional[BleuSpecDict] + """The Bleu metric spec.""" + + rouge_spec: Optional[RougeSpecDict] + """The rouge metric spec.""" + + pointwise_metric_spec: Optional[PointwiseMetricSpecDict] + """The pointwise metric spec.""" + + llm_based_metric_spec: Optional[LLMBasedMetricSpecDict] + """The spec for an LLM based metric.""" + + custom_code_execution_spec: Optional[CustomCodeExecutionSpecDict] + """The spec for a custom code execution metric.""" + + predefined_metric_spec: Optional[PredefinedMetricSpecDict] + """The spec for a pre-defined metric.""" + + computation_based_metric_spec: Optional[ComputationBasedMetricSpecDict] + """The spec for a computation based metric.""" + + +UnifiedMetricOrDict = Union[UnifiedMetric, UnifiedMetricDict] + + class UploadToFileSearchStoreResponse(_common.BaseModel): """The response when long-running operation for uploading a file to a FileSearchStore complete."""
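
Usage sketch (illustrative only, not part of the patch): with this change, a UnifiedMetric can be passed in EvaluationConfig.metrics alongside existing Metric entries, mirroring the new test above. The project, location, bucket paths, and dataset URIs below are placeholders.

# Minimal sketch, assuming a Vertex AI client; names marked as placeholders are not from the patch.
from google import genai
from google.genai import types as genai_types

client = genai.Client(
    vertexai=True,
    project='my-project',        # placeholder project
    location='us-central1',      # placeholder location
)

evaluation_config = genai_types.EvaluationConfig(
    metrics=[
        # Existing Metric entries continue to work as before.
        genai_types.Metric(name='bleu'),
        # New: a computation-based spec routed through UnifiedMetric,
        # using the ROUGE parameters documented in ComputationBasedMetricSpec.
        genai_types.UnifiedMetric(
            computation_based_metric_spec=genai_types.ComputationBasedMetricSpec(
                type=genai_types.ComputationBasedMetricType.ROUGE,
                parameters={'rouge_type': 'rougeL', 'use_stemmer': True},
            )
        ),
    ],
    output_config=genai_types.OutputConfig(
        gcs_destination=genai_types.GcsDestination(
            output_uri_prefix='gs://my-bucket/eval-output/'  # placeholder bucket
        )
    ),
)

tuning_job = client.tunings.tune(
    base_model='gemini-2.5-flash',
    training_dataset=genai_types.TuningDataset(
        gcs_uri='gs://my-bucket/sft_train_data.jsonl'  # placeholder dataset
    ),
    config=genai_types.CreateTuningJobConfig(
        tuned_model_display_name='tuning job with unified metric',
        evaluation_config=evaluation_config,
    ),
)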