From 9e71de87d6d19e80478aa641015fba89e75334ac Mon Sep 17 00:00:00 2001 From: Chibi Vikram Date: Mon, 22 Dec 2025 21:11:27 -0800 Subject: [PATCH 1/2] feat: support 'same-as-agent' model option for legacy evaluators MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for the 'same-as-agent' model configuration in legacy LLM-based evaluators. When an evaluator specifies 'same-as-agent' as its model, it now resolves to the actual model from agent.json settings instead of throwing an error. Changes: - Updated EvaluatorFactory to accept and pass agent_model parameter - Added _get_agent_model() method to runtime to load model from agent.json - Added logging for model resolution and evaluator creation - Fixed error message in trajectory evaluator (was incorrectly saying "LLM evaluator") 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- src/uipath/_cli/_evals/_evaluator_factory.py | 57 ++++++++++++++++---- src/uipath/_cli/_evals/_runtime.py | 31 ++++++++++- 2 files changed, 78 insertions(+), 10 deletions(-) diff --git a/src/uipath/_cli/_evals/_evaluator_factory.py b/src/uipath/_cli/_evals/_evaluator_factory.py index 15c54007e..253ce36d4 100644 --- a/src/uipath/_cli/_evals/_evaluator_factory.py +++ b/src/uipath/_cli/_evals/_evaluator_factory.py @@ -1,4 +1,5 @@ import importlib.util +import logging import sys from pathlib import Path from typing import Any @@ -68,6 +69,8 @@ ToolCallOutputEvaluatorConfig, ) +logger = logging.getLogger(__name__) + class EvaluatorFactory: """Factory class for creating evaluator instances based on configuration.""" @@ -106,12 +109,15 @@ def _prepare_evaluator_config(data: dict[str, Any]) -> dict[str, Any]: @classmethod def create_evaluator( - cls, data: dict[str, Any], evaluators_dir: Path | None = None + cls, + data: dict[str, Any], + evaluators_dir: Path | None = None, + agent_model: str | None = None, ) -> BaseEvaluator[Any, Any, Any]: if data.get("version", None) == "1.0": return cls._create_evaluator_internal(data, evaluators_dir) else: - return cls._create_legacy_evaluator_internal(data) + return cls._create_legacy_evaluator_internal(data, agent_model) @staticmethod def _create_evaluator_internal( @@ -371,11 +377,14 @@ def _create_llm_judge_simulation_trajectory_evaluator( @staticmethod def _create_legacy_evaluator_internal( data: dict[str, Any], + agent_model: str | None = None, ) -> LegacyBaseEvaluator[Any]: """Create an evaluator instance from configuration data. Args: data: Dictionary containing evaluator configuration from JSON file + agent_model: Optional model name from agent settings for resolving + 'same-as-agent' model configuration Returns: Appropriate evaluator instance based on category @@ -391,9 +400,13 @@ def _create_legacy_evaluator_internal( case JsonSimilarityEvaluatorParams(): return EvaluatorFactory._create_legacy_json_similarity_evaluator(params) case LLMEvaluatorParams(): - return EvaluatorFactory._create_legacy_llm_as_judge_evaluator(params) + return EvaluatorFactory._create_legacy_llm_as_judge_evaluator( + params, agent_model + ) case TrajectoryEvaluatorParams(): - return EvaluatorFactory._create_legacy_trajectory_evaluator(params) + return EvaluatorFactory._create_legacy_trajectory_evaluator( + params, agent_model + ) case _: raise ValueError(f"Unknown evaluator category: {params}") @@ -414,6 +427,7 @@ def _create_legacy_json_similarity_evaluator( @staticmethod def _create_legacy_llm_as_judge_evaluator( params: LLMEvaluatorParams, + agent_model: str | None = None, ) -> LegacyLlmAsAJudgeEvaluator: """Create an LLM-as-a-judge evaluator.""" if not params.prompt: @@ -421,26 +435,51 @@ def _create_legacy_llm_as_judge_evaluator( if not params.model: raise ValueError("LLM evaluator must include 'model' field") + + # Resolve 'same-as-agent' to actual agent model if params.model == "same-as-agent": - raise ValueError( - "'same-as-agent' model option is not supported by coded agents evaluations. Please select a specific model for the evaluator." + if not agent_model: + raise ValueError( + "'same-as-agent' model option requires agent settings. " + "Ensure agent.json contains valid model settings." + ) + logger.info( + f"Resolving 'same-as-agent' to agent model: {agent_model} " + f"for evaluator '{params.name}'" ) + params = params.model_copy(update={"model": agent_model}) + logger.info( + f"Creating LLM-as-judge evaluator '{params.name}' with model: {params.model}" + ) return LegacyLlmAsAJudgeEvaluator(**params.model_dump(), config={}) @staticmethod def _create_legacy_trajectory_evaluator( params: TrajectoryEvaluatorParams, + agent_model: str | None = None, ) -> LegacyTrajectoryEvaluator: """Create a trajectory evaluator.""" if not params.prompt: raise ValueError("Trajectory evaluator must include 'prompt' field") if not params.model: - raise ValueError("LLM evaluator must include 'model' field") + raise ValueError("Trajectory evaluator must include 'model' field") + + # Resolve 'same-as-agent' to actual agent model if params.model == "same-as-agent": - raise ValueError( - "'same-as-agent' model option is not supported by coded agents evaluations. Please select a specific model for the evaluator." + if not agent_model: + raise ValueError( + "'same-as-agent' model option requires agent settings. " + "Ensure agent.json contains valid model settings." + ) + logger.info( + f"Resolving 'same-as-agent' to agent model: {agent_model} " + f"for evaluator '{params.name}'" ) + params = params.model_copy(update={"model": agent_model}) + logger.info( + f"Creating trajectory evaluator '{params.name}' with model: {params.model}" + ) return LegacyTrajectoryEvaluator(**params.model_dump(), config={}) diff --git a/src/uipath/_cli/_evals/_runtime.py b/src/uipath/_cli/_evals/_runtime.py index 23751d266..17f95f1a0 100644 --- a/src/uipath/_cli/_evals/_runtime.py +++ b/src/uipath/_cli/_evals/_runtime.py @@ -601,6 +601,30 @@ async def run_evaluator( return result + def _get_agent_model(self) -> str | None: + """Load agent model from agent.json. + + Uses the entrypoint from context if available, otherwise falls back + to looking for agent.json in the current working directory. + + Returns: + The model name from agent settings, or None if not found. + """ + # Use entrypoint from context if available (handles explicit paths) + if self.context.entrypoint: + agent_json = Path(self.context.entrypoint) + else: + agent_json = Path.cwd() / "agent.json" + + if agent_json.exists(): + try: + with open(agent_json, "r", encoding="utf-8") as f: + data = json.load(f) + return data.get("settings", {}).get("model") + except (json.JSONDecodeError, OSError): + return None + return None + def _load_evaluators( self, evaluation_set: EvaluationSet ) -> list[BaseEvaluator[Any, Any, Any]]: @@ -611,6 +635,9 @@ def _load_evaluators( raise ValueError("eval_set cannot be None") evaluators_dir = Path(eval_set).parent.parent / "evaluators" + # Load agent model for 'same-as-agent' resolution in legacy evaluators + agent_model = self._get_agent_model() + # If evaluatorConfigs is specified, use that (new field with weights) # Otherwise, fall back to evaluatorRefs (old field without weights) if ( @@ -638,7 +665,9 @@ def _load_evaluators( try: evaluator_id = data.get("id") if evaluator_id in evaluator_ref_ids: - evaluator = EvaluatorFactory.create_evaluator(data, evaluators_dir) + evaluator = EvaluatorFactory.create_evaluator( + data, evaluators_dir, agent_model=agent_model + ) evaluators.append(evaluator) found_evaluator_ids.add(evaluator_id) except Exception as e: From 6beb0e0a18e0e2df754bdbe78c9ea0c03a7032e7 Mon Sep 17 00:00:00 2001 From: Chibi Vikram Date: Mon, 22 Dec 2025 23:43:00 -0800 Subject: [PATCH 2/2] feat: add LLMAgentFactoryProtocol for model resolution MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements the Protocol-based approach for getting agent model: - Adds LLMAgentFactoryProtocol with get_agent_model() method - Updates _get_agent_model() to check if factory implements protocol - Falls back to file-based approach if protocol not implemented This allows runtime factories to provide agent model information directly, enabling cleaner 'same-as-agent' resolution for evaluators. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- src/uipath/_cli/_evals/_runtime.py | 53 ++++++++++++++++++++++++++---- 1 file changed, 47 insertions(+), 6 deletions(-) diff --git a/src/uipath/_cli/_evals/_runtime.py b/src/uipath/_cli/_evals/_runtime.py index 17f95f1a0..f93f8d49a 100644 --- a/src/uipath/_cli/_evals/_runtime.py +++ b/src/uipath/_cli/_evals/_runtime.py @@ -5,7 +5,16 @@ from contextlib import contextmanager from pathlib import Path from time import time -from typing import Any, Awaitable, Iterable, Iterator, Sequence, Tuple +from typing import ( + Any, + Awaitable, + Iterable, + Iterator, + Protocol, + Sequence, + Tuple, + runtime_checkable, +) import coverage from opentelemetry import context as context_api @@ -67,6 +76,27 @@ set_execution_context, ) +logger = logging.getLogger(__name__) + + +@runtime_checkable +class LLMAgentFactoryProtocol(Protocol): + """Protocol for factories that can provide agent model information. + + Runtime factories that implement this protocol can be queried for + the agent's configured LLM model, enabling features like 'same-as-agent' + model resolution for evaluators. + """ + + def get_agent_model(self) -> str | None: + """Return the agent's configured LLM model name. + + Returns: + The model name from agent settings (e.g., 'gpt-4o-2024-11-20'), + or None if no model is configured. + """ + ... + class ExecutionSpanExporter(SpanExporter): """Custom exporter that stores spans grouped by execution ids.""" @@ -602,15 +632,23 @@ async def run_evaluator( return result def _get_agent_model(self) -> str | None: - """Load agent model from agent.json. + """Get agent model from factory or agent.json fallback. - Uses the entrypoint from context if available, otherwise falls back - to looking for agent.json in the current working directory. + First checks if the runtime factory implements LLMAgentFactoryProtocol + and can provide the model directly. Falls back to reading agent.json + from disk if the protocol is not implemented. Returns: The model name from agent settings, or None if not found. """ - # Use entrypoint from context if available (handles explicit paths) + # Prefer getting model from factory if it implements the protocol + if isinstance(self.factory, LLMAgentFactoryProtocol): + model = self.factory.get_agent_model() + if model: + logger.debug(f"Got agent model from factory: {model}") + return model + + # Fallback: read from agent.json file if self.context.entrypoint: agent_json = Path(self.context.entrypoint) else: @@ -620,7 +658,10 @@ def _get_agent_model(self) -> str | None: try: with open(agent_json, "r", encoding="utf-8") as f: data = json.load(f) - return data.get("settings", {}).get("model") + model = data.get("settings", {}).get("model") + if model: + logger.debug(f"Got agent model from file: {model}") + return model except (json.JSONDecodeError, OSError): return None return None