From 9e71de87d6d19e80478aa641015fba89e75334ac Mon Sep 17 00:00:00 2001
From: Chibi Vikram <chibivikram@gmail.com>
Date: Mon, 22 Dec 2025 21:11:27 -0800
Subject: [PATCH 1/2] feat: support 'same-as-agent' model option for legacy
 evaluators
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add support for the 'same-as-agent' model configuration in legacy LLM-based
evaluators. When an evaluator specifies 'same-as-agent' as its model, it now
resolves to the model configured under settings.model in agent.json instead
of always raising an error. A ValueError is still raised when no agent model
can be resolved.

Changes:
- Updated EvaluatorFactory to accept and pass agent_model parameter
- Added _get_agent_model() method to runtime to load model from agent.json
- Added logging for model resolution and evaluator creation
- Fixed error message in trajectory evaluator (was incorrectly saying "LLM evaluator")

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 src/uipath/_cli/_evals/_evaluator_factory.py | 57 ++++++++++++++++----
 src/uipath/_cli/_evals/_runtime.py           | 31 ++++++++++-
 2 files changed, 78 insertions(+), 10 deletions(-)

diff --git a/src/uipath/_cli/_evals/_evaluator_factory.py b/src/uipath/_cli/_evals/_evaluator_factory.py
index 15c54007e..253ce36d4 100644
--- a/src/uipath/_cli/_evals/_evaluator_factory.py
+++ b/src/uipath/_cli/_evals/_evaluator_factory.py
@@ -1,4 +1,5 @@
 import importlib.util
+import logging
 import sys
 from pathlib import Path
 from typing import Any
@@ -68,6 +69,8 @@
     ToolCallOutputEvaluatorConfig,
 )
 
+logger = logging.getLogger(__name__)
+
 
 class EvaluatorFactory:
     """Factory class for creating evaluator instances based on configuration."""
@@ -106,12 +109,15 @@ def _prepare_evaluator_config(data: dict[str, Any]) -> dict[str, Any]:
 
     @classmethod
     def create_evaluator(
-        cls, data: dict[str, Any], evaluators_dir: Path | None = None
+        cls,
+        data: dict[str, Any],
+        evaluators_dir: Path | None = None,
+        agent_model: str | None = None,
     ) -> BaseEvaluator[Any, Any, Any]:
         if data.get("version", None) == "1.0":
             return cls._create_evaluator_internal(data, evaluators_dir)
         else:
-            return cls._create_legacy_evaluator_internal(data)
+            return cls._create_legacy_evaluator_internal(data, agent_model)
 
     @staticmethod
     def _create_evaluator_internal(
@@ -371,11 +377,14 @@ def _create_llm_judge_simulation_trajectory_evaluator(
     @staticmethod
     def _create_legacy_evaluator_internal(
         data: dict[str, Any],
+        agent_model: str | None = None,
     ) -> LegacyBaseEvaluator[Any]:
         """Create an evaluator instance from configuration data.
 
         Args:
             data: Dictionary containing evaluator configuration from JSON file
+            agent_model: Optional model name from agent settings for resolving
+                'same-as-agent' model configuration
 
         Returns:
             Appropriate evaluator instance based on category
@@ -391,9 +400,13 @@ def _create_legacy_evaluator_internal(
             case JsonSimilarityEvaluatorParams():
                 return EvaluatorFactory._create_legacy_json_similarity_evaluator(params)
             case LLMEvaluatorParams():
-                return EvaluatorFactory._create_legacy_llm_as_judge_evaluator(params)
+                return EvaluatorFactory._create_legacy_llm_as_judge_evaluator(
+                    params, agent_model
+                )
             case TrajectoryEvaluatorParams():
-                return EvaluatorFactory._create_legacy_trajectory_evaluator(params)
+                return EvaluatorFactory._create_legacy_trajectory_evaluator(
+                    params, agent_model
+                )
             case _:
                 raise ValueError(f"Unknown evaluator category: {params}")
 
@@ -414,6 +427,7 @@ def _create_legacy_json_similarity_evaluator(
     @staticmethod
     def _create_legacy_llm_as_judge_evaluator(
         params: LLMEvaluatorParams,
+        agent_model: str | None = None,
     ) -> LegacyLlmAsAJudgeEvaluator:
         """Create an LLM-as-a-judge evaluator."""
         if not params.prompt:
@@ -421,26 +435,51 @@ def _create_legacy_llm_as_judge_evaluator(
 
         if not params.model:
             raise ValueError("LLM evaluator must include 'model' field")
+
+        # Resolve 'same-as-agent' to actual agent model
         if params.model == "same-as-agent":
-            raise ValueError(
-                "'same-as-agent' model option is not supported by coded agents evaluations. Please select a specific model for the evaluator."
+            if not agent_model:
+                raise ValueError(
+                    "'same-as-agent' model option requires agent settings. "
+                    "Ensure agent.json contains valid model settings."
+                )
+            logger.info(
+                f"Resolving 'same-as-agent' to agent model: {agent_model} "
+                f"for evaluator '{params.name}'"
             )
+            params = params.model_copy(update={"model": agent_model})
 
+        logger.info(
+            f"Creating LLM-as-judge evaluator '{params.name}' with model: {params.model}"
+        )
         return LegacyLlmAsAJudgeEvaluator(**params.model_dump(), config={})
 
     @staticmethod
     def _create_legacy_trajectory_evaluator(
         params: TrajectoryEvaluatorParams,
+        agent_model: str | None = None,
     ) -> LegacyTrajectoryEvaluator:
         """Create a trajectory evaluator."""
         if not params.prompt:
             raise ValueError("Trajectory evaluator must include 'prompt' field")
 
         if not params.model:
-            raise ValueError("LLM evaluator must include 'model' field")
+            raise ValueError("Trajectory evaluator must include 'model' field")
+
+        # Resolve 'same-as-agent' to actual agent model
         if params.model == "same-as-agent":
-            raise ValueError(
-                "'same-as-agent' model option is not supported by coded agents evaluations. Please select a specific model for the evaluator."
+            if not agent_model:
+                raise ValueError(
+                    "'same-as-agent' model option requires agent settings. "
+                    "Ensure agent.json contains valid model settings."
+                )
+            logger.info(
+                f"Resolving 'same-as-agent' to agent model: {agent_model} "
+                f"for evaluator '{params.name}'"
             )
+            params = params.model_copy(update={"model": agent_model})
 
+        logger.info(
+            f"Creating trajectory evaluator '{params.name}' with model: {params.model}"
+        )
         return LegacyTrajectoryEvaluator(**params.model_dump(), config={})
diff --git a/src/uipath/_cli/_evals/_runtime.py b/src/uipath/_cli/_evals/_runtime.py
index 23751d266..17f95f1a0 100644
--- a/src/uipath/_cli/_evals/_runtime.py
+++ b/src/uipath/_cli/_evals/_runtime.py
@@ -601,6 +601,30 @@ async def run_evaluator(
 
         return result
 
+    def _get_agent_model(self) -> str | None:
+        """Load agent model from agent.json.
+
+        Uses the entrypoint from context if available, otherwise falls back
+        to looking for agent.json in the current working directory.
+
+        Returns:
+            The model name from agent settings, or None if not found.
+        """
+        # Use entrypoint from context if available (handles explicit paths)
+        if self.context.entrypoint:
+            agent_json = Path(self.context.entrypoint)
+        else:
+            agent_json = Path.cwd() / "agent.json"
+
+        if agent_json.exists():
+            try:
+                with open(agent_json, "r", encoding="utf-8") as f:
+                    data = json.load(f)
+                return data.get("settings", {}).get("model")
+            except (json.JSONDecodeError, OSError):
+                return None
+        return None
+
     def _load_evaluators(
         self, evaluation_set: EvaluationSet
     ) -> list[BaseEvaluator[Any, Any, Any]]:
@@ -611,6 +635,9 @@ def _load_evaluators(
             raise ValueError("eval_set cannot be None")
         evaluators_dir = Path(eval_set).parent.parent / "evaluators"
 
+        # Load agent model for 'same-as-agent' resolution in legacy evaluators
+        agent_model = self._get_agent_model()
+
         # If evaluatorConfigs is specified, use that (new field with weights)
         # Otherwise, fall back to evaluatorRefs (old field without weights)
         if (
@@ -638,7 +665,9 @@ def _load_evaluators(
             try:
                 evaluator_id = data.get("id")
                 if evaluator_id in evaluator_ref_ids:
-                    evaluator = EvaluatorFactory.create_evaluator(data, evaluators_dir)
+                    evaluator = EvaluatorFactory.create_evaluator(
+                        data, evaluators_dir, agent_model=agent_model
+                    )
                     evaluators.append(evaluator)
                     found_evaluator_ids.add(evaluator_id)
             except Exception as e:

From 6beb0e0a18e0e2df754bdbe78c9ea0c03a7032e7 Mon Sep 17 00:00:00 2001
From: Chibi Vikram <chibivikram@gmail.com>
Date: Mon, 22 Dec 2025 23:43:00 -0800
Subject: [PATCH 2/2] feat: add LLMAgentFactoryProtocol for model resolution
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implements the Protocol-based approach for getting the agent model:
- Adds LLMAgentFactoryProtocol with a get_agent_model() method
- Updates _get_agent_model() to check whether the factory implements the
  protocol
- Falls back to the file-based approach if the protocol is not implemented
  or the factory returns no model

This allows runtime factories to provide agent model information
directly, enabling cleaner 'same-as-agent' resolution for evaluators.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 src/uipath/_cli/_evals/_runtime.py | 53 ++++++++++++++++++++++++++----
 1 file changed, 47 insertions(+), 6 deletions(-)

diff --git a/src/uipath/_cli/_evals/_runtime.py b/src/uipath/_cli/_evals/_runtime.py
index 17f95f1a0..f93f8d49a 100644
--- a/src/uipath/_cli/_evals/_runtime.py
+++ b/src/uipath/_cli/_evals/_runtime.py
@@ -5,7 +5,16 @@
 from contextlib import contextmanager
 from pathlib import Path
 from time import time
-from typing import Any, Awaitable, Iterable, Iterator, Sequence, Tuple
+from typing import (
+    Any,
+    Awaitable,
+    Iterable,
+    Iterator,
+    Protocol,
+    Sequence,
+    Tuple,
+    runtime_checkable,
+)
 
 import coverage
 from opentelemetry import context as context_api
@@ -67,6 +76,27 @@
     set_execution_context,
 )
 
+logger = logging.getLogger(__name__)
+
+
+@runtime_checkable
+class LLMAgentFactoryProtocol(Protocol):
+    """Protocol for factories that can provide agent model information.
+
+    Runtime factories that implement this protocol can be queried for
+    the agent's configured LLM model, enabling features like 'same-as-agent'
+    model resolution for evaluators.
+    """
+
+    def get_agent_model(self) -> str | None:
+        """Return the agent's configured LLM model name.
+
+        Returns:
+            The model name from agent settings (e.g., 'gpt-4o-2024-11-20'),
+            or None if no model is configured.
+        """
+        ...
+
 
 class ExecutionSpanExporter(SpanExporter):
     """Custom exporter that stores spans grouped by execution ids."""
@@ -602,15 +632,23 @@ async def run_evaluator(
         return result
 
     def _get_agent_model(self) -> str | None:
-        """Load agent model from agent.json.
+        """Get agent model from factory or agent.json fallback.
 
-        Uses the entrypoint from context if available, otherwise falls back
-        to looking for agent.json in the current working directory.
+        First checks if the runtime factory implements LLMAgentFactoryProtocol
+        and can provide the model directly. Falls back to reading agent.json from
+        disk if the protocol is not implemented or the factory returns no model.
 
         Returns:
             The model name from agent settings, or None if not found.
         """
-        # Use entrypoint from context if available (handles explicit paths)
+        # Prefer getting model from factory if it implements the protocol
+        if isinstance(self.factory, LLMAgentFactoryProtocol):
+            model = self.factory.get_agent_model()
+            if model:
+                logger.debug(f"Got agent model from factory: {model}")
+                return model
+
+        # Fallback: read from agent.json file
         if self.context.entrypoint:
             agent_json = Path(self.context.entrypoint)
         else:
@@ -620,7 +658,10 @@ def _get_agent_model(self) -> str | None:
             try:
                 with open(agent_json, "r", encoding="utf-8") as f:
                     data = json.load(f)
-                return data.get("settings", {}).get("model")
+                model = data.get("settings", {}).get("model")
+                if model:
+                    logger.debug(f"Got agent model from file: {model}")
+                return model
             except (json.JSONDecodeError, OSError):
                 return None
         return None