57 changes: 48 additions & 9 deletions src/uipath/_cli/_evals/_evaluator_factory.py
@@ -1,4 +1,5 @@
 import importlib.util
+import logging
 import sys
 from pathlib import Path
 from typing import Any
@@ -68,6 +69,8 @@
     ToolCallOutputEvaluatorConfig,
 )
 
+logger = logging.getLogger(__name__)
+
 
 class EvaluatorFactory:
     """Factory class for creating evaluator instances based on configuration."""
@@ -106,12 +109,15 @@ def _prepare_evaluator_config(data: dict[str, Any]) -> dict[str, Any]:
 
     @classmethod
     def create_evaluator(
-        cls, data: dict[str, Any], evaluators_dir: Path | None = None
+        cls,
+        data: dict[str, Any],
+        evaluators_dir: Path | None = None,
+        agent_model: str | None = None,
Reviewer comment (Contributor):
nit: I would suggest passing in an agent_model_settings object instead, so that @mathurk @AAgnihotry do not have to do double work :)
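As a hedged sketch of the reviewer's suggestion: the factory would take a settings object rather than a bare model string. The AgentModelSettings shape below is assumed for illustration; no such type exists in this PR.

```python
# Hypothetical sketch of the reviewer's suggestion; AgentModelSettings is
# an assumed shape, not a type defined in this PR.
from dataclasses import dataclass
from typing import Any


@dataclass
class AgentModelSettings:
    model: str                     # e.g. "gpt-4o-2024-11-20"
    max_tokens: int | None = None  # further fields as agent settings grow


def create_evaluator(
    data: dict[str, Any],
    agent_model_settings: AgentModelSettings | None = None,
) -> Any:
    # Evaluators could consume the full settings object, so callers no
    # longer unpack .model by hand at every call site.
    model = agent_model_settings.model if agent_model_settings else None
    ...
```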

     ) -> BaseEvaluator[Any, Any, Any]:
         if data.get("version", None) == "1.0":
             return cls._create_evaluator_internal(data, evaluators_dir)
         else:
-            return cls._create_legacy_evaluator_internal(data)
+            return cls._create_legacy_evaluator_internal(data, agent_model)
 
     @staticmethod
     def _create_evaluator_internal(
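To make the dispatch concrete: a config dict whose `version` is `"1.0"` takes the new-style path, while anything else falls through to the legacy path, which is where `agent_model` is consumed. A small usage sketch; both config dicts are illustrative, not taken from a real eval set.

```python
# Illustrative configs; real evaluator JSON carries more fields.
new_style = {"version": "1.0", "id": "exactness"}
legacy = {"id": "judge", "model": "same-as-agent", "prompt": "Grade 0-100."}

# version == "1.0"  -> _create_evaluator_internal(data, evaluators_dir)
# anything else     -> _create_legacy_evaluator_internal(data, agent_model)
evaluator = EvaluatorFactory.create_evaluator(
    legacy, agent_model="gpt-4o-2024-11-20"
)
```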
@@ -371,11 +377,14 @@ def _create_llm_judge_simulation_trajectory_evaluator(
     @staticmethod
     def _create_legacy_evaluator_internal(
         data: dict[str, Any],
+        agent_model: str | None = None,
     ) -> LegacyBaseEvaluator[Any]:
         """Create an evaluator instance from configuration data.
 
         Args:
             data: Dictionary containing evaluator configuration from JSON file
+            agent_model: Optional model name from agent settings for resolving
+                'same-as-agent' model configuration
 
         Returns:
             Appropriate evaluator instance based on category
@@ -391,9 +400,13 @@ def _create_legacy_evaluator_internal(
             case JsonSimilarityEvaluatorParams():
                 return EvaluatorFactory._create_legacy_json_similarity_evaluator(params)
             case LLMEvaluatorParams():
-                return EvaluatorFactory._create_legacy_llm_as_judge_evaluator(params)
+                return EvaluatorFactory._create_legacy_llm_as_judge_evaluator(
+                    params, agent_model
+                )
             case TrajectoryEvaluatorParams():
-                return EvaluatorFactory._create_legacy_trajectory_evaluator(params)
+                return EvaluatorFactory._create_legacy_trajectory_evaluator(
+                    params, agent_model
+                )
             case _:
                 raise ValueError(f"Unknown evaluator category: {params}")
 
@@ -414,33 +427,59 @@ def _create_legacy_json_similarity_evaluator(
     @staticmethod
     def _create_legacy_llm_as_judge_evaluator(
         params: LLMEvaluatorParams,
+        agent_model: str | None = None,
     ) -> LegacyLlmAsAJudgeEvaluator:
         """Create an LLM-as-a-judge evaluator."""
         if not params.prompt:
             raise ValueError("LLM evaluator must include 'prompt' field")
 
         if not params.model:
             raise ValueError("LLM evaluator must include 'model' field")
 
+        # Resolve 'same-as-agent' to actual agent model
         if params.model == "same-as-agent":
-            raise ValueError(
-                "'same-as-agent' model option is not supported by coded agents evaluations. Please select a specific model for the evaluator."
-            )
+            if not agent_model:
+                raise ValueError(
+                    "'same-as-agent' model option requires agent settings. "
+                    "Ensure agent.json contains valid model settings."
+                )
+            logger.info(
+                f"Resolving 'same-as-agent' to agent model: {agent_model} "
+                f"for evaluator '{params.name}'"
+            )
+            params = params.model_copy(update={"model": agent_model})
 
+        logger.info(
+            f"Creating LLM-as-judge evaluator '{params.name}' with model: {params.model}"
+        )
         return LegacyLlmAsAJudgeEvaluator(**params.model_dump(), config={})
 
     @staticmethod
     def _create_legacy_trajectory_evaluator(
         params: TrajectoryEvaluatorParams,
+        agent_model: str | None = None,
     ) -> LegacyTrajectoryEvaluator:
         """Create a trajectory evaluator."""
         if not params.prompt:
             raise ValueError("Trajectory evaluator must include 'prompt' field")
 
         if not params.model:
-            raise ValueError("LLM evaluator must include 'model' field")
+            raise ValueError("Trajectory evaluator must include 'model' field")
 
+        # Resolve 'same-as-agent' to actual agent model
        if params.model == "same-as-agent":
-            raise ValueError(
-                "'same-as-agent' model option is not supported by coded agents evaluations. Please select a specific model for the evaluator."
-            )
+            if not agent_model:
+                raise ValueError(
+                    "'same-as-agent' model option requires agent settings. "
+                    "Ensure agent.json contains valid model settings."
+                )
+            logger.info(
+                f"Resolving 'same-as-agent' to agent model: {agent_model} "
+                f"for evaluator '{params.name}'"
+            )
+            params = params.model_copy(update={"model": agent_model})
 
+        logger.info(
+            f"Creating trajectory evaluator '{params.name}' with model: {params.model}"
+        )
         return LegacyTrajectoryEvaluator(**params.model_dump(), config={})
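Taken together, the two legacy constructors now handle 'same-as-agent' the same way: resolve when an agent model is available, fail fast otherwise. A hedged usage sketch of the new behavior; the model name and params construction details are illustrative.

```python
# Illustrative sketch; LLMEvaluatorParams may require additional fields.
params = LLMEvaluatorParams(
    name="accuracy-judge",
    model="same-as-agent",
    prompt="Score the answer from 0 to 100.",
)

# Before this change: unconditional ValueError for "same-as-agent".
# After: resolved against the agent's own model when one is supplied.
ev = EvaluatorFactory._create_legacy_llm_as_judge_evaluator(
    params, agent_model="gpt-4o-2024-11-20"
)

# Without an agent model it still fails fast:
#   EvaluatorFactory._create_legacy_llm_as_judge_evaluator(params)
#   -> ValueError: 'same-as-agent' model option requires agent settings. ...
```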
74 changes: 72 additions & 2 deletions src/uipath/_cli/_evals/_runtime.py
@@ -5,7 +5,16 @@
 from contextlib import contextmanager
 from pathlib import Path
 from time import time
-from typing import Any, Awaitable, Iterable, Iterator, Sequence, Tuple
+from typing import (
+    Any,
+    Awaitable,
+    Iterable,
+    Iterator,
+    Protocol,
+    Sequence,
+    Tuple,
+    runtime_checkable,
+)
 
 import coverage
 from opentelemetry import context as context_api
@@ -67,6 +76,27 @@
     set_execution_context,
 )
 
+logger = logging.getLogger(__name__)
+
+
+@runtime_checkable
+class LLMAgentFactoryProtocol(Protocol):
+    """Protocol for factories that can provide agent model information.
+
+    Runtime factories that implement this protocol can be queried for
+    the agent's configured LLM model, enabling features like 'same-as-agent'
+    model resolution for evaluators.
+    """
+
+    def get_agent_model(self) -> str | None:
+        """Return the agent's configured LLM model name.
+
+        Returns:
+            The model name from agent settings (e.g., 'gpt-4o-2024-11-20'),
+            or None if no model is configured.
+        """
+        ...
+
+
 class ExecutionSpanExporter(SpanExporter):
     """Custom exporter that stores spans grouped by execution ids."""
@@ -601,6 +631,41 @@ async def run_evaluator(
 
         return result
 
+    def _get_agent_model(self) -> str | None:
Reviewer comment (Contributor):
what happens when we specify custom model settings? cc @mathurk @AAgnihotry

"""Get agent model from factory or agent.json fallback.

First checks if the runtime factory implements LLMAgentFactoryProtocol
and can provide the model directly. Falls back to reading agent.json
from disk if the protocol is not implemented.

Returns:
The model name from agent settings, or None if not found.
"""
# Prefer getting model from factory if it implements the protocol
if isinstance(self.factory, LLMAgentFactoryProtocol):
model = self.factory.get_agent_model()
if model:
logger.debug(f"Got agent model from factory: {model}")
return model

# Fallback: read from agent.json file
if self.context.entrypoint:
agent_json = Path(self.context.entrypoint)
else:
agent_json = Path.cwd() / "agent.json"

if agent_json.exists():
try:
with open(agent_json, "r", encoding="utf-8") as f:
data = json.load(f)
model = data.get("settings", {}).get("model")
if model:
logger.debug(f"Got agent model from file: {model}")
return model
except (json.JSONDecodeError, OSError):
return None
return None

     def _load_evaluators(
         self, evaluation_set: EvaluationSet
     ) -> list[BaseEvaluator[Any, Any, Any]]:
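The file fallback in `_get_agent_model` looks the model up under a top-level `settings.model` key in agent.json. A minimal sketch of a file that would satisfy that lookup; real agent.json files carry more fields.

```python
# Minimal agent.json shape accepted by the fallback; real files have more keys.
import json
from pathlib import Path

Path("agent.json").write_text(
    json.dumps({"settings": {"model": "gpt-4o-2024-11-20"}}),
    encoding="utf-8",
)

# Mirrors the lookup performed by _get_agent_model:
data = json.loads(Path("agent.json").read_text(encoding="utf-8"))
assert data.get("settings", {}).get("model") == "gpt-4o-2024-11-20"
```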
@@ -611,6 +676,9 @@ def _load_evaluators(
             raise ValueError("eval_set cannot be None")
         evaluators_dir = Path(eval_set).parent.parent / "evaluators"
 
+        # Load agent model for 'same-as-agent' resolution in legacy evaluators
+        agent_model = self._get_agent_model()
+
         # If evaluatorConfigs is specified, use that (new field with weights)
         # Otherwise, fall back to evaluatorRefs (old field without weights)
         if (
@@ -638,7 +706,9 @@
            try:
                evaluator_id = data.get("id")
                if evaluator_id in evaluator_ref_ids:
-                    evaluator = EvaluatorFactory.create_evaluator(data, evaluators_dir)
+                    evaluator = EvaluatorFactory.create_evaluator(
+                        data, evaluators_dir, agent_model=agent_model
+                    )
                    evaluators.append(evaluator)
                    found_evaluator_ids.add(evaluator_id)
            except Exception as e: