diff --git a/hello b/hello new file mode 100644 index 000000000..f0ad0dec6 --- /dev/null +++ b/hello @@ -0,0 +1 @@ +hello.txt diff --git a/src/uipath/_cli/_evals/_console_progress_reporter.py b/src/uipath/_cli/_evals/_console_progress_reporter.py index 5d1d17f38..1f3504b0f 100644 --- a/src/uipath/_cli/_evals/_console_progress_reporter.py +++ b/src/uipath/_cli/_evals/_console_progress_reporter.py @@ -29,6 +29,7 @@ def __init__(self): self.evaluators: Dict[str, AnyEvaluator] = {} self.display_started = False self.eval_results_by_name: Dict[str, list[Any]] = {} + self.evaluator_weights: Dict[str, float] = {} def _convert_score_to_numeric(self, eval_result) -> float: """Convert evaluation result score to numeric value.""" @@ -99,6 +100,8 @@ async def handle_create_eval_set_run(self, payload: EvalSetRunCreatedEvent) -> N """Handle evaluation set run creation.""" try: self.evaluators = {eval.id: eval for eval in payload.evaluators} + if payload.evaluator_weights: + self.evaluator_weights = payload.evaluator_weights except Exception as e: logger.error(f"Failed to handle create eval set run event: {e}") @@ -206,9 +209,20 @@ def display_final_results(self): summary_table.add_row(*row_values) - # Add separator row before average + # Add separator row before weights and average summary_table.add_section() + # Add weights row if weights are defined + if self.evaluator_weights: + weight_row_values = ["[bold]Weights[/bold]"] + for evaluator_id in evaluator_ids: + weight = self.evaluator_weights.get(evaluator_id, "-") + if weight != "-": + weight_row_values.append(f"[bold]{weight:.1f}[/bold]") + else: + weight_row_values.append("[bold]-[/bold]") + summary_table.add_row(*weight_row_values) + # Add average row avg_row_values = ["[bold]Average[/bold]"] for evaluator_id in evaluator_ids: @@ -217,8 +231,31 @@ def display_final_results(self): summary_table.add_row(*avg_row_values) - self.console.print(summary_table) - self.console.print() + # Calculate and display weighted final score if weights are defined + if self.evaluator_weights: + weighted_total = 0.0 + weights_sum = 0.0 + for evaluator_id in evaluator_ids: + weight = self.evaluator_weights.get(evaluator_id) + if weight is not None: + avg_score = self.final_results[evaluator_id] + weighted_total += weight * avg_score + weights_sum += weight + + # Display as a separate info line + self.console.print(summary_table) + self.console.print() + self.console.print( + f"[bold cyan]Weighted Final Score:[/bold cyan] [bold green]{weighted_total:.2f}[/bold green]" + ) + if weights_sum != 1.0: + self.console.print( + f"[dim](Note: Weights sum to {weights_sum:.2f})[/dim]" + ) + self.console.print() + else: + self.console.print(summary_table) + self.console.print() else: self.console.print( "→ [bold green]All evaluations completed successfully![/bold green]" diff --git a/src/uipath/_cli/_evals/_models/_evaluation_set.py b/src/uipath/_cli/_evals/_models/_evaluation_set.py index 137f4f06e..e81c46d77 100644 --- a/src/uipath/_cli/_evals/_models/_evaluation_set.py +++ b/src/uipath/_cli/_evals/_models/_evaluation_set.py @@ -160,6 +160,9 @@ class EvaluationSet(BaseModel): version: Literal["1.0"] = "1.0" evaluator_refs: List[str] = Field(default_factory=list) evaluations: List[EvaluationItem] = Field(default_factory=list) + evaluator_weights: Optional[Dict[str, float]] = Field( + default=None, alias="evaluatorWeights" + ) def extract_selected_evals(self, eval_ids) -> None: selected_evals: list[EvaluationItem] = [] diff --git a/src/uipath/_cli/_evals/_models/_output.py 
b/src/uipath/_cli/_evals/_models/_output.py index c3ba1c728..28aa7f42e 100644 --- a/src/uipath/_cli/_evals/_models/_output.py +++ b/src/uipath/_cli/_evals/_models/_output.py @@ -46,7 +46,7 @@ class EvaluationResultDto(BaseModel): model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True) score: float - details: Optional[str | BaseModel] = None + details: Optional[str | Dict[str, Any] | BaseModel] = None evaluation_time: Optional[float] = None @model_serializer(mode="wrap") @@ -56,6 +56,7 @@ def serialize_model( info: core_schema.SerializationInfo, ) -> Any: data = serializer(self) + # Only remove details if it's None, keep empty dicts and populated dicts if self.details is None and isinstance(data, dict): data.pop("details", None) return data @@ -85,6 +86,8 @@ class EvaluationRunResultDto(BaseModel): evaluator_name: str evaluator_id: str + evaluator_type: Optional[str] = None + node_id: Optional[str] = None result: EvaluationResultDto @@ -93,6 +96,7 @@ class EvaluationRunResult(BaseModel): evaluation_name: str evaluation_run_results: List[EvaluationRunResultDto] + workflow: Optional[List[str]] = None agent_execution_output: Optional[UiPathSerializableEvalRunExecutionOutput] = None @property @@ -110,6 +114,8 @@ class UiPathEvalOutput(BaseModel): evaluation_set_name: str evaluation_set_results: List[EvaluationRunResult] + weighted_final_score: Optional[float] = None + evaluator_weights: Optional[Dict[str, float]] = None @property def score(self) -> float: diff --git a/src/uipath/_cli/_evals/_progress_reporter.py b/src/uipath/_cli/_evals/_progress_reporter.py index 5c0516e9b..efb6d5a0e 100644 --- a/src/uipath/_cli/_evals/_progress_reporter.py +++ b/src/uipath/_cli/_evals/_progress_reporter.py @@ -329,10 +329,11 @@ async def update_eval_set_run( eval_set_run_id: str, evaluator_scores: dict[str, float], is_coded: bool = False, + weighted_final_score: float | None = None, ): """Update the evaluation set run status to complete.""" spec = self._update_eval_set_run_spec( - eval_set_run_id, evaluator_scores, is_coded + eval_set_run_id, evaluator_scores, is_coded, weighted_final_score ) await self._client.request_async( method=spec.method, @@ -452,6 +453,7 @@ async def handle_update_eval_set_run(self, payload: EvalSetRunUpdatedEvent) -> N eval_set_run_id, payload.evaluator_scores, is_coded=is_coded, + weighted_final_score=payload.weighted_final_score, ) logger.debug( f"Updated eval set run with ID: {eval_set_run_id} (coded={is_coded})" @@ -797,6 +799,7 @@ def _update_eval_set_run_spec( eval_set_run_id: str, evaluator_scores: dict[str, float], is_coded: bool = False, + weighted_final_score: float | None = None, ) -> RequestSpec: # Legacy API expects evaluatorId as GUID, coded accepts string evaluator_scores_list = [] @@ -820,16 +823,24 @@ def _update_eval_set_run_spec( # For legacy evaluations, endpoint is without /coded endpoint_suffix = "coded/" if is_coded else "" + + # Build the JSON payload + json_payload = { + "evalSetRunId": eval_set_run_id, + "status": EvaluationStatus.COMPLETED.value, + "evaluatorScores": evaluator_scores_list, + } + + # Add weighted final score if available + if weighted_final_score is not None: + json_payload["weightedFinalScore"] = weighted_final_score + return RequestSpec( method="PUT", endpoint=Endpoint( f"{self._get_endpoint_prefix()}execution/agents/{self._project_id}/{endpoint_suffix}evalSetRun" ), - json={ - "evalSetRunId": eval_set_run_id, - "status": EvaluationStatus.COMPLETED.value, - "evaluatorScores": evaluator_scores_list, - }, + 
json=json_payload, headers=self._tenant_header(), ) diff --git a/src/uipath/_cli/_evals/_runtime.py b/src/uipath/_cli/_evals/_runtime.py index b9a7e7d6f..5ec936808 100644 --- a/src/uipath/_cli/_evals/_runtime.py +++ b/src/uipath/_cli/_evals/_runtime.py @@ -12,9 +12,7 @@ from opentelemetry.sdk.trace import ReadableSpan, Span from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult -from uipath._cli._evals.mocks.input_mocker import ( - generate_llm_input, -) +from uipath._cli._evals.mocks.input_mocker import generate_llm_input from ..._events._event_bus import EventBus from ..._events._events import ( @@ -59,15 +57,115 @@ convert_eval_execution_output_to_serializable, ) from ._span_collection import ExecutionSpanCollector -from .mocks.mocks import ( - clear_execution_context, - set_execution_context, -) +from .mocks.mocks import clear_execution_context, set_execution_context T = TypeVar("T", bound=UiPathBaseRuntime) C = TypeVar("C", bound=UiPathRuntimeContext) +def extract_workflow_from_spans(spans: list[ReadableSpan]) -> list[str]: + """Extract ordered list of main workflow nodes from execution spans. + + Only captures workflow nodes that are direct children of a LangGraph parent span, + which naturally filters out sub-nodes and internal components. + + Args: + spans: List of ReadableSpan objects from agent execution + + Returns: + List of unique main node names in execution order + """ + + for i, span in enumerate(spans): + span_name = getattr(span, "name", "NO_NAME") + attributes = getattr(span, "attributes", {}) + parent_context = getattr(span, "parent", None) + parent_span_id = None + if parent_context: + parent_span_id = getattr(parent_context, "span_id", None) + + span_context = span.get_span_context() + span_id = span_context.span_id if span_context else "NO_ID" + + if isinstance(attributes, dict): + node_name = attributes.get("node_name") + langgraph_node = attributes.get("langgraph.node") + + node_order = [] + seen_nodes = set() + + # System nodes to exclude + system_nodes = {"__start__", "__end__"} + + # First pass: Find LangGraph-related parent span IDs + # Look for spans that could be the main graph span (could have different names) + langgraph_span_ids = set() + for span in spans: + span_name = getattr(span, "name", "") + # Check if this is a LangGraph main span + if span_name and "langgraph" in span_name.lower(): + span_context = span.get_span_context() + if span_context: + langgraph_span_ids.add(span_context.span_id) + + + # If we found potential parent spans, use them; otherwise we'll check all spans with langgraph.node + if langgraph_span_ids: + # Second pass: Collect spans that have a LangGraph parent + for span in spans: + # Get parent span ID + parent_context = getattr(span, "parent", None) + parent_span_id = None + if parent_context: + parent_span_id = getattr(parent_context, "span_id", None) + + # Skip if parent is not one of the LangGraph spans + if parent_span_id not in langgraph_span_ids: + continue + + # Get node name - use span name directly since attributes might not have it + span_name = getattr(span, "name", "") + attributes = getattr(span, "attributes", {}) + + # Try to get from attributes first, then fall back to span name + node_name = None + if isinstance(attributes, dict): + node_name = attributes.get("langgraph.node") or attributes.get("node_name") + + if not node_name: + node_name = span_name + + # Skip if no node name found + if not node_name: + continue + + # Filter out system nodes + if node_name in system_nodes: + continue + + # Add 
to workflow if not seen before + if node_name not in seen_nodes: + seen_nodes.add(node_name) + node_order.append(node_name) + else: + # Fallback: Just get all spans with langgraph.node attribute + for span in spans: + attributes = getattr(span, "attributes", None) + if not attributes or not isinstance(attributes, dict): + continue + + node_name = attributes.get("langgraph.node") + + if not node_name or node_name in system_nodes: + continue + + if node_name not in seen_nodes: + seen_nodes.add(node_name) + node_order.append(node_name) + + return node_order + + class ExecutionSpanExporter(SpanExporter): """Custom exporter that stores spans grouped by execution ids.""" @@ -219,6 +317,7 @@ async def execute(self) -> UiPathRuntimeResult: eval_set_id=evaluation_set.id, no_of_evals=len(evaluation_set.evaluations), evaluators=evaluators, + evaluator_weights=getattr(evaluation_set, "evaluator_weights", None), ), ) @@ -235,16 +334,13 @@ async def execute(self) -> UiPathRuntimeResult: eval_run_result_list = await self._execute_sequential( evaluation_set, evaluators, event_bus ) - results = UiPathEvalOutput( - evaluation_set_name=evaluation_set.name, - evaluation_set_results=eval_run_result_list, - ) # Computing evaluator averages evaluator_averages: Dict[str, float] = defaultdict(float) evaluator_count: Dict[str, int] = defaultdict(int) - for eval_run_result in results.evaluation_set_results: + # Collect all evaluation results first + for eval_run_result in eval_run_result_list: for result_dto in eval_run_result.evaluation_run_results: evaluator_averages[result_dto.evaluator_id] += result_dto.result.score evaluator_count[result_dto.evaluator_id] += 1 @@ -253,11 +349,33 @@ async def execute(self) -> UiPathRuntimeResult: evaluator_averages[eval_id] = ( evaluator_averages[eval_id] / evaluator_count[eval_id] ) + + # Calculate weighted final score if weights are defined + evaluator_weights = getattr(evaluation_set, "evaluator_weights", None) + weighted_final_score = None + if evaluator_weights: + weighted_total = 0.0 + for evaluator_id, avg_score in evaluator_averages.items(): + weight = evaluator_weights.get(evaluator_id) + if weight is not None: + weighted_total += weight * avg_score + weighted_final_score = weighted_total + + # Create results with weighted score and weights + results = UiPathEvalOutput( + evaluation_set_name=evaluation_set.name, + evaluation_set_results=eval_run_result_list, + weighted_final_score=weighted_final_score, + evaluator_weights=evaluator_weights, + ) + await event_bus.publish( EvaluationEvents.UPDATE_EVAL_SET_RUN, EvalSetRunUpdatedEvent( execution_id=self.execution_id, evaluator_scores=evaluator_averages, + weighted_final_score=weighted_final_score, + evaluator_weights=evaluator_weights, ), wait_for_completion=False, ) @@ -405,6 +523,11 @@ async def _execute_eval( ) ) ) + # Extract workflow nodes from spans even in error case + if spans: + workflow = extract_workflow_from_spans(spans) + if workflow: + evaluation_run_results.workflow = workflow raise if self.context.verbose: @@ -413,6 +536,12 @@ async def _execute_eval( agent_execution_output ) ) + + # Extract workflow nodes from spans + workflow = extract_workflow_from_spans(agent_execution_output.spans) + # Always set workflow, even if empty, to distinguish from no extraction + evaluation_run_results.workflow = workflow if workflow else None + evaluation_item_results: list[EvalItemResult] = [] for evaluator in evaluators: @@ -456,11 +585,27 @@ async def _execute_eval( evaluation_result ) + # Extract node_id from evaluation 
criteria if available + node_id = None + if isinstance(eval_item, EvaluationItem) and evaluator.id in eval_item.evaluation_criterias: + criteria_dict = eval_item.evaluation_criterias[evaluator.id] + if criteria_dict: + node_id = criteria_dict.get("nodeId") + + # Get evaluator type from evaluator's get_evaluator_id method + evaluator_type = None + try: + evaluator_type = evaluator.get_evaluator_id() + except AttributeError: + pass + evaluation_run_results.evaluation_run_results.append( EvaluationRunResultDto( evaluator_name=evaluator.name, result=dto_result, evaluator_id=evaluator.id, + evaluator_type=evaluator_type, + node_id=node_id, ) ) evaluation_item_results.append( @@ -489,10 +634,26 @@ async def _execute_eval( exception_details = EvalItemExceptionDetails(exception=e) for evaluator in evaluators: + # Extract node_id from evaluation criteria if available + node_id = None + if isinstance(eval_item, EvaluationItem) and evaluator.id in eval_item.evaluation_criterias: + criteria_dict = eval_item.evaluation_criterias[evaluator.id] + if criteria_dict: + node_id = criteria_dict.get("nodeId") + + # Get evaluator type from evaluator's get_evaluator_id method + evaluator_type = None + try: + evaluator_type = evaluator.get_evaluator_id() + except AttributeError: + pass + evaluation_run_results.evaluation_run_results.append( EvaluationRunResultDto( evaluator_name=evaluator.name, evaluator_id=evaluator.id, + evaluator_type=evaluator_type, + node_id=node_id, result=EvaluationResultDto(score=0), ) ) diff --git a/src/uipath/_cli/cli_eval.py b/src/uipath/_cli/cli_eval.py index a1ca0cf31..0ab974745 100644 --- a/src/uipath/_cli/cli_eval.py +++ b/src/uipath/_cli/cli_eval.py @@ -82,6 +82,12 @@ def setup_reporting_prereq(no_report: bool) -> bool: type=click.Path(exists=False), help="File path where the output will be written", ) +@click.option( + "--verbose", + is_flag=True, + help="Enable verbose debug output for evaluators", + default=False, +) @track(when=lambda *_a, **_kw: os.getenv(ENV_JOB_ID) is None) def eval( entrypoint: Optional[str], @@ -91,6 +97,7 @@ def eval( no_report: bool, workers: int, output_file: Optional[str], + verbose: bool, ) -> None: """Run an evaluation set against the agent. 
@@ -101,7 +108,18 @@ def eval( eval_set_run_id: Custom evaluation set run ID (optional, will generate UUID if not specified) workers: Number of parallel workers for running evaluations no_report: Do not report the evaluation results + verbose: Enable verbose debug output for evaluators """ + # Configure logging level for evaluators if verbose is enabled + if verbose: + import logging + logging.basicConfig( + level=logging.DEBUG, + format='%(message)s' + ) + # Set the evaluators logger to DEBUG + logging.getLogger('uipath.eval.evaluators').setLevel(logging.DEBUG) + context_args = { "entrypoint": entrypoint or auto_discover_entrypoint(), "eval_set": eval_set, diff --git a/src/uipath/_cli/cli_run.py b/src/uipath/_cli/cli_run.py index 2f8cf0571..a0b366474 100644 --- a/src/uipath/_cli/cli_run.py +++ b/src/uipath/_cli/cli_run.py @@ -1,16 +1,22 @@ # type: ignore import asyncio +import json import os +from datetime import datetime from os import environ as env -from typing import Optional +from typing import Optional, Sequence +import uuid import click +from opentelemetry.sdk.trace import ReadableSpan +from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult from uipath._cli._runtime._runtime_factory import generate_runtime_factory from uipath._cli._utils._common import read_resource_overwrites_from_file from uipath._cli._utils._debug import setup_debugging from uipath._utils._bindings import ResourceOverwritesContext from uipath.tracing import JsonLinesFileExporter, LlmOpsHttpExporter +from uipath.tracing._utils import _SpanUtils from .._utils.constants import ( ENV_JOB_ID, @@ -20,9 +26,370 @@ from ._utils._console import ConsoleLogger from .middlewares import Middlewares +# Import LangChain instrumentor for automatic span generation +try: + from openinference.instrumentation.langchain import ( + LangChainInstrumentor, + get_current_span, + ) + LANGCHAIN_INSTRUMENTATION_AVAILABLE = True +except ImportError: + LANGCHAIN_INSTRUMENTATION_AVAILABLE = False + console = ConsoleLogger() +class MemorySpanExporter(SpanExporter): + """Span exporter that collects spans in memory for later processing.""" + + def __init__(self): + self.spans = [] + + def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult: + """Export spans to memory.""" + try: + for span in spans: + uipath_span = _SpanUtils.otel_span_to_uipath_span( + span, serialize_attributes=True + ) + self.spans.append(uipath_span.to_dict(serialize_attributes=False)) + return SpanExportResult.SUCCESS + except Exception: + return SpanExportResult.FAILURE + + def shutdown(self) -> None: + """Shutdown the exporter.""" + pass + + def force_flush(self, timeout_millis: int = 30000) -> bool: + """Force flush any buffered spans.""" + return True + + +def _generate_evaluation_set( + input_data: str, + output_data: str, + entrypoint: str, + eval_set_path: str, + evaluators: list[str] = None, + spans: list[dict] = None, +) -> None: + """Generate an evaluation set JSON file from a run execution. 
+ + Args: + input_data: The input data used for the run (as JSON string) + output_data: The output data from the run (as JSON string) + entrypoint: Path to the agent script + eval_set_path: Path where the evaluation set JSON file will be saved + evaluators: List of evaluator names to use (e.g., ['json_similarity', 'exact_match']) + spans: Optional list of span dictionaries containing node execution data + """ + try: + # Use json_similarity as default if no evaluators specified + if not evaluators: + evaluators = ["json_similarity"] + + # Create the directory structure for eval sets and evaluators + eval_set_file = os.path.abspath(eval_set_path) + eval_set_dir = os.path.dirname(eval_set_file) + + # If not already in an eval-sets dir, create proper structure + if not eval_set_dir.endswith("eval-sets"): + eval_set_dir = os.path.join(eval_set_dir, "evals", "eval-sets") + eval_set_file = os.path.join(eval_set_dir, os.path.basename(eval_set_path)) + + os.makedirs(eval_set_dir, exist_ok=True) + + # Create evaluators directory at the sibling level + evaluators_dir = os.path.join(os.path.dirname(eval_set_dir), "evaluators") + os.makedirs(evaluators_dir, exist_ok=True) + # Parse input and output + try: + parsed_input = json.loads(input_data) if input_data else {} + except (json.JSONDecodeError, TypeError): + # If input_data is already a dict or not JSON, handle it + if isinstance(input_data, dict): + parsed_input = input_data + else: + parsed_input = {"raw_input": str(input_data)} + + try: + # Handle output_data which might be a string, dict, or other object + if isinstance(output_data, str): + parsed_output = json.loads(output_data) + elif isinstance(output_data, dict): + parsed_output = output_data + else: + # For other types, try to convert to dict + parsed_output = json.loads(str(output_data)) + except (json.JSONDecodeError, TypeError): + parsed_output = {"raw_output": str(output_data)} + + # Generate unique IDs + eval_id = str(uuid.uuid4()) + timestamp = datetime.utcnow().isoformat() + "Z" + + # Build evaluation criteria and create evaluator files + evaluation_criteria = {} + evaluator_refs = [] + + # Evaluator type mapping (supports both short names and full type IDs) + evaluator_type_map = { + "json_similarity": { + "name": "JsonSimilarityEvaluator", + "evaluatorTypeId": "uipath-json-similarity", + "config_defaults": {"name": "JsonSimilarityEvaluator"} + }, + "uipath-json-similarity": { + "name": "JsonSimilarityEvaluator", + "evaluatorTypeId": "uipath-json-similarity", + "config_defaults": {"name": "JsonSimilarityEvaluator"} + }, + "exact_match": { + "name": "ExactMatchEvaluator", + "evaluatorTypeId": "uipath-exact-match", + "config_defaults": {"name": "ExactMatchEvaluator", "case_sensitive": False} + }, + "uipath-exact-match": { + "name": "ExactMatchEvaluator", + "evaluatorTypeId": "uipath-exact-match", + "config_defaults": {"name": "ExactMatchEvaluator", "case_sensitive": False} + }, + "contains": { + "name": "ContainsEvaluator", + "evaluatorTypeId": "uipath-contains", + "config_defaults": {"name": "ContainsEvaluator"} + }, + "uipath-contains": { + "name": "ContainsEvaluator", + "evaluatorTypeId": "uipath-contains", + "config_defaults": {"name": "ContainsEvaluator"} + }, + "llm_judge": { + "name": "LLMJudgeOutputEvaluator", + "evaluatorTypeId": "uipath-llm-judge-output-semantic-similarity", + "config_defaults": {"name": "LLMJudgeOutputEvaluator", "model": "anthropic.claude-3-5-sonnet-20240620-v1:0"} + }, + "uipath-llm-judge-output-semantic-similarity": { + "name": "LLMJudgeOutputEvaluator", 
+ "evaluatorTypeId": "uipath-llm-judge-output-semantic-similarity", + "config_defaults": {"name": "LLMJudgeOutputEvaluator", "model": "anthropic.claude-3-5-sonnet-20240620-v1:0"} + }, + "llm_judge_strict_json": { + "name": "LLMJudgeStrictJSONSimilarityOutputEvaluator", + "evaluatorTypeId": "uipath-llm-judge-output-strict-json-similarity", + "config_defaults": {"name": "LLMJudgeStrictJSONSimilarityOutputEvaluator", "model": "anthropic.claude-3-5-sonnet-20240620-v1:0"} + }, + "uipath-llm-judge-output-strict-json-similarity": { + "name": "LLMJudgeStrictJSONSimilarityOutputEvaluator", + "evaluatorTypeId": "uipath-llm-judge-output-strict-json-similarity", + "config_defaults": {"name": "LLMJudgeStrictJSONSimilarityOutputEvaluator", "model": "anthropic.claude-3-5-sonnet-20240620-v1:0"} + }, + "llm_judge_trajectory": { + "name": "LLMJudgeTrajectoryEvaluator", + "evaluatorTypeId": "uipath-llm-judge-trajectory", + "config_defaults": {"name": "LLMJudgeTrajectoryEvaluator", "model": "anthropic.claude-3-5-sonnet-20240620-v1:0"} + }, + "uipath-llm-judge-trajectory": { + "name": "LLMJudgeTrajectoryEvaluator", + "evaluatorTypeId": "uipath-llm-judge-trajectory", + "config_defaults": {"name": "LLMJudgeTrajectoryEvaluator", "model": "anthropic.claude-3-5-sonnet-20240620-v1:0"} + }, + "llm_judge_trajectory_simulation": { + "name": "LLMJudgeTrajectorySimulationEvaluator", + "evaluatorTypeId": "uipath-llm-judge-trajectory-simulation", + "config_defaults": {"name": "LLMJudgeTrajectorySimulationEvaluator", "model": "anthropic.claude-3-5-sonnet-20240620-v1:0"} + }, + "uipath-llm-judge-trajectory-simulation": { + "name": "LLMJudgeTrajectorySimulationEvaluator", + "evaluatorTypeId": "uipath-llm-judge-trajectory-simulation", + "config_defaults": {"name": "LLMJudgeTrajectorySimulationEvaluator", "model": "anthropic.claude-3-5-sonnet-20240620-v1:0"} + }, + "tool_call_args": { + "name": "ToolCallArgsEvaluator", + "evaluatorTypeId": "uipath-tool-call-args", + "config_defaults": {"name": "ToolCallArgsEvaluator"} + }, + "uipath-tool-call-args": { + "name": "ToolCallArgsEvaluator", + "evaluatorTypeId": "uipath-tool-call-args", + "config_defaults": {"name": "ToolCallArgsEvaluator"} + }, + "tool_call_count": { + "name": "ToolCallCountEvaluator", + "evaluatorTypeId": "uipath-tool-call-count", + "config_defaults": {"name": "ToolCallCountEvaluator"} + }, + "uipath-tool-call-count": { + "name": "ToolCallCountEvaluator", + "evaluatorTypeId": "uipath-tool-call-count", + "config_defaults": {"name": "ToolCallCountEvaluator"} + }, + "tool_call_order": { + "name": "ToolCallOrderEvaluator", + "evaluatorTypeId": "uipath-tool-call-order", + "config_defaults": {"name": "ToolCallOrderEvaluator"} + }, + "uipath-tool-call-order": { + "name": "ToolCallOrderEvaluator", + "evaluatorTypeId": "uipath-tool-call-order", + "config_defaults": {"name": "ToolCallOrderEvaluator"} + }, + "tool_call_output": { + "name": "ToolCallOutputEvaluator", + "evaluatorTypeId": "uipath-tool-call-output", + "config_defaults": {"name": "ToolCallOutputEvaluator"} + }, + "uipath-tool-call-output": { + "name": "ToolCallOutputEvaluator", + "evaluatorTypeId": "uipath-tool-call-output", + "config_defaults": {"name": "ToolCallOutputEvaluator"} + }, + } + + for evaluator_name in evaluators: + if evaluator_name not in evaluator_type_map: + console.warning(f"Unknown evaluator '{evaluator_name}', skipping") + continue + + evaluator_info = evaluator_type_map[evaluator_name] + evaluator_id = str(uuid.uuid4()) + evaluator_refs.append(evaluator_id) + + # Create evaluator JSON file + 
evaluator_def = { + "id": evaluator_id, + "name": f"{evaluator_info['name']} (auto-generated)", + "version": "1.0", + "evaluatorTypeId": evaluator_info["evaluatorTypeId"], + "evaluatorConfig": evaluator_info["config_defaults"], + } + + evaluator_file = os.path.join( + evaluators_dir, f"{evaluator_name}-{evaluator_id[:8]}.json" + ) + with open(evaluator_file, "w") as f: + json.dump(evaluator_def, f, indent=2) + + # Add evaluation criteria for this eval item (keyed by evaluator ID) + evaluation_criteria[evaluator_id] = { + "expected_output": parsed_output, + } + + # Create evaluation items + evaluation_items = [] + + # If spans are provided, create per-node evaluations + if spans: + # Filter spans to only include workflow nodes + node_spans = {} + node_order = [] # Track order of nodes + + for span in spans: + # First try to get the span name from the Name field (UiPath format) + span_name = span.get('Name', span.get('name', '')) + attributes = span.get('Attributes', span.get('attributes', {})) + + # Parse attributes if they're a JSON string + if isinstance(attributes, str): + try: + attributes = json.loads(attributes) + except: + attributes = {} + + # Determine the node name from various possible sources + node_name = None + if isinstance(attributes, dict): + node_name = attributes.get('node_name', attributes.get('langgraph.node', None)) + + # If no node_name attribute, use the span Name as the node name + if not node_name and span_name: + node_name = span_name + + # Only include valid workflow nodes (exclude system nodes, internal components, and LLM calls) + if node_name and node_name not in ['__start__', '__end__'] and not any( + node_name.startswith(prefix) for prefix in ['Runnable', 'UiPath', 'JsonOutput'] + ): + if node_name not in node_spans: + node_spans[node_name] = [] + node_order.append(node_name) + node_spans[node_name].append(span) + + if node_spans: + console.info(f"Found {len(node_spans)} workflow node(s) for evaluation generation") + + # Create evaluation for each node in execution order + for node_name in node_order: + node_span_list = node_spans[node_name] + # Get the most recent span for this node + node_span = node_span_list[-1] + node_attributes = node_span.get('Attributes', node_span.get('attributes', {})) + + # Parse attributes if they're a JSON string + if isinstance(node_attributes, str): + try: + node_attributes = json.loads(node_attributes) + except: + node_attributes = {} + + # Try different output keys: output.value, output, outputs + node_output = node_attributes.get('output.value', node_attributes.get('output', node_attributes.get('outputs', None))) + if isinstance(node_output, str): + try: + node_output = json.loads(node_output) + except: + pass + + if node_output: + # Create node-specific evaluation + node_eval_id = str(uuid.uuid4()) + node_evaluation_criteria = {} + + # Add evaluation criteria for each evaluator with node output + for evaluator_id in evaluator_refs: + node_evaluation_criteria[evaluator_id] = { + "expected_output": node_output, + } + + evaluation_items.append({ + "id": node_eval_id, + "name": f"Node: {node_name}", + "inputs": parsed_input, # Use agent input, not node-specific input + "evaluationCriterias": node_evaluation_criteria, + "expectedAgentBehavior": f"The agent should execute node '{node_name}' and produce the expected output during the workflow execution.", + "nodeId": node_name, # Add node identifier for evaluators to match against trace + }) + + # Always include final output evaluation + evaluation_item = { + "id": eval_id, + 
"name": f"Final Output", + "inputs": parsed_input, + "evaluationCriterias": evaluation_criteria, + "expectedAgentBehavior": "Agent should produce the expected output for the given input", + } + evaluation_items.append(evaluation_item) + + # Create evaluation set + eval_set = { + "id": str(uuid.uuid4()), + "name": f"Evaluation set generated from {entrypoint}", + "version": "1.0", + "evaluatorRefs": evaluator_refs, + "evaluations": evaluation_items, + } + + # Save eval set to file + with open(eval_set_file, "w") as f: + json.dump(eval_set, f, indent=2) + + console.success(f"Evaluation set generated and saved to: {eval_set_file}") + console.info(f"Generated {len(evaluation_items)} evaluation(s) with {len(evaluator_refs)} evaluator(s) in: {evaluators_dir}") + + except Exception as e: + console.error(f"Failed to generate evaluation set: {str(e)}", include_traceback=True) + + @click.command() @click.argument("entrypoint", required=False) @click.argument("input", required=False, default="{}") @@ -43,8 +410,8 @@ @click.option( "--output-file", required=False, - type=click.Path(exists=False), - help="File path where the output will be written", + type=click.Path(), + help="File path where the output will be written (will overwrite if exists)", ) @click.option( "--trace-file", @@ -63,6 +430,18 @@ default=5678, help="Port for the debug server (default: 5678)", ) +@click.option( + "--generate-evals", + required=False, + type=click.Path(), + help="Generate an evaluation set file from this run and save it to the specified path (will overwrite if exists)", +) +@click.option( + "--eval-evaluators", + multiple=True, + default=["json_similarity"], + help="Evaluators to use for generated eval set (can be specified multiple times). Available: json_similarity, exact_match, contains, llm_judge, llm_judge_strict_json, llm_judge_trajectory, llm_judge_trajectory_simulation, tool_call_args, tool_call_count, tool_call_order, tool_call_output. 
You can also use full type IDs like 'uipath-json-similarity'.", +) @track(when=lambda *_a, **_kw: env.get(ENV_JOB_ID) is None) def run( entrypoint: Optional[str], @@ -74,6 +453,8 @@ def run( trace_file: Optional[str], debug: bool, debug_port: int, + generate_evals: Optional[str], + eval_evaluators: tuple[str], ) -> None: """Execute the project.""" context_args = { @@ -84,6 +465,9 @@ def run( "execution_output_file": output_file, "trace_file": trace_file, "debug": debug, + "generate_evals": generate_evals, + # Enable tracing if we're generating evals to capture node data + "tracing_enabled": True if generate_evals else None, } input_file = file or input_file # Setup debugging if requested @@ -115,8 +499,11 @@ def run( Usage: `uipath run [-f ]`""") try: + execution_result = None + memory_span_exporter = None async def execute() -> None: + nonlocal execution_result, memory_span_exporter runtime_factory = generate_runtime_factory() context = runtime_factory.new_context(**context_args) if context.job_id: @@ -125,6 +512,16 @@ async def execute() -> None: if trace_file: runtime_factory.add_span_exporter(JsonLinesFileExporter(trace_file)) + # Add memory span exporter if generating evals to capture node-level data + # Use batch=False to ensure immediate export of spans + if generate_evals: + memory_span_exporter = MemorySpanExporter() + runtime_factory.add_span_exporter(memory_span_exporter, batch=False) + + # Add LangChain instrumentor to automatically trace LangChain/LangGraph operations + if LANGCHAIN_INSTRUMENTATION_AVAILABLE: + runtime_factory.add_instrumentor(LangChainInstrumentor, get_current_span) + if context.job_id: async with ResourceOverwritesContext( lambda: read_resource_overwrites_from_file(context.runtime_dir) @@ -133,15 +530,55 @@ async def execute() -> None: f"Applied {ctx.overwrites_count} resource overwrite(s)" ) - result = await runtime_factory.execute(context) + execution_result = await runtime_factory.execute(context) else: - result = await runtime_factory.execute(context) + execution_result = await runtime_factory.execute(context) if not context.job_id: - console.info(result.output) + console.info(execution_result.output) asyncio.run(execute()) + # Generate evaluation set if requested + if generate_evals and execution_result: + # Get the actual input data (from file or argument) + actual_input = input + if input_file and os.path.exists(input_file): + try: + with open(input_file, 'r') as f: + actual_input = f.read() + except Exception as e: + console.warning(f"Failed to read input file for eval generation: {e}") + + # Convert output to proper format for eval generation + output_for_eval = execution_result.output if hasattr(execution_result, 'output') else execution_result + + # If output is a Pydantic model, convert to dict + if hasattr(output_for_eval, 'model_dump'): + output_for_eval = output_for_eval.model_dump() + elif hasattr(output_for_eval, 'dict'): + output_for_eval = output_for_eval.dict() + # If it's already a dict, ensure it's not wrapped + elif isinstance(output_for_eval, dict) and 'dict' in output_for_eval: + # Unwrap if it's in the format {"dict": "..."} + try: + import ast + output_for_eval = ast.literal_eval(output_for_eval['dict']) + except: + pass # Keep as-is if parsing fails + + # Get spans from memory exporter if available + collected_spans = memory_span_exporter.spans if memory_span_exporter else None + + _generate_evaluation_set( + input_data=actual_input, + output_data=output_for_eval, + entrypoint=entrypoint, + eval_set_path=generate_evals, + 
evaluators=list(eval_evaluators) if eval_evaluators else None, + spans=collected_spans, + ) + except UiPathRuntimeError as e: console.error(f"{e.error_info.title} - {e.error_info.detail}") except Exception as e: diff --git a/src/uipath/_events/_events.py b/src/uipath/_events/_events.py index ffffcff14..44c6ba8e7 100644 --- a/src/uipath/_events/_events.py +++ b/src/uipath/_events/_events.py @@ -24,6 +24,7 @@ class EvalSetRunCreatedEvent(BaseModel): no_of_evals: int # skip validation to avoid abstract class instantiation evaluators: SkipValidation[List[AnyEvaluator]] + evaluator_weights: Optional[Dict[str, float]] = None class EvalRunCreatedEvent(BaseModel): @@ -61,6 +62,8 @@ def validate_exception_details(self): class EvalSetRunUpdatedEvent(BaseModel): execution_id: str evaluator_scores: dict[str, float] + weighted_final_score: Optional[float] = None + evaluator_weights: Optional[Dict[str, float]] = None ProgressEvent = Union[ diff --git a/src/uipath/_services/context_grounding_service.py b/src/uipath/_services/context_grounding_service.py index 57f373d5c..1fe0a63a3 100644 --- a/src/uipath/_services/context_grounding_service.py +++ b/src/uipath/_services/context_grounding_service.py @@ -476,6 +476,7 @@ def search( spec.method, spec.endpoint, json=spec.json, + headers=spec.headers, ) return TypeAdapter(List[ContextGroundingQueryResponse]).validate_python( @@ -527,6 +528,7 @@ async def search_async( spec.method, spec.endpoint, json=spec.json, + headers=spec.headers, ) return TypeAdapter(List[ContextGroundingQueryResponse]).validate_python( diff --git a/src/uipath/eval/_helpers/evaluators_helpers.py b/src/uipath/eval/_helpers/evaluators_helpers.py index 8620130cf..7f8f4b356 100644 --- a/src/uipath/eval/_helpers/evaluators_helpers.py +++ b/src/uipath/eval/_helpers/evaluators_helpers.py @@ -420,6 +420,41 @@ def tool_calls_output_score( ), justifications +def extract_node_output_from_trace(agent_trace: Sequence[ReadableSpan], node_id: str) -> Any: + """Extract the output of a specific node from the agent execution trace. + + Args: + agent_trace: List of ReadableSpan objects from agent execution. + node_id: The identifier of the node to extract output from. + + Returns: + The output value of the node, or None if not found. + """ + for span in agent_trace: + if not span.attributes: + continue + + # Check if this span matches the node_id + span_name = span.name + node_name_attr = span.attributes.get('node_name') or span.attributes.get('langgraph.node') + + # Match by span name or node_name attribute + if span_name == node_id or node_name_attr == node_id: + # Extract output from span attributes + output_value = span.attributes.get('output.value') or span.attributes.get('output') + + # Try to parse if it's a JSON string + if isinstance(output_value, str): + try: + return json.loads(output_value) + except (json.JSONDecodeError, ValueError): + return output_value + + return output_value + + return None + + def trace_to_str(agent_trace: Sequence[ReadableSpan]) -> str: """Convert OTEL spans to a platform-style agent run history string. 
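For orientation, here is a minimal sketch of the eval-set JSON that the `--generate-evals` flow above (`_generate_evaluation_set`) writes; the entrypoint name, IDs, inputs, and expected outputs below are placeholders rather than values from a real run. Node-level items carry the `nodeId` that the output evaluators later resolve against the trace via `extract_node_output_from_trace`.

# Illustrative shape only; field names follow _generate_evaluation_set, values are placeholders.
example_eval_set = {
    "id": "<uuid>",                      # uuid4 in the real generator
    "name": "Evaluation set generated from main.py",
    "version": "1.0",
    "evaluatorRefs": ["<evaluator-uuid>"],
    "evaluations": [
        {
            "id": "<uuid>",
            "name": "Node: summarize",   # one item per workflow node found in the collected spans
            "inputs": {"topic": "weather"},   # agent input, not node-specific input
            "evaluationCriterias": {
                "<evaluator-uuid>": {"expected_output": {"summary": "..."}},
            },
            "expectedAgentBehavior": "The agent should execute node 'summarize' and produce the expected output during the workflow execution.",
            "nodeId": "summarize",       # matched against span name / langgraph.node attribute
        },
        {
            "id": "<uuid>",
            "name": "Final Output",
            "inputs": {"topic": "weather"},
            "evaluationCriterias": {
                "<evaluator-uuid>": {"expected_output": {"report": "..."}},
            },
            "expectedAgentBehavior": "Agent should produce the expected output for the given input",
        },
    ],
}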
diff --git a/src/uipath/eval/evaluators/base_evaluator.py b/src/uipath/eval/evaluators/base_evaluator.py index 5a7e4615b..7f8333e15 100644 --- a/src/uipath/eval/evaluators/base_evaluator.py +++ b/src/uipath/eval/evaluators/base_evaluator.py @@ -3,7 +3,7 @@ import json import warnings from abc import ABC, abstractmethod -from typing import Any, Generic, TypeVar, Union, cast, get_args +from typing import Any, Generic, Optional, TypeVar, Union, cast, get_args from pydantic import BaseModel, ConfigDict, Field, model_validator from pydantic.alias_generators import to_camel @@ -17,7 +17,10 @@ class BaseEvaluationCriteria(BaseModel): """Base class for all evaluation criteria.""" model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True) - pass + + node_id: Optional[str] = Field( + default=None, alias="nodeId" + ) # Optional node identifier for node-level evaluations # Type variable for evaluation criteria, used by both Config and Evaluator diff --git a/src/uipath/eval/evaluators/contains_evaluator.py b/src/uipath/eval/evaluators/contains_evaluator.py index 964c9a709..39f55e3be 100644 --- a/src/uipath/eval/evaluators/contains_evaluator.py +++ b/src/uipath/eval/evaluators/contains_evaluator.py @@ -1,5 +1,7 @@ """Contains evaluator for agent outputs.""" +import logging + from ..models import ( AgentExecution, EvaluationResult, @@ -12,6 +14,8 @@ OutputEvaluatorConfig, ) +logger = logging.getLogger(__name__) + class ContainsEvaluationCriteria(BaseEvaluationCriteria): """Evaluation criteria for the contains evaluator.""" @@ -58,19 +62,58 @@ async def evaluate( Returns: EvaluationResult: Boolean result indicating if output contains expected value (True/False) """ - actual_output = str(self._get_actual_output(agent_execution)) + actual_output = str(self._get_actual_output(agent_execution, evaluation_criteria)) expected_output = str(self._get_expected_output(evaluation_criteria)) + # Debug logging (before case conversion) + if logger.isEnabledFor(logging.DEBUG): + logger.debug("\n" + "="*80) + logger.debug("[DEBUG] ContainsEvaluator - Comparison:") + logger.debug("="*80) + logger.debug("[ACTUAL OUTPUT (original)]:\n%s", actual_output) + logger.debug("\n" + "-"*80) + logger.debug("[EXPECTED OUTPUT (original)]:\n%s", expected_output) + logger.debug("-"*80) + if not self.evaluator_config.case_sensitive: actual_output = actual_output.lower() expected_output = expected_output.lower() + if logger.isEnabledFor(logging.DEBUG): + logger.debug("[ACTUAL OUTPUT (lowercased)]:\n%s", actual_output) + logger.debug("\n" + "-"*80) + logger.debug("[EXPECTED OUTPUT (lowercased)]:\n%s", expected_output) + logger.debug("-"*80) is_contains = expected_output in actual_output + if logger.isEnabledFor(logging.DEBUG): + logger.debug("[CASE SENSITIVE]: %s", self.evaluator_config.case_sensitive) + logger.debug("[NEGATED]: %s", self.evaluator_config.negated) + logger.debug("[CONTAINS RESULT]: %s", is_contains) + if self.evaluator_config.negated: is_contains = not is_contains + if logger.isEnabledFor(logging.DEBUG): + logger.debug("[FINAL RESULT (after negation)]: %s", is_contains) + else: + if logger.isEnabledFor(logging.DEBUG): + logger.debug("[FINAL RESULT]: %s", is_contains) + + if logger.isEnabledFor(logging.DEBUG): + logger.debug("="*80 + "\n") + + # Create details with comparison information + details = { + "actual_output": str(self._get_actual_output(agent_execution, evaluation_criteria)), + "search_text": str(self._get_expected_output(evaluation_criteria)), + "case_sensitive": 
self.evaluator_config.case_sensitive, + "negated": self.evaluator_config.negated, + "contains": is_contains, + } + return NumericEvaluationResult( score=float(is_contains), + details=details, ) def _get_expected_output( diff --git a/src/uipath/eval/evaluators/exact_match_evaluator.py b/src/uipath/eval/evaluators/exact_match_evaluator.py index 0ff8ebd2c..b3cc2f2ab 100644 --- a/src/uipath/eval/evaluators/exact_match_evaluator.py +++ b/src/uipath/eval/evaluators/exact_match_evaluator.py @@ -53,7 +53,7 @@ async def evaluate( Returns: EvaluationResult: Boolean result indicating exact match (True/False) """ - actual_output = str(self._get_actual_output(agent_execution)) + actual_output = str(self._get_actual_output(agent_execution, evaluation_criteria)) expected_output = str(self._get_expected_output(evaluation_criteria)) if not self.evaluator_config.case_sensitive: actual_output = actual_output.lower() @@ -63,6 +63,16 @@ async def evaluate( if self.evaluator_config.negated: is_exact_match = not is_exact_match + # Create details with comparison information + details = { + "actual_output": str(self._get_actual_output(agent_execution, evaluation_criteria)), + "expected_output": str(self._get_expected_output(evaluation_criteria)), + "case_sensitive": self.evaluator_config.case_sensitive, + "negated": self.evaluator_config.negated, + "match": is_exact_match, + } + return NumericEvaluationResult( score=float(is_exact_match), + details=details, ) diff --git a/src/uipath/eval/evaluators/json_similarity_evaluator.py b/src/uipath/eval/evaluators/json_similarity_evaluator.py index 1e90c171c..10a0ae590 100644 --- a/src/uipath/eval/evaluators/json_similarity_evaluator.py +++ b/src/uipath/eval/evaluators/json_similarity_evaluator.py @@ -56,14 +56,23 @@ async def evaluate( Returns: EvaluationResult: Numerical score between 0-100 indicating similarity """ - score, justification = self._compare_json( - self._get_expected_output(evaluation_criteria), - self._get_actual_output(agent_execution), - ) + expected_output = self._get_expected_output(evaluation_criteria) + actual_output = self._get_actual_output(agent_execution, evaluation_criteria) + + score, justification = self._compare_json(expected_output, actual_output) validated_justification = self.validate_justification(justification) + + # Create details with comparison information + details = { + "actual_output": str(actual_output), + "expected_output": str(expected_output), + "similarity_details": validated_justification, + "score": score, + } + return NumericEvaluationResult( score=score, - details=validated_justification, + details=details, ) def _compare_json(self, expected: Any, actual: Any) -> tuple[float, str]: diff --git a/src/uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py b/src/uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py index c55296583..814b78971 100644 --- a/src/uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py +++ b/src/uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py @@ -71,9 +71,17 @@ async def evaluate( llm_response = await self._get_llm_response(evaluation_prompt) + # Create details with comparison information + details = { + "actual_output": str(agent_execution.agent_output), + "expected_output": str(evaluation_criteria), + "llm_justification": llm_response.justification, + "llm_score": llm_response.score, + } + return NumericEvaluationResult( score=llm_response.score, - details=llm_response.justification, + details=details, ) def _create_evaluation_prompt( diff --git 
a/src/uipath/eval/evaluators/llm_as_judge_evaluator.py b/src/uipath/eval/evaluators/llm_as_judge_evaluator.py index 71a543ab1..128026804 100644 --- a/src/uipath/eval/evaluators/llm_as_judge_evaluator.py +++ b/src/uipath/eval/evaluators/llm_as_judge_evaluator.py @@ -1,12 +1,15 @@ """LLM-as-a-judge evaluator for subjective quality assessment of agent outputs.""" import json +import logging from abc import abstractmethod from collections.abc import Callable from typing import Any, TypeVar from pydantic import BaseModel, Field, model_validator +logger = logging.getLogger(__name__) + from .._helpers.evaluators_helpers import COMMUNITY_agents_SUFFIX from ..models import ( AgentExecution, @@ -92,7 +95,9 @@ def _get_llm_service(self): ) from e @abstractmethod - def _get_actual_output(self, agent_execution: AgentExecution) -> Any: + def _get_actual_output( + self, agent_execution: AgentExecution, evaluation_criteria: T + ) -> Any: """Get the actual output from the agent execution. Must be implemented by concrete evaluator classes.""" pass @@ -107,6 +112,9 @@ async def evaluate( evaluation_criteria: T, ) -> EvaluationResult: """Evaluate using an LLM as a judge.""" + actual_output = str(self._get_actual_output(agent_execution, evaluation_criteria)) + expected_output = str(self._get_expected_output(evaluation_criteria)) + evaluation_prompt = self._create_evaluation_prompt( agent_execution=agent_execution, evaluation_criteria=evaluation_criteria, @@ -117,9 +125,17 @@ async def evaluate( llm_response.justification ) + # Create detailed response with comparison info and LLM justification + details = { + "actual_output": actual_output, + "expected_output": expected_output, + "llm_justification": validated_justification, + "llm_score": llm_response.score, + } + return NumericEvaluationResult( score=max(0.0, min(1.0, round(llm_response.score / 100.0, 2))), - details=validated_justification, + details=details, ) def _create_evaluation_prompt( @@ -128,13 +144,26 @@ def _create_evaluation_prompt( evaluation_criteria: T, ) -> str: """Create the evaluation prompt for the LLM.""" + actual_output = str(self._get_actual_output(agent_execution, evaluation_criteria)) + expected_output = str(self._get_expected_output(evaluation_criteria)) + + # Debug logging + if logger.isEnabledFor(logging.DEBUG): + logger.debug("\n" + "="*80) + logger.debug("[DEBUG] LLMJudgeOutputEvaluator - Comparison:") + logger.debug("="*80) + logger.debug("[ACTUAL OUTPUT]:\n%s", actual_output) + logger.debug("\n" + "-"*80) + logger.debug("[EXPECTED OUTPUT]:\n%s", expected_output) + logger.debug("="*80 + "\n") + formatted_prompt = self.evaluator_config.prompt.replace( self.actual_output_placeholder, - str(self._get_actual_output(agent_execution)), + actual_output, ) formatted_prompt = formatted_prompt.replace( self.expected_output_placeholder, - str(self._get_expected_output(evaluation_criteria)), + expected_output, ) return formatted_prompt @@ -147,22 +176,33 @@ async def _get_llm_response(self, evaluation_prompt: str) -> LLMResponse: model = model.replace(COMMUNITY_agents_SUFFIX, "") # Prepare the request + # For Anthropic models, explicitly request JSON in the user message + is_anthropic = model.startswith("anthropic.") + user_content = evaluation_prompt + if is_anthropic: + schema_json = json.dumps(self.output_schema.model_json_schema(), indent=2) + user_content = f"{evaluation_prompt}\n\nYou MUST respond with valid JSON matching this exact schema:\n{schema_json}\n\nProvide ONLY the JSON response, no other text." 
+ request_data = { "model": model, "messages": [ {"role": "system", "content": self.system_prompt}, - {"role": "user", "content": evaluation_prompt}, + {"role": "user", "content": user_content}, ], - "response_format": { + "max_tokens": self.evaluator_config.max_tokens, + "temperature": self.evaluator_config.temperature, + } + + # Only add response_format for non-Anthropic models + # Anthropic models don't support json_schema response_format via Normalized API + if not is_anthropic: + request_data["response_format"] = { "type": "json_schema", "json_schema": { "name": "evaluation_response", "schema": self.output_schema.model_json_schema(), }, - }, - "max_tokens": self.evaluator_config.max_tokens, - "temperature": self.evaluator_config.temperature, - } + } if self.llm_service is None: raise UiPathEvaluationError( @@ -191,12 +231,26 @@ async def _get_llm_response(self, evaluation_prompt: str) -> LLMResponse: detail="The LLM response message content was None.", category=UiPathEvaluationErrorCategory.SYSTEM, ) + if not content or not str(content).strip(): + raise UiPathEvaluationError( + code="EMPTY_LLM_RESPONSE", + title="Empty LLM response", + detail=f"The LLM response message content was empty. Content: '{content}'", + category=UiPathEvaluationErrorCategory.SYSTEM, + ) parsed_response = json.loads(str(content)) + except json.JSONDecodeError as e: + raise UiPathEvaluationError( + code="FAILED_TO_PARSE_LLM_RESPONSE", + title="Failed to parse LLM response", + detail=f"Error: {e}\nContent received: '{content}'\nContent type: {type(content)}", + category=UiPathEvaluationErrorCategory.SYSTEM, + ) from e except Exception as e: raise UiPathEvaluationError( code="FAILED_TO_PARSE_LLM_RESPONSE", title="Failed to parse LLM response", - detail=f"Error: {e}", + detail=f"Error: {e}\nContent received: '{content}'", category=UiPathEvaluationErrorCategory.SYSTEM, ) from e return LLMResponse(**parsed_response) diff --git a/src/uipath/eval/evaluators/llm_judge_trajectory_evaluator.py b/src/uipath/eval/evaluators/llm_judge_trajectory_evaluator.py index eac5c11b1..5972bc9f5 100644 --- a/src/uipath/eval/evaluators/llm_judge_trajectory_evaluator.py +++ b/src/uipath/eval/evaluators/llm_judge_trajectory_evaluator.py @@ -76,7 +76,9 @@ async def evaluate( """Evaluate using trajectory analysis.""" return await super().evaluate(agent_execution, evaluation_criteria) - def _get_actual_output(self, agent_execution: AgentExecution) -> Any: + def _get_actual_output( + self, agent_execution: AgentExecution, evaluation_criteria: TrajectoryEvaluationCriteria + ) -> Any: """Get the actual output from the agent execution.""" return trace_to_str(agent_execution.agent_trace) diff --git a/src/uipath/eval/evaluators/output_evaluator.py b/src/uipath/eval/evaluators/output_evaluator.py index 2aa362e18..987e5f40f 100644 --- a/src/uipath/eval/evaluators/output_evaluator.py +++ b/src/uipath/eval/evaluators/output_evaluator.py @@ -1,7 +1,7 @@ """Base class for all output evaluator configurations.""" import json -from typing import Any, TypeVar, Union +from typing import Any, Optional, TypeVar, Union from pydantic import Field @@ -13,12 +13,14 @@ BaseEvaluatorConfig, BaseEvaluatorJustification, ) +from .._helpers.evaluators_helpers import extract_node_output_from_trace class OutputEvaluationCriteria(BaseEvaluationCriteria): """Base class for all output evaluation criteria.""" expected_output: dict[str, Any] | str + # node_id inherited from BaseEvaluationCriteria T = TypeVar("T", bound=BaseEvaluationCriteria) @@ -50,8 +52,31 @@ class 
BaseOutputEvaluator(BaseEvaluator[T, C, J]): J: The justification type """ - def _get_actual_output(self, agent_execution: AgentExecution) -> Any: - """Get the actual output from the agent execution.""" + def _get_actual_output( + self, agent_execution: AgentExecution, evaluation_criteria: T + ) -> Any: + """Get the actual output from the agent execution. + + If the evaluation criteria contains a node_id, extract the node output from the trace. + Otherwise, return the agent's final output. + """ + # Check if this is a node-level evaluation + node_id = getattr(evaluation_criteria, "node_id", None) + if node_id: + # Extract node output from trace + node_output = extract_node_output_from_trace( + agent_execution.agent_trace, node_id + ) + if node_output is None: + raise UiPathEvaluationError( + code="NODE_OUTPUT_NOT_FOUND", + title=f"Node output not found for node_id: {node_id}", + detail=f"Could not find output for node '{node_id}' in agent trace", + category=UiPathEvaluationErrorCategory.USER, + ) + return node_output + + # Standard agent output extraction if self.evaluator_config.target_output_key != "*": try: return agent_execution.agent_output[ diff --git a/src/uipath/eval/evaluators/tool_call_args_evaluator.py b/src/uipath/eval/evaluators/tool_call_args_evaluator.py index 2703e3c76..880f93aba 100644 --- a/src/uipath/eval/evaluators/tool_call_args_evaluator.py +++ b/src/uipath/eval/evaluators/tool_call_args_evaluator.py @@ -76,7 +76,17 @@ async def evaluate( self.evaluator_config.subset, ) validated_justification = self.validate_justification(justification) + + # Create details with comparison information + details = { + "actual_tool_calls": [{"name": tc.name, "args": tc.args} for tc in tool_calls_order], + "expected_tool_calls": [{"name": tc.name, "args": tc.args} for tc in evaluation_criteria.tool_calls], + "strict_mode": self.evaluator_config.strict, + "subset_mode": self.evaluator_config.subset, + "explanation": validated_justification, + } + return NumericEvaluationResult( score=score, - details=validated_justification, + details=details, ) diff --git a/src/uipath/eval/evaluators/tool_call_count_evaluator.py b/src/uipath/eval/evaluators/tool_call_count_evaluator.py index 11d684ae1..3079ff198 100644 --- a/src/uipath/eval/evaluators/tool_call_count_evaluator.py +++ b/src/uipath/eval/evaluators/tool_call_count_evaluator.py @@ -81,7 +81,16 @@ async def evaluate( self.evaluator_config.strict, ) validated_justification = self.validate_justification(justification) + + # Create details with comparison information + details = { + "actual_tool_calls": dict(tool_calls_count), + "expected_tool_calls": evaluation_criteria.tool_calls_count, + "strict_mode": self.evaluator_config.strict, + "explanation": validated_justification, + } + return NumericEvaluationResult( score=score, - details=validated_justification, + details=details, ) diff --git a/src/uipath/eval/models/models.py b/src/uipath/eval/models/models.py index f3e9e3ca9..5ed4805ea 100644 --- a/src/uipath/eval/models/models.py +++ b/src/uipath/eval/models/models.py @@ -39,7 +39,7 @@ class ScoreType(IntEnum): class BaseEvaluationResult(BaseModel): """Base class for evaluation results.""" - details: Optional[str | BaseModel] = None + details: Optional[str | Dict[str, Any] | BaseModel] = None # this is marked as optional, as it is populated inside the 'measure_execution_time' decorator evaluation_time: Optional[float] = None
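As a reference for reviewers, a self-contained sketch of the weighted-final-score arithmetic this patch adds in _runtime.py and mirrors in the console reporter; the function and variable names below are local to the example, not part of the patch. Evaluators without a weight are skipped and the weights are not re-normalized (the reporter only prints a note when they do not sum to 1.0).

from typing import Dict, Optional


def weighted_final_score(
    evaluator_averages: Dict[str, float],
    evaluator_weights: Optional[Dict[str, float]],
) -> Optional[float]:
    """Sum weight * average for every evaluator that has a weight; None if no weights."""
    if not evaluator_weights:
        return None
    total = 0.0
    for evaluator_id, avg_score in evaluator_averages.items():
        weight = evaluator_weights.get(evaluator_id)
        if weight is not None:
            total += weight * avg_score
    return total


# Example: averages 0.8 and 0.6 with weights 0.7 and 0.3 give 0.56 + 0.18 = 0.74
print(weighted_final_score({"acc": 0.8, "style": 0.6}, {"acc": 0.7, "style": 0.3}))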