diff --git a/hello b/hello new file mode 100644 index 000000000..f0ad0dec6 --- /dev/null +++ b/hello @@ -0,0 +1 @@ +hello.txt diff --git a/src/uipath/_cli/_evals/_console_progress_reporter.py b/src/uipath/_cli/_evals/_console_progress_reporter.py index 5d1d17f38..1f3504b0f 100644 --- a/src/uipath/_cli/_evals/_console_progress_reporter.py +++ b/src/uipath/_cli/_evals/_console_progress_reporter.py @@ -29,6 +29,7 @@ def __init__(self): self.evaluators: Dict[str, AnyEvaluator] = {} self.display_started = False self.eval_results_by_name: Dict[str, list[Any]] = {} + self.evaluator_weights: Dict[str, float] = {} def _convert_score_to_numeric(self, eval_result) -> float: """Convert evaluation result score to numeric value.""" @@ -99,6 +100,8 @@ async def handle_create_eval_set_run(self, payload: EvalSetRunCreatedEvent) -> N """Handle evaluation set run creation.""" try: self.evaluators = {eval.id: eval for eval in payload.evaluators} + if payload.evaluator_weights: + self.evaluator_weights = payload.evaluator_weights except Exception as e: logger.error(f"Failed to handle create eval set run event: {e}") @@ -206,9 +209,20 @@ def display_final_results(self): summary_table.add_row(*row_values) - # Add separator row before average + # Add separator row before weights and average summary_table.add_section() + # Add weights row if weights are defined + if self.evaluator_weights: + weight_row_values = ["[bold]Weights[/bold]"] + for evaluator_id in evaluator_ids: + weight = self.evaluator_weights.get(evaluator_id, "-") + if weight != "-": + weight_row_values.append(f"[bold]{weight:.1f}[/bold]") + else: + weight_row_values.append("[bold]-[/bold]") + summary_table.add_row(*weight_row_values) + # Add average row avg_row_values = ["[bold]Average[/bold]"] for evaluator_id in evaluator_ids: @@ -217,8 +231,31 @@ def display_final_results(self): summary_table.add_row(*avg_row_values) - self.console.print(summary_table) - self.console.print() + # Calculate and display weighted final score if weights are defined + if self.evaluator_weights: + weighted_total = 0.0 + weights_sum = 0.0 + for evaluator_id in evaluator_ids: + weight = self.evaluator_weights.get(evaluator_id) + if weight is not None: + avg_score = self.final_results[evaluator_id] + weighted_total += weight * avg_score + weights_sum += weight + + # Display as a separate info line + self.console.print(summary_table) + self.console.print() + self.console.print( + f"[bold cyan]Weighted Final Score:[/bold cyan] [bold green]{weighted_total:.2f}[/bold green]" + ) + if weights_sum != 1.0: + self.console.print( + f"[dim](Note: Weights sum to {weights_sum:.2f})[/dim]" + ) + self.console.print() + else: + self.console.print(summary_table) + self.console.print() else: self.console.print( "→ [bold green]All evaluations completed successfully![/bold green]" diff --git a/src/uipath/_cli/_evals/_models/_evaluation_set.py b/src/uipath/_cli/_evals/_models/_evaluation_set.py index 137f4f06e..e81c46d77 100644 --- a/src/uipath/_cli/_evals/_models/_evaluation_set.py +++ b/src/uipath/_cli/_evals/_models/_evaluation_set.py @@ -160,6 +160,9 @@ class EvaluationSet(BaseModel): version: Literal["1.0"] = "1.0" evaluator_refs: List[str] = Field(default_factory=list) evaluations: List[EvaluationItem] = Field(default_factory=list) + evaluator_weights: Optional[Dict[str, float]] = Field( + default=None, alias="evaluatorWeights" + ) def extract_selected_evals(self, eval_ids) -> None: selected_evals: list[EvaluationItem] = [] diff --git a/src/uipath/_cli/_evals/_models/_output.py 
b/src/uipath/_cli/_evals/_models/_output.py index c3ba1c728..28aa7f42e 100644 --- a/src/uipath/_cli/_evals/_models/_output.py +++ b/src/uipath/_cli/_evals/_models/_output.py @@ -46,7 +46,7 @@ class EvaluationResultDto(BaseModel): model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True) score: float - details: Optional[str | BaseModel] = None + details: Optional[str | Dict[str, Any] | BaseModel] = None evaluation_time: Optional[float] = None @model_serializer(mode="wrap") @@ -56,6 +56,7 @@ def serialize_model( info: core_schema.SerializationInfo, ) -> Any: data = serializer(self) + # Only remove details if it's None, keep empty dicts and populated dicts if self.details is None and isinstance(data, dict): data.pop("details", None) return data @@ -85,6 +86,8 @@ class EvaluationRunResultDto(BaseModel): evaluator_name: str evaluator_id: str + evaluator_type: Optional[str] = None + node_id: Optional[str] = None result: EvaluationResultDto @@ -93,6 +96,7 @@ class EvaluationRunResult(BaseModel): evaluation_name: str evaluation_run_results: List[EvaluationRunResultDto] + workflow: Optional[List[str]] = None agent_execution_output: Optional[UiPathSerializableEvalRunExecutionOutput] = None @property @@ -110,6 +114,8 @@ class UiPathEvalOutput(BaseModel): evaluation_set_name: str evaluation_set_results: List[EvaluationRunResult] + weighted_final_score: Optional[float] = None + evaluator_weights: Optional[Dict[str, float]] = None @property def score(self) -> float: diff --git a/src/uipath/_cli/_evals/_progress_reporter.py b/src/uipath/_cli/_evals/_progress_reporter.py index 5c0516e9b..efb6d5a0e 100644 --- a/src/uipath/_cli/_evals/_progress_reporter.py +++ b/src/uipath/_cli/_evals/_progress_reporter.py @@ -329,10 +329,11 @@ async def update_eval_set_run( eval_set_run_id: str, evaluator_scores: dict[str, float], is_coded: bool = False, + weighted_final_score: float | None = None, ): """Update the evaluation set run status to complete.""" spec = self._update_eval_set_run_spec( - eval_set_run_id, evaluator_scores, is_coded + eval_set_run_id, evaluator_scores, is_coded, weighted_final_score ) await self._client.request_async( method=spec.method, @@ -452,6 +453,7 @@ async def handle_update_eval_set_run(self, payload: EvalSetRunUpdatedEvent) -> N eval_set_run_id, payload.evaluator_scores, is_coded=is_coded, + weighted_final_score=payload.weighted_final_score, ) logger.debug( f"Updated eval set run with ID: {eval_set_run_id} (coded={is_coded})" @@ -797,6 +799,7 @@ def _update_eval_set_run_spec( eval_set_run_id: str, evaluator_scores: dict[str, float], is_coded: bool = False, + weighted_final_score: float | None = None, ) -> RequestSpec: # Legacy API expects evaluatorId as GUID, coded accepts string evaluator_scores_list = [] @@ -820,16 +823,24 @@ def _update_eval_set_run_spec( # For legacy evaluations, endpoint is without /coded endpoint_suffix = "coded/" if is_coded else "" + + # Build the JSON payload + json_payload = { + "evalSetRunId": eval_set_run_id, + "status": EvaluationStatus.COMPLETED.value, + "evaluatorScores": evaluator_scores_list, + } + + # Add weighted final score if available + if weighted_final_score is not None: + json_payload["weightedFinalScore"] = weighted_final_score + return RequestSpec( method="PUT", endpoint=Endpoint( f"{self._get_endpoint_prefix()}execution/agents/{self._project_id}/{endpoint_suffix}evalSetRun" ), - json={ - "evalSetRunId": eval_set_run_id, - "status": EvaluationStatus.COMPLETED.value, - "evaluatorScores": evaluator_scores_list, - }, + 
json=json_payload, headers=self._tenant_header(), ) diff --git a/src/uipath/_cli/_evals/_runtime.py b/src/uipath/_cli/_evals/_runtime.py index b9a7e7d6f..5ec936808 100644 --- a/src/uipath/_cli/_evals/_runtime.py +++ b/src/uipath/_cli/_evals/_runtime.py @@ -12,9 +12,7 @@ from opentelemetry.sdk.trace import ReadableSpan, Span from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult -from uipath._cli._evals.mocks.input_mocker import ( - generate_llm_input, -) +from uipath._cli._evals.mocks.input_mocker import generate_llm_input from ..._events._event_bus import EventBus from ..._events._events import ( @@ -59,15 +57,115 @@ convert_eval_execution_output_to_serializable, ) from ._span_collection import ExecutionSpanCollector -from .mocks.mocks import ( - clear_execution_context, - set_execution_context, -) +from .mocks.mocks import clear_execution_context, set_execution_context T = TypeVar("T", bound=UiPathBaseRuntime) C = TypeVar("C", bound=UiPathRuntimeContext) +def extract_workflow_from_spans(spans: list[ReadableSpan]) -> list[str]: + """Extract ordered list of main workflow nodes from execution spans. + + Only captures workflow nodes that are direct children of a LangGraph parent span, + which naturally filters out sub-nodes and internal components. + + Args: + spans: List of ReadableSpan objects from agent execution + + Returns: + List of unique main node names in execution order + """ + + for i, span in enumerate(spans): + span_name = getattr(span, "name", "NO_NAME") + attributes = getattr(span, "attributes", {}) + parent_context = getattr(span, "parent", None) + parent_span_id = None + if parent_context: + parent_span_id = getattr(parent_context, "span_id", None) + + span_context = span.get_span_context() + span_id = span_context.span_id if span_context else "NO_ID" + + if isinstance(attributes, dict): + node_name = attributes.get("node_name") + langgraph_node = attributes.get("langgraph.node") + + node_order = [] + seen_nodes = set() + + # System nodes to exclude + system_nodes = {"__start__", "__end__"} + + # First pass: Find LangGraph-related parent span IDs + # Look for spans that could be the main graph span (could have different names) + langgraph_span_ids = set() + for span in spans: + span_name = getattr(span, "name", "") + # Check if this is a LangGraph main span + if span_name and "langgraph" in span_name.lower(): + span_context = span.get_span_context() + if span_context: + langgraph_span_ids.add(span_context.span_id) + + + # If we found potential parent spans, use them; otherwise we'll check all spans with langgraph.node + if langgraph_span_ids: + # Second pass: Collect spans that have a LangGraph parent + for span in spans: + # Get parent span ID + parent_context = getattr(span, "parent", None) + parent_span_id = None + if parent_context: + parent_span_id = getattr(parent_context, "span_id", None) + + # Skip if parent is not one of the LangGraph spans + if parent_span_id not in langgraph_span_ids: + continue + + # Get node name - use span name directly since attributes might not have it + span_name = getattr(span, "name", "") + attributes = getattr(span, "attributes", {}) + + # Try to get from attributes first, then fall back to span name + node_name = None + if isinstance(attributes, dict): + node_name = attributes.get("langgraph.node") or attributes.get("node_name") + + if not node_name: + node_name = span_name + + # Skip if no node name found + if not node_name: + continue + + # Filter out system nodes + if node_name in system_nodes: + continue + + # Add 
to workflow if not seen before + if node_name not in seen_nodes: + seen_nodes.add(node_name) + node_order.append(node_name) + else: + # Fallback: Just get all spans with langgraph.node attribute + for span in spans: + attributes = getattr(span, "attributes", None) + if not attributes or not isinstance(attributes, dict): + continue + + node_name = attributes.get("langgraph.node") + + if not node_name or node_name in system_nodes: + continue + + if node_name not in seen_nodes: + seen_nodes.add(node_name) + node_order.append(node_name) + + return node_order + + class ExecutionSpanExporter(SpanExporter): """Custom exporter that stores spans grouped by execution ids.""" @@ -219,6 +317,7 @@ async def execute(self) -> UiPathRuntimeResult: eval_set_id=evaluation_set.id, no_of_evals=len(evaluation_set.evaluations), evaluators=evaluators, + evaluator_weights=getattr(evaluation_set, "evaluator_weights", None), ), ) @@ -235,16 +334,13 @@ async def execute(self) -> UiPathRuntimeResult: eval_run_result_list = await self._execute_sequential( evaluation_set, evaluators, event_bus ) - results = UiPathEvalOutput( - evaluation_set_name=evaluation_set.name, - evaluation_set_results=eval_run_result_list, - ) # Computing evaluator averages evaluator_averages: Dict[str, float] = defaultdict(float) evaluator_count: Dict[str, int] = defaultdict(int) - for eval_run_result in results.evaluation_set_results: + # Collect all evaluation results first + for eval_run_result in eval_run_result_list: for result_dto in eval_run_result.evaluation_run_results: evaluator_averages[result_dto.evaluator_id] += result_dto.result.score evaluator_count[result_dto.evaluator_id] += 1 @@ -253,11 +349,33 @@ async def execute(self) -> UiPathRuntimeResult: evaluator_averages[eval_id] = ( evaluator_averages[eval_id] / evaluator_count[eval_id] ) + + # Calculate weighted final score if weights are defined + evaluator_weights = getattr(evaluation_set, "evaluator_weights", None) + weighted_final_score = None + if evaluator_weights: + weighted_total = 0.0 + for evaluator_id, avg_score in evaluator_averages.items(): + weight = evaluator_weights.get(evaluator_id) + if weight is not None: + weighted_total += weight * avg_score + weighted_final_score = weighted_total + + # Create results with weighted score and weights + results = UiPathEvalOutput( + evaluation_set_name=evaluation_set.name, + evaluation_set_results=eval_run_result_list, + weighted_final_score=weighted_final_score, + evaluator_weights=evaluator_weights, + ) + await event_bus.publish( EvaluationEvents.UPDATE_EVAL_SET_RUN, EvalSetRunUpdatedEvent( execution_id=self.execution_id, evaluator_scores=evaluator_averages, + weighted_final_score=weighted_final_score, + evaluator_weights=evaluator_weights, ), wait_for_completion=False, ) @@ -405,6 +523,11 @@ async def _execute_eval( ) ) ) + # Extract workflow nodes from spans even in error case + if spans: + workflow = extract_workflow_from_spans(spans) + if workflow: + evaluation_run_results.workflow = workflow raise if self.context.verbose: @@ -413,6 +536,12 @@ async def _execute_eval( agent_execution_output ) ) + + # Extract workflow nodes from spans + workflow = extract_workflow_from_spans(agent_execution_output.spans) + # Always set workflow, even if empty, to distinguish from no extraction + evaluation_run_results.workflow = workflow if workflow else None + evaluation_item_results: list[EvalItemResult] = [] for evaluator in evaluators: @@ -456,11 +585,27 @@ async def _execute_eval( evaluation_result ) + # Extract node_id from evaluation 
criteria if available + node_id = None + if isinstance(eval_item, EvaluationItem) and evaluator.id in eval_item.evaluation_criterias: + criteria_dict = eval_item.evaluation_criterias[evaluator.id] + if criteria_dict: + node_id = criteria_dict.get("nodeId") + + # Get evaluator type from evaluator's get_evaluator_id method + evaluator_type = None + try: + evaluator_type = evaluator.get_evaluator_id() + except AttributeError: + pass + evaluation_run_results.evaluation_run_results.append( EvaluationRunResultDto( evaluator_name=evaluator.name, result=dto_result, evaluator_id=evaluator.id, + evaluator_type=evaluator_type, + node_id=node_id, ) ) evaluation_item_results.append( @@ -489,10 +634,26 @@ async def _execute_eval( exception_details = EvalItemExceptionDetails(exception=e) for evaluator in evaluators: + # Extract node_id from evaluation criteria if available + node_id = None + if isinstance(eval_item, EvaluationItem) and evaluator.id in eval_item.evaluation_criterias: + criteria_dict = eval_item.evaluation_criterias[evaluator.id] + if criteria_dict: + node_id = criteria_dict.get("nodeId") + + # Get evaluator type from evaluator's get_evaluator_id method + evaluator_type = None + try: + evaluator_type = evaluator.get_evaluator_id() + except AttributeError: + pass + evaluation_run_results.evaluation_run_results.append( EvaluationRunResultDto( evaluator_name=evaluator.name, evaluator_id=evaluator.id, + evaluator_type=evaluator_type, + node_id=node_id, result=EvaluationResultDto(score=0), ) ) diff --git a/src/uipath/_cli/cli_eval.py b/src/uipath/_cli/cli_eval.py index a1ca0cf31..0ab974745 100644 --- a/src/uipath/_cli/cli_eval.py +++ b/src/uipath/_cli/cli_eval.py @@ -82,6 +82,12 @@ def setup_reporting_prereq(no_report: bool) -> bool: type=click.Path(exists=False), help="File path where the output will be written", ) +@click.option( + "--verbose", + is_flag=True, + help="Enable verbose debug output for evaluators", + default=False, +) @track(when=lambda *_a, **_kw: os.getenv(ENV_JOB_ID) is None) def eval( entrypoint: Optional[str], @@ -91,6 +97,7 @@ def eval( no_report: bool, workers: int, output_file: Optional[str], + verbose: bool, ) -> None: """Run an evaluation set against the agent. 
@@ -101,7 +108,18 @@ def eval( eval_set_run_id: Custom evaluation set run ID (optional, will generate UUID if not specified) workers: Number of parallel workers for running evaluations no_report: Do not report the evaluation results + verbose: Enable verbose debug output for evaluators """ + # Configure logging level for evaluators if verbose is enabled + if verbose: + import logging + logging.basicConfig( + level=logging.DEBUG, + format='%(message)s' + ) + # Set the evaluators logger to DEBUG + logging.getLogger('uipath.eval.evaluators').setLevel(logging.DEBUG) + context_args = { "entrypoint": entrypoint or auto_discover_entrypoint(), "eval_set": eval_set, diff --git a/src/uipath/_cli/cli_run.py b/src/uipath/_cli/cli_run.py index 2f8cf0571..a0b366474 100644 --- a/src/uipath/_cli/cli_run.py +++ b/src/uipath/_cli/cli_run.py @@ -1,16 +1,22 @@ # type: ignore import asyncio +import json import os +from datetime import datetime from os import environ as env -from typing import Optional +from typing import Optional, Sequence +import uuid import click +from opentelemetry.sdk.trace import ReadableSpan +from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult from uipath._cli._runtime._runtime_factory import generate_runtime_factory from uipath._cli._utils._common import read_resource_overwrites_from_file from uipath._cli._utils._debug import setup_debugging from uipath._utils._bindings import ResourceOverwritesContext from uipath.tracing import JsonLinesFileExporter, LlmOpsHttpExporter +from uipath.tracing._utils import _SpanUtils from .._utils.constants import ( ENV_JOB_ID, @@ -20,9 +26,370 @@ from ._utils._console import ConsoleLogger from .middlewares import Middlewares +# Import LangChain instrumentor for automatic span generation +try: + from openinference.instrumentation.langchain import ( + LangChainInstrumentor, + get_current_span, + ) + LANGCHAIN_INSTRUMENTATION_AVAILABLE = True +except ImportError: + LANGCHAIN_INSTRUMENTATION_AVAILABLE = False + console = ConsoleLogger() +class MemorySpanExporter(SpanExporter): + """Span exporter that collects spans in memory for later processing.""" + + def __init__(self): + self.spans = [] + + def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult: + """Export spans to memory.""" + try: + for span in spans: + uipath_span = _SpanUtils.otel_span_to_uipath_span( + span, serialize_attributes=True + ) + self.spans.append(uipath_span.to_dict(serialize_attributes=False)) + return SpanExportResult.SUCCESS + except Exception: + return SpanExportResult.FAILURE + + def shutdown(self) -> None: + """Shutdown the exporter.""" + pass + + def force_flush(self, timeout_millis: int = 30000) -> bool: + """Force flush any buffered spans.""" + return True + + +def _generate_evaluation_set( + input_data: str, + output_data: str, + entrypoint: str, + eval_set_path: str, + evaluators: list[str] = None, + spans: list[dict] = None, +) -> None: + """Generate an evaluation set JSON file from a run execution. 
+ + Args: + input_data: The input data used for the run (as JSON string) + output_data: The output data from the run (as JSON string) + entrypoint: Path to the agent script + eval_set_path: Path where the evaluation set JSON file will be saved + evaluators: List of evaluator names to use (e.g., ['json_similarity', 'exact_match']) + spans: Optional list of span dictionaries containing node execution data + """ + try: + # Use json_similarity as default if no evaluators specified + if not evaluators: + evaluators = ["json_similarity"] + + # Create the directory structure for eval sets and evaluators + eval_set_file = os.path.abspath(eval_set_path) + eval_set_dir = os.path.dirname(eval_set_file) + + # If not already in an eval-sets dir, create proper structure + if not eval_set_dir.endswith("eval-sets"): + eval_set_dir = os.path.join(eval_set_dir, "evals", "eval-sets") + eval_set_file = os.path.join(eval_set_dir, os.path.basename(eval_set_path)) + + os.makedirs(eval_set_dir, exist_ok=True) + + # Create evaluators directory at the sibling level + evaluators_dir = os.path.join(os.path.dirname(eval_set_dir), "evaluators") + os.makedirs(evaluators_dir, exist_ok=True) + # Parse input and output + try: + parsed_input = json.loads(input_data) if input_data else {} + except (json.JSONDecodeError, TypeError): + # If input_data is already a dict or not JSON, handle it + if isinstance(input_data, dict): + parsed_input = input_data + else: + parsed_input = {"raw_input": str(input_data)} + + try: + # Handle output_data which might be a string, dict, or other object + if isinstance(output_data, str): + parsed_output = json.loads(output_data) + elif isinstance(output_data, dict): + parsed_output = output_data + else: + # For other types, try to convert to dict + parsed_output = json.loads(str(output_data)) + except (json.JSONDecodeError, TypeError): + parsed_output = {"raw_output": str(output_data)} + + # Generate unique IDs + eval_id = str(uuid.uuid4()) + timestamp = datetime.utcnow().isoformat() + "Z" + + # Build evaluation criteria and create evaluator files + evaluation_criteria = {} + evaluator_refs = [] + + # Evaluator type mapping (supports both short names and full type IDs) + evaluator_type_map = { + "json_similarity": { + "name": "JsonSimilarityEvaluator", + "evaluatorTypeId": "uipath-json-similarity", + "config_defaults": {"name": "JsonSimilarityEvaluator"} + }, + "uipath-json-similarity": { + "name": "JsonSimilarityEvaluator", + "evaluatorTypeId": "uipath-json-similarity", + "config_defaults": {"name": "JsonSimilarityEvaluator"} + }, + "exact_match": { + "name": "ExactMatchEvaluator", + "evaluatorTypeId": "uipath-exact-match", + "config_defaults": {"name": "ExactMatchEvaluator", "case_sensitive": False} + }, + "uipath-exact-match": { + "name": "ExactMatchEvaluator", + "evaluatorTypeId": "uipath-exact-match", + "config_defaults": {"name": "ExactMatchEvaluator", "case_sensitive": False} + }, + "contains": { + "name": "ContainsEvaluator", + "evaluatorTypeId": "uipath-contains", + "config_defaults": {"name": "ContainsEvaluator"} + }, + "uipath-contains": { + "name": "ContainsEvaluator", + "evaluatorTypeId": "uipath-contains", + "config_defaults": {"name": "ContainsEvaluator"} + }, + "llm_judge": { + "name": "LLMJudgeOutputEvaluator", + "evaluatorTypeId": "uipath-llm-judge-output-semantic-similarity", + "config_defaults": {"name": "LLMJudgeOutputEvaluator", "model": "anthropic.claude-3-5-sonnet-20240620-v1:0"} + }, + "uipath-llm-judge-output-semantic-similarity": { + "name": "LLMJudgeOutputEvaluator", 
+ "evaluatorTypeId": "uipath-llm-judge-output-semantic-similarity", + "config_defaults": {"name": "LLMJudgeOutputEvaluator", "model": "anthropic.claude-3-5-sonnet-20240620-v1:0"} + }, + "llm_judge_strict_json": { + "name": "LLMJudgeStrictJSONSimilarityOutputEvaluator", + "evaluatorTypeId": "uipath-llm-judge-output-strict-json-similarity", + "config_defaults": {"name": "LLMJudgeStrictJSONSimilarityOutputEvaluator", "model": "anthropic.claude-3-5-sonnet-20240620-v1:0"} + }, + "uipath-llm-judge-output-strict-json-similarity": { + "name": "LLMJudgeStrictJSONSimilarityOutputEvaluator", + "evaluatorTypeId": "uipath-llm-judge-output-strict-json-similarity", + "config_defaults": {"name": "LLMJudgeStrictJSONSimilarityOutputEvaluator", "model": "anthropic.claude-3-5-sonnet-20240620-v1:0"} + }, + "llm_judge_trajectory": { + "name": "LLMJudgeTrajectoryEvaluator", + "evaluatorTypeId": "uipath-llm-judge-trajectory", + "config_defaults": {"name": "LLMJudgeTrajectoryEvaluator", "model": "anthropic.claude-3-5-sonnet-20240620-v1:0"} + }, + "uipath-llm-judge-trajectory": { + "name": "LLMJudgeTrajectoryEvaluator", + "evaluatorTypeId": "uipath-llm-judge-trajectory", + "config_defaults": {"name": "LLMJudgeTrajectoryEvaluator", "model": "anthropic.claude-3-5-sonnet-20240620-v1:0"} + }, + "llm_judge_trajectory_simulation": { + "name": "LLMJudgeTrajectorySimulationEvaluator", + "evaluatorTypeId": "uipath-llm-judge-trajectory-simulation", + "config_defaults": {"name": "LLMJudgeTrajectorySimulationEvaluator", "model": "anthropic.claude-3-5-sonnet-20240620-v1:0"} + }, + "uipath-llm-judge-trajectory-simulation": { + "name": "LLMJudgeTrajectorySimulationEvaluator", + "evaluatorTypeId": "uipath-llm-judge-trajectory-simulation", + "config_defaults": {"name": "LLMJudgeTrajectorySimulationEvaluator", "model": "anthropic.claude-3-5-sonnet-20240620-v1:0"} + }, + "tool_call_args": { + "name": "ToolCallArgsEvaluator", + "evaluatorTypeId": "uipath-tool-call-args", + "config_defaults": {"name": "ToolCallArgsEvaluator"} + }, + "uipath-tool-call-args": { + "name": "ToolCallArgsEvaluator", + "evaluatorTypeId": "uipath-tool-call-args", + "config_defaults": {"name": "ToolCallArgsEvaluator"} + }, + "tool_call_count": { + "name": "ToolCallCountEvaluator", + "evaluatorTypeId": "uipath-tool-call-count", + "config_defaults": {"name": "ToolCallCountEvaluator"} + }, + "uipath-tool-call-count": { + "name": "ToolCallCountEvaluator", + "evaluatorTypeId": "uipath-tool-call-count", + "config_defaults": {"name": "ToolCallCountEvaluator"} + }, + "tool_call_order": { + "name": "ToolCallOrderEvaluator", + "evaluatorTypeId": "uipath-tool-call-order", + "config_defaults": {"name": "ToolCallOrderEvaluator"} + }, + "uipath-tool-call-order": { + "name": "ToolCallOrderEvaluator", + "evaluatorTypeId": "uipath-tool-call-order", + "config_defaults": {"name": "ToolCallOrderEvaluator"} + }, + "tool_call_output": { + "name": "ToolCallOutputEvaluator", + "evaluatorTypeId": "uipath-tool-call-output", + "config_defaults": {"name": "ToolCallOutputEvaluator"} + }, + "uipath-tool-call-output": { + "name": "ToolCallOutputEvaluator", + "evaluatorTypeId": "uipath-tool-call-output", + "config_defaults": {"name": "ToolCallOutputEvaluator"} + }, + } + + for evaluator_name in evaluators: + if evaluator_name not in evaluator_type_map: + console.warning(f"Unknown evaluator '{evaluator_name}', skipping") + continue + + evaluator_info = evaluator_type_map[evaluator_name] + evaluator_id = str(uuid.uuid4()) + evaluator_refs.append(evaluator_id) + + # Create evaluator JSON file + 
evaluator_def = { + "id": evaluator_id, + "name": f"{evaluator_info['name']} (auto-generated)", + "version": "1.0", + "evaluatorTypeId": evaluator_info["evaluatorTypeId"], + "evaluatorConfig": evaluator_info["config_defaults"], + } + + evaluator_file = os.path.join( + evaluators_dir, f"{evaluator_name}-{evaluator_id[:8]}.json" + ) + with open(evaluator_file, "w") as f: + json.dump(evaluator_def, f, indent=2) + + # Add evaluation criteria for this eval item (keyed by evaluator ID) + evaluation_criteria[evaluator_id] = { + "expected_output": parsed_output, + } + + # Create evaluation items + evaluation_items = [] + + # If spans are provided, create per-node evaluations + if spans: + # Filter spans to only include workflow nodes + node_spans = {} + node_order = [] # Track order of nodes + + for span in spans: + # First try to get the span name from the Name field (UiPath format) + span_name = span.get('Name', span.get('name', '')) + attributes = span.get('Attributes', span.get('attributes', {})) + + # Parse attributes if they're a JSON string + if isinstance(attributes, str): + try: + attributes = json.loads(attributes) + except: + attributes = {} + + # Determine the node name from various possible sources + node_name = None + if isinstance(attributes, dict): + node_name = attributes.get('node_name', attributes.get('langgraph.node', None)) + + # If no node_name attribute, use the span Name as the node name + if not node_name and span_name: + node_name = span_name + + # Only include valid workflow nodes (exclude system nodes, internal components, and LLM calls) + if node_name and node_name not in ['__start__', '__end__'] and not any( + node_name.startswith(prefix) for prefix in ['Runnable', 'UiPath', 'JsonOutput'] + ): + if node_name not in node_spans: + node_spans[node_name] = [] + node_order.append(node_name) + node_spans[node_name].append(span) + + if node_spans: + console.info(f"Found {len(node_spans)} workflow node(s) for evaluation generation") + + # Create evaluation for each node in execution order + for node_name in node_order: + node_span_list = node_spans[node_name] + # Get the most recent span for this node + node_span = node_span_list[-1] + node_attributes = node_span.get('Attributes', node_span.get('attributes', {})) + + # Parse attributes if they're a JSON string + if isinstance(node_attributes, str): + try: + node_attributes = json.loads(node_attributes) + except: + node_attributes = {} + + # Try different output keys: output.value, output, outputs + node_output = node_attributes.get('output.value', node_attributes.get('output', node_attributes.get('outputs', None))) + if isinstance(node_output, str): + try: + node_output = json.loads(node_output) + except: + pass + + if node_output: + # Create node-specific evaluation + node_eval_id = str(uuid.uuid4()) + node_evaluation_criteria = {} + + # Add evaluation criteria for each evaluator with node output + for evaluator_id in evaluator_refs: + node_evaluation_criteria[evaluator_id] = { + "expected_output": node_output, + } + + evaluation_items.append({ + "id": node_eval_id, + "name": f"Node: {node_name}", + "inputs": parsed_input, # Use agent input, not node-specific input + "evaluationCriterias": node_evaluation_criteria, + "expectedAgentBehavior": f"The agent should execute node '{node_name}' and produce the expected output during the workflow execution.", + "nodeId": node_name, # Add node identifier for evaluators to match against trace + }) + + # Always include final output evaluation + evaluation_item = { + "id": eval_id, + 
"name": f"Final Output", + "inputs": parsed_input, + "evaluationCriterias": evaluation_criteria, + "expectedAgentBehavior": "Agent should produce the expected output for the given input", + } + evaluation_items.append(evaluation_item) + + # Create evaluation set + eval_set = { + "id": str(uuid.uuid4()), + "name": f"Evaluation set generated from {entrypoint}", + "version": "1.0", + "evaluatorRefs": evaluator_refs, + "evaluations": evaluation_items, + } + + # Save eval set to file + with open(eval_set_file, "w") as f: + json.dump(eval_set, f, indent=2) + + console.success(f"Evaluation set generated and saved to: {eval_set_file}") + console.info(f"Generated {len(evaluation_items)} evaluation(s) with {len(evaluator_refs)} evaluator(s) in: {evaluators_dir}") + + except Exception as e: + console.error(f"Failed to generate evaluation set: {str(e)}", include_traceback=True) + + @click.command() @click.argument("entrypoint", required=False) @click.argument("input", required=False, default="{}") @@ -43,8 +410,8 @@ @click.option( "--output-file", required=False, - type=click.Path(exists=False), - help="File path where the output will be written", + type=click.Path(), + help="File path where the output will be written (will overwrite if exists)", ) @click.option( "--trace-file", @@ -63,6 +430,18 @@ default=5678, help="Port for the debug server (default: 5678)", ) +@click.option( + "--generate-evals", + required=False, + type=click.Path(), + help="Generate an evaluation set file from this run and save it to the specified path (will overwrite if exists)", +) +@click.option( + "--eval-evaluators", + multiple=True, + default=["json_similarity"], + help="Evaluators to use for generated eval set (can be specified multiple times). Available: json_similarity, exact_match, contains, llm_judge, llm_judge_strict_json, llm_judge_trajectory, llm_judge_trajectory_simulation, tool_call_args, tool_call_count, tool_call_order, tool_call_output. 
You can also use full type IDs like 'uipath-json-similarity'.", +) @track(when=lambda *_a, **_kw: env.get(ENV_JOB_ID) is None) def run( entrypoint: Optional[str], @@ -74,6 +453,8 @@ def run( trace_file: Optional[str], debug: bool, debug_port: int, + generate_evals: Optional[str], + eval_evaluators: tuple[str], ) -> None: """Execute the project.""" context_args = { @@ -84,6 +465,9 @@ def run( "execution_output_file": output_file, "trace_file": trace_file, "debug": debug, + "generate_evals": generate_evals, + # Enable tracing if we're generating evals to capture node data + "tracing_enabled": True if generate_evals else None, } input_file = file or input_file # Setup debugging if requested @@ -115,8 +499,11 @@ def run( Usage: `uipath run [-f ]`""") try: + execution_result = None + memory_span_exporter = None async def execute() -> None: + nonlocal execution_result, memory_span_exporter runtime_factory = generate_runtime_factory() context = runtime_factory.new_context(**context_args) if context.job_id: @@ -125,6 +512,16 @@ async def execute() -> None: if trace_file: runtime_factory.add_span_exporter(JsonLinesFileExporter(trace_file)) + # Add memory span exporter if generating evals to capture node-level data + # Use batch=False to ensure immediate export of spans + if generate_evals: + memory_span_exporter = MemorySpanExporter() + runtime_factory.add_span_exporter(memory_span_exporter, batch=False) + + # Add LangChain instrumentor to automatically trace LangChain/LangGraph operations + if LANGCHAIN_INSTRUMENTATION_AVAILABLE: + runtime_factory.add_instrumentor(LangChainInstrumentor, get_current_span) + if context.job_id: async with ResourceOverwritesContext( lambda: read_resource_overwrites_from_file(context.runtime_dir) @@ -133,15 +530,55 @@ async def execute() -> None: f"Applied {ctx.overwrites_count} resource overwrite(s)" ) - result = await runtime_factory.execute(context) + execution_result = await runtime_factory.execute(context) else: - result = await runtime_factory.execute(context) + execution_result = await runtime_factory.execute(context) if not context.job_id: - console.info(result.output) + console.info(execution_result.output) asyncio.run(execute()) + # Generate evaluation set if requested + if generate_evals and execution_result: + # Get the actual input data (from file or argument) + actual_input = input + if input_file and os.path.exists(input_file): + try: + with open(input_file, 'r') as f: + actual_input = f.read() + except Exception as e: + console.warning(f"Failed to read input file for eval generation: {e}") + + # Convert output to proper format for eval generation + output_for_eval = execution_result.output if hasattr(execution_result, 'output') else execution_result + + # If output is a Pydantic model, convert to dict + if hasattr(output_for_eval, 'model_dump'): + output_for_eval = output_for_eval.model_dump() + elif hasattr(output_for_eval, 'dict'): + output_for_eval = output_for_eval.dict() + # If it's already a dict, ensure it's not wrapped + elif isinstance(output_for_eval, dict) and 'dict' in output_for_eval: + # Unwrap if it's in the format {"dict": "..."} + try: + import ast + output_for_eval = ast.literal_eval(output_for_eval['dict']) + except: + pass # Keep as-is if parsing fails + + # Get spans from memory exporter if available + collected_spans = memory_span_exporter.spans if memory_span_exporter else None + + _generate_evaluation_set( + input_data=actual_input, + output_data=output_for_eval, + entrypoint=entrypoint, + eval_set_path=generate_evals, + 
evaluators=list(eval_evaluators) if eval_evaluators else None, + spans=collected_spans, + ) + except UiPathRuntimeError as e: console.error(f"{e.error_info.title} - {e.error_info.detail}") except Exception as e: diff --git a/src/uipath/_events/_events.py b/src/uipath/_events/_events.py index ffffcff14..44c6ba8e7 100644 --- a/src/uipath/_events/_events.py +++ b/src/uipath/_events/_events.py @@ -24,6 +24,7 @@ class EvalSetRunCreatedEvent(BaseModel): no_of_evals: int # skip validation to avoid abstract class instantiation evaluators: SkipValidation[List[AnyEvaluator]] + evaluator_weights: Optional[Dict[str, float]] = None class EvalRunCreatedEvent(BaseModel): @@ -61,6 +62,8 @@ def validate_exception_details(self): class EvalSetRunUpdatedEvent(BaseModel): execution_id: str evaluator_scores: dict[str, float] + weighted_final_score: Optional[float] = None + evaluator_weights: Optional[Dict[str, float]] = None ProgressEvent = Union[ diff --git a/src/uipath/_services/context_grounding_service.py b/src/uipath/_services/context_grounding_service.py index 57f373d5c..1fe0a63a3 100644 --- a/src/uipath/_services/context_grounding_service.py +++ b/src/uipath/_services/context_grounding_service.py @@ -476,6 +476,7 @@ def search( spec.method, spec.endpoint, json=spec.json, + headers=spec.headers, ) return TypeAdapter(List[ContextGroundingQueryResponse]).validate_python( @@ -527,6 +528,7 @@ async def search_async( spec.method, spec.endpoint, json=spec.json, + headers=spec.headers, ) return TypeAdapter(List[ContextGroundingQueryResponse]).validate_python( diff --git a/src/uipath/eval/_helpers/evaluators_helpers.py b/src/uipath/eval/_helpers/evaluators_helpers.py index 8620130cf..7f8f4b356 100644 --- a/src/uipath/eval/_helpers/evaluators_helpers.py +++ b/src/uipath/eval/_helpers/evaluators_helpers.py @@ -420,6 +420,41 @@ def tool_calls_output_score( ), justifications +def extract_node_output_from_trace(agent_trace: Sequence[ReadableSpan], node_id: str) -> Any: + """Extract the output of a specific node from the agent execution trace. + + Args: + agent_trace: List of ReadableSpan objects from agent execution. + node_id: The identifier of the node to extract output from. + + Returns: + The output value of the node, or None if not found. + """ + for span in agent_trace: + if not span.attributes: + continue + + # Check if this span matches the node_id + span_name = span.name + node_name_attr = span.attributes.get('node_name') or span.attributes.get('langgraph.node') + + # Match by span name or node_name attribute + if span_name == node_id or node_name_attr == node_id: + # Extract output from span attributes + output_value = span.attributes.get('output.value') or span.attributes.get('output') + + # Try to parse if it's a JSON string + if isinstance(output_value, str): + try: + return json.loads(output_value) + except (json.JSONDecodeError, ValueError): + return output_value + + return output_value + + return None + + def trace_to_str(agent_trace: Sequence[ReadableSpan]) -> str: """Convert OTEL spans to a platform-style agent run history string. 
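For orientation, here is a minimal sketch of the eval-set JSON that the `--generate-evals` flow above (`_generate_evaluation_set`) writes; the entrypoint name, IDs, inputs, and expected outputs below are placeholders rather than values from a real run. Node-level items carry the `nodeId` that the output evaluators later resolve against the trace via `extract_node_output_from_trace`.

# Illustrative shape only; field names follow _generate_evaluation_set, values are placeholders.
example_eval_set = {
    "id": "<uuid>",                      # uuid4 in the real generator
    "name": "Evaluation set generated from main.py",
    "version": "1.0",
    "evaluatorRefs": ["<evaluator-uuid>"],
    "evaluations": [
        {
            "id": "<uuid>",
            "name": "Node: summarize",   # one item per workflow node found in the collected spans
            "inputs": {"topic": "weather"},   # agent input, not node-specific input
            "evaluationCriterias": {
                "<evaluator-uuid>": {"expected_output": {"summary": "..."}},
            },
            "expectedAgentBehavior": "The agent should execute node 'summarize' and produce the expected output during the workflow execution.",
            "nodeId": "summarize",       # matched against span name / langgraph.node attribute
        },
        {
            "id": "<uuid>",
            "name": "Final Output",
            "inputs": {"topic": "weather"},
            "evaluationCriterias": {
                "<evaluator-uuid>": {"expected_output": {"report": "..."}},
            },
            "expectedAgentBehavior": "Agent should produce the expected output for the given input",
        },
    ],
}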
diff --git a/src/uipath/eval/evaluators/base_evaluator.py b/src/uipath/eval/evaluators/base_evaluator.py index 5a7e4615b..7f8333e15 100644 --- a/src/uipath/eval/evaluators/base_evaluator.py +++ b/src/uipath/eval/evaluators/base_evaluator.py @@ -3,7 +3,7 @@ import json import warnings from abc import ABC, abstractmethod -from typing import Any, Generic, TypeVar, Union, cast, get_args +from typing import Any, Generic, Optional, TypeVar, Union, cast, get_args from pydantic import BaseModel, ConfigDict, Field, model_validator from pydantic.alias_generators import to_camel @@ -17,7 +17,10 @@ class BaseEvaluationCriteria(BaseModel): """Base class for all evaluation criteria.""" model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True) - pass + + node_id: Optional[str] = Field( + default=None, alias="nodeId" + ) # Optional node identifier for node-level evaluations # Type variable for evaluation criteria, used by both Config and Evaluator diff --git a/src/uipath/eval/evaluators/contains_evaluator.py b/src/uipath/eval/evaluators/contains_evaluator.py index 964c9a709..39f55e3be 100644 --- a/src/uipath/eval/evaluators/contains_evaluator.py +++ b/src/uipath/eval/evaluators/contains_evaluator.py @@ -1,5 +1,7 @@ """Contains evaluator for agent outputs.""" +import logging + from ..models import ( AgentExecution, EvaluationResult, @@ -12,6 +14,8 @@ OutputEvaluatorConfig, ) +logger = logging.getLogger(__name__) + class ContainsEvaluationCriteria(BaseEvaluationCriteria): """Evaluation criteria for the contains evaluator.""" @@ -58,19 +62,58 @@ async def evaluate( Returns: EvaluationResult: Boolean result indicating if output contains expected value (True/False) """ - actual_output = str(self._get_actual_output(agent_execution)) + actual_output = str(self._get_actual_output(agent_execution, evaluation_criteria)) expected_output = str(self._get_expected_output(evaluation_criteria)) + # Debug logging (before case conversion) + if logger.isEnabledFor(logging.DEBUG): + logger.debug("\n" + "="*80) + logger.debug("[DEBUG] ContainsEvaluator - Comparison:") + logger.debug("="*80) + logger.debug("[ACTUAL OUTPUT (original)]:\n%s", actual_output) + logger.debug("\n" + "-"*80) + logger.debug("[EXPECTED OUTPUT (original)]:\n%s", expected_output) + logger.debug("-"*80) + if not self.evaluator_config.case_sensitive: actual_output = actual_output.lower() expected_output = expected_output.lower() + if logger.isEnabledFor(logging.DEBUG): + logger.debug("[ACTUAL OUTPUT (lowercased)]:\n%s", actual_output) + logger.debug("\n" + "-"*80) + logger.debug("[EXPECTED OUTPUT (lowercased)]:\n%s", expected_output) + logger.debug("-"*80) is_contains = expected_output in actual_output + if logger.isEnabledFor(logging.DEBUG): + logger.debug("[CASE SENSITIVE]: %s", self.evaluator_config.case_sensitive) + logger.debug("[NEGATED]: %s", self.evaluator_config.negated) + logger.debug("[CONTAINS RESULT]: %s", is_contains) + if self.evaluator_config.negated: is_contains = not is_contains + if logger.isEnabledFor(logging.DEBUG): + logger.debug("[FINAL RESULT (after negation)]: %s", is_contains) + else: + if logger.isEnabledFor(logging.DEBUG): + logger.debug("[FINAL RESULT]: %s", is_contains) + + if logger.isEnabledFor(logging.DEBUG): + logger.debug("="*80 + "\n") + + # Create details with comparison information + details = { + "actual_output": str(self._get_actual_output(agent_execution, evaluation_criteria)), + "search_text": str(self._get_expected_output(evaluation_criteria)), + "case_sensitive": 
self.evaluator_config.case_sensitive, + "negated": self.evaluator_config.negated, + "contains": is_contains, + } + return NumericEvaluationResult( score=float(is_contains), + details=details, ) def _get_expected_output( diff --git a/src/uipath/eval/evaluators/exact_match_evaluator.py b/src/uipath/eval/evaluators/exact_match_evaluator.py index 0ff8ebd2c..b3cc2f2ab 100644 --- a/src/uipath/eval/evaluators/exact_match_evaluator.py +++ b/src/uipath/eval/evaluators/exact_match_evaluator.py @@ -53,7 +53,7 @@ async def evaluate( Returns: EvaluationResult: Boolean result indicating exact match (True/False) """ - actual_output = str(self._get_actual_output(agent_execution)) + actual_output = str(self._get_actual_output(agent_execution, evaluation_criteria)) expected_output = str(self._get_expected_output(evaluation_criteria)) if not self.evaluator_config.case_sensitive: actual_output = actual_output.lower() @@ -63,6 +63,16 @@ async def evaluate( if self.evaluator_config.negated: is_exact_match = not is_exact_match + # Create details with comparison information + details = { + "actual_output": str(self._get_actual_output(agent_execution, evaluation_criteria)), + "expected_output": str(self._get_expected_output(evaluation_criteria)), + "case_sensitive": self.evaluator_config.case_sensitive, + "negated": self.evaluator_config.negated, + "match": is_exact_match, + } + return NumericEvaluationResult( score=float(is_exact_match), + details=details, ) diff --git a/src/uipath/eval/evaluators/json_similarity_evaluator.py b/src/uipath/eval/evaluators/json_similarity_evaluator.py index 1e90c171c..10a0ae590 100644 --- a/src/uipath/eval/evaluators/json_similarity_evaluator.py +++ b/src/uipath/eval/evaluators/json_similarity_evaluator.py @@ -56,14 +56,23 @@ async def evaluate( Returns: EvaluationResult: Numerical score between 0-100 indicating similarity """ - score, justification = self._compare_json( - self._get_expected_output(evaluation_criteria), - self._get_actual_output(agent_execution), - ) + expected_output = self._get_expected_output(evaluation_criteria) + actual_output = self._get_actual_output(agent_execution, evaluation_criteria) + + score, justification = self._compare_json(expected_output, actual_output) validated_justification = self.validate_justification(justification) + + # Create details with comparison information + details = { + "actual_output": str(actual_output), + "expected_output": str(expected_output), + "similarity_details": validated_justification, + "score": score, + } + return NumericEvaluationResult( score=score, - details=validated_justification, + details=details, ) def _compare_json(self, expected: Any, actual: Any) -> tuple[float, str]: diff --git a/src/uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py b/src/uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py index c55296583..814b78971 100644 --- a/src/uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py +++ b/src/uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py @@ -71,9 +71,17 @@ async def evaluate( llm_response = await self._get_llm_response(evaluation_prompt) + # Create details with comparison information + details = { + "actual_output": str(agent_execution.agent_output), + "expected_output": str(evaluation_criteria), + "llm_justification": llm_response.justification, + "llm_score": llm_response.score, + } + return NumericEvaluationResult( score=llm_response.score, - details=llm_response.justification, + details=details, ) def _create_evaluation_prompt( diff --git 
a/src/uipath/eval/evaluators/llm_as_judge_evaluator.py b/src/uipath/eval/evaluators/llm_as_judge_evaluator.py index 71a543ab1..128026804 100644 --- a/src/uipath/eval/evaluators/llm_as_judge_evaluator.py +++ b/src/uipath/eval/evaluators/llm_as_judge_evaluator.py @@ -1,12 +1,15 @@ """LLM-as-a-judge evaluator for subjective quality assessment of agent outputs.""" import json +import logging from abc import abstractmethod from collections.abc import Callable from typing import Any, TypeVar from pydantic import BaseModel, Field, model_validator +logger = logging.getLogger(__name__) + from .._helpers.evaluators_helpers import COMMUNITY_agents_SUFFIX from ..models import ( AgentExecution, @@ -92,7 +95,9 @@ def _get_llm_service(self): ) from e @abstractmethod - def _get_actual_output(self, agent_execution: AgentExecution) -> Any: + def _get_actual_output( + self, agent_execution: AgentExecution, evaluation_criteria: T + ) -> Any: """Get the actual output from the agent execution. Must be implemented by concrete evaluator classes.""" pass @@ -107,6 +112,9 @@ async def evaluate( evaluation_criteria: T, ) -> EvaluationResult: """Evaluate using an LLM as a judge.""" + actual_output = str(self._get_actual_output(agent_execution, evaluation_criteria)) + expected_output = str(self._get_expected_output(evaluation_criteria)) + evaluation_prompt = self._create_evaluation_prompt( agent_execution=agent_execution, evaluation_criteria=evaluation_criteria, @@ -117,9 +125,17 @@ async def evaluate( llm_response.justification ) + # Create detailed response with comparison info and LLM justification + details = { + "actual_output": actual_output, + "expected_output": expected_output, + "llm_justification": validated_justification, + "llm_score": llm_response.score, + } + return NumericEvaluationResult( score=max(0.0, min(1.0, round(llm_response.score / 100.0, 2))), - details=validated_justification, + details=details, ) def _create_evaluation_prompt( @@ -128,13 +144,26 @@ def _create_evaluation_prompt( evaluation_criteria: T, ) -> str: """Create the evaluation prompt for the LLM.""" + actual_output = str(self._get_actual_output(agent_execution, evaluation_criteria)) + expected_output = str(self._get_expected_output(evaluation_criteria)) + + # Debug logging + if logger.isEnabledFor(logging.DEBUG): + logger.debug("\n" + "="*80) + logger.debug("[DEBUG] LLMJudgeOutputEvaluator - Comparison:") + logger.debug("="*80) + logger.debug("[ACTUAL OUTPUT]:\n%s", actual_output) + logger.debug("\n" + "-"*80) + logger.debug("[EXPECTED OUTPUT]:\n%s", expected_output) + logger.debug("="*80 + "\n") + formatted_prompt = self.evaluator_config.prompt.replace( self.actual_output_placeholder, - str(self._get_actual_output(agent_execution)), + actual_output, ) formatted_prompt = formatted_prompt.replace( self.expected_output_placeholder, - str(self._get_expected_output(evaluation_criteria)), + expected_output, ) return formatted_prompt @@ -147,22 +176,33 @@ async def _get_llm_response(self, evaluation_prompt: str) -> LLMResponse: model = model.replace(COMMUNITY_agents_SUFFIX, "") # Prepare the request + # For Anthropic models, explicitly request JSON in the user message + is_anthropic = model.startswith("anthropic.") + user_content = evaluation_prompt + if is_anthropic: + schema_json = json.dumps(self.output_schema.model_json_schema(), indent=2) + user_content = f"{evaluation_prompt}\n\nYou MUST respond with valid JSON matching this exact schema:\n{schema_json}\n\nProvide ONLY the JSON response, no other text." 
+ request_data = { "model": model, "messages": [ {"role": "system", "content": self.system_prompt}, - {"role": "user", "content": evaluation_prompt}, + {"role": "user", "content": user_content}, ], - "response_format": { + "max_tokens": self.evaluator_config.max_tokens, + "temperature": self.evaluator_config.temperature, + } + + # Only add response_format for non-Anthropic models + # Anthropic models don't support json_schema response_format via Normalized API + if not is_anthropic: + request_data["response_format"] = { "type": "json_schema", "json_schema": { "name": "evaluation_response", "schema": self.output_schema.model_json_schema(), }, - }, - "max_tokens": self.evaluator_config.max_tokens, - "temperature": self.evaluator_config.temperature, - } + } if self.llm_service is None: raise UiPathEvaluationError( @@ -191,12 +231,26 @@ async def _get_llm_response(self, evaluation_prompt: str) -> LLMResponse: detail="The LLM response message content was None.", category=UiPathEvaluationErrorCategory.SYSTEM, ) + if not content or not str(content).strip(): + raise UiPathEvaluationError( + code="EMPTY_LLM_RESPONSE", + title="Empty LLM response", + detail=f"The LLM response message content was empty. Content: '{content}'", + category=UiPathEvaluationErrorCategory.SYSTEM, + ) parsed_response = json.loads(str(content)) + except json.JSONDecodeError as e: + raise UiPathEvaluationError( + code="FAILED_TO_PARSE_LLM_RESPONSE", + title="Failed to parse LLM response", + detail=f"Error: {e}\nContent received: '{content}'\nContent type: {type(content)}", + category=UiPathEvaluationErrorCategory.SYSTEM, + ) from e except Exception as e: raise UiPathEvaluationError( code="FAILED_TO_PARSE_LLM_RESPONSE", title="Failed to parse LLM response", - detail=f"Error: {e}", + detail=f"Error: {e}\nContent received: '{content}'", category=UiPathEvaluationErrorCategory.SYSTEM, ) from e return LLMResponse(**parsed_response) diff --git a/src/uipath/eval/evaluators/llm_judge_trajectory_evaluator.py b/src/uipath/eval/evaluators/llm_judge_trajectory_evaluator.py index eac5c11b1..5972bc9f5 100644 --- a/src/uipath/eval/evaluators/llm_judge_trajectory_evaluator.py +++ b/src/uipath/eval/evaluators/llm_judge_trajectory_evaluator.py @@ -76,7 +76,9 @@ async def evaluate( """Evaluate using trajectory analysis.""" return await super().evaluate(agent_execution, evaluation_criteria) - def _get_actual_output(self, agent_execution: AgentExecution) -> Any: + def _get_actual_output( + self, agent_execution: AgentExecution, evaluation_criteria: TrajectoryEvaluationCriteria + ) -> Any: """Get the actual output from the agent execution.""" return trace_to_str(agent_execution.agent_trace) diff --git a/src/uipath/eval/evaluators/output_evaluator.py b/src/uipath/eval/evaluators/output_evaluator.py index 2aa362e18..987e5f40f 100644 --- a/src/uipath/eval/evaluators/output_evaluator.py +++ b/src/uipath/eval/evaluators/output_evaluator.py @@ -1,7 +1,7 @@ """Base class for all output evaluator configurations.""" import json -from typing import Any, TypeVar, Union +from typing import Any, Optional, TypeVar, Union from pydantic import Field @@ -13,12 +13,14 @@ BaseEvaluatorConfig, BaseEvaluatorJustification, ) +from .._helpers.evaluators_helpers import extract_node_output_from_trace class OutputEvaluationCriteria(BaseEvaluationCriteria): """Base class for all output evaluation criteria.""" expected_output: dict[str, Any] | str + # node_id inherited from BaseEvaluationCriteria T = TypeVar("T", bound=BaseEvaluationCriteria) @@ -50,8 +52,31 @@ class 
BaseOutputEvaluator(BaseEvaluator[T, C, J]): J: The justification type """ - def _get_actual_output(self, agent_execution: AgentExecution) -> Any: - """Get the actual output from the agent execution.""" + def _get_actual_output( + self, agent_execution: AgentExecution, evaluation_criteria: T + ) -> Any: + """Get the actual output from the agent execution. + + If the evaluation criteria contains a node_id, extract the node output from the trace. + Otherwise, return the agent's final output. + """ + # Check if this is a node-level evaluation + node_id = getattr(evaluation_criteria, "node_id", None) + if node_id: + # Extract node output from trace + node_output = extract_node_output_from_trace( + agent_execution.agent_trace, node_id + ) + if node_output is None: + raise UiPathEvaluationError( + code="NODE_OUTPUT_NOT_FOUND", + title=f"Node output not found for node_id: {node_id}", + detail=f"Could not find output for node '{node_id}' in agent trace", + category=UiPathEvaluationErrorCategory.USER, + ) + return node_output + + # Standard agent output extraction if self.evaluator_config.target_output_key != "*": try: return agent_execution.agent_output[ diff --git a/src/uipath/eval/evaluators/tool_call_args_evaluator.py b/src/uipath/eval/evaluators/tool_call_args_evaluator.py index 2703e3c76..880f93aba 100644 --- a/src/uipath/eval/evaluators/tool_call_args_evaluator.py +++ b/src/uipath/eval/evaluators/tool_call_args_evaluator.py @@ -76,7 +76,17 @@ async def evaluate( self.evaluator_config.subset, ) validated_justification = self.validate_justification(justification) + + # Create details with comparison information + details = { + "actual_tool_calls": [{"name": tc.name, "args": tc.args} for tc in tool_calls_order], + "expected_tool_calls": [{"name": tc.name, "args": tc.args} for tc in evaluation_criteria.tool_calls], + "strict_mode": self.evaluator_config.strict, + "subset_mode": self.evaluator_config.subset, + "explanation": validated_justification, + } + return NumericEvaluationResult( score=score, - details=validated_justification, + details=details, ) diff --git a/src/uipath/eval/evaluators/tool_call_count_evaluator.py b/src/uipath/eval/evaluators/tool_call_count_evaluator.py index 11d684ae1..3079ff198 100644 --- a/src/uipath/eval/evaluators/tool_call_count_evaluator.py +++ b/src/uipath/eval/evaluators/tool_call_count_evaluator.py @@ -81,7 +81,16 @@ async def evaluate( self.evaluator_config.strict, ) validated_justification = self.validate_justification(justification) + + # Create details with comparison information + details = { + "actual_tool_calls": dict(tool_calls_count), + "expected_tool_calls": evaluation_criteria.tool_calls_count, + "strict_mode": self.evaluator_config.strict, + "explanation": validated_justification, + } + return NumericEvaluationResult( score=score, - details=validated_justification, + details=details, ) diff --git a/src/uipath/eval/models/models.py b/src/uipath/eval/models/models.py index f3e9e3ca9..5ed4805ea 100644 --- a/src/uipath/eval/models/models.py +++ b/src/uipath/eval/models/models.py @@ -39,7 +39,7 @@ class ScoreType(IntEnum): class BaseEvaluationResult(BaseModel): """Base class for evaluation results.""" - details: Optional[str | BaseModel] = None + details: Optional[str | Dict[str, Any] | BaseModel] = None # this is marked as optional, as it is populated inside the 'measure_execution_time' decorator evaluation_time: Optional[float] = None
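As a reference for reviewers, a self-contained sketch of the weighted-final-score arithmetic this patch adds in _runtime.py and mirrors in the console reporter; the function and variable names below are local to the example, not part of the patch. Evaluators without a weight are skipped and the weights are not re-normalized (the reporter only prints a note when they do not sum to 1.0).

from typing import Dict, Optional


def weighted_final_score(
    evaluator_averages: Dict[str, float],
    evaluator_weights: Optional[Dict[str, float]],
) -> Optional[float]:
    """Sum weight * average for every evaluator that has a weight; None if no weights."""
    if not evaluator_weights:
        return None
    total = 0.0
    for evaluator_id, avg_score in evaluator_averages.items():
        weight = evaluator_weights.get(evaluator_id)
        if weight is not None:
            total += weight * avg_score
    return total


# Example: averages 0.8 and 0.6 with weights 0.7 and 0.3 give 0.56 + 0.18 = 0.74
print(weighted_final_score({"acc": 0.8, "style": 0.6}, {"acc": 0.7, "style": 0.3}))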