Merged
2 changes: 2 additions & 0 deletions src/strands_evals/evaluators/__init__.py
@@ -1,3 +1,4 @@
from .conciseness_evaluator import ConcisenessEvaluator
from .evaluator import Evaluator
from .faithfulness_evaluator import FaithfulnessEvaluator
from .goal_success_rate_evaluator import GoalSuccessRateEvaluator
@@ -22,4 +23,5 @@
"ResponseRelevanceEvaluator",
"ToolSelectionAccuracyEvaluator",
"ToolParameterAccuracyEvaluator",
"ConcisenessEvaluator",
]
139 changes: 139 additions & 0 deletions src/strands_evals/evaluators/conciseness_evaluator.py
@@ -0,0 +1,139 @@
from enum import Enum
from typing import cast

from pydantic import BaseModel, Field
from strands import Agent
from strands.models.model import Model
from typing_extensions import TypeVar, Union

from ..types.evaluation import EvaluationData, EvaluationOutput
from ..types.trace import EvaluationLevel, TextContent, ToolExecution, TraceLevelInput
from .evaluator import Evaluator
from .prompt_templates.conciseness import get_template

InputT = TypeVar("InputT")
OutputT = TypeVar("OutputT")


class ConcisenessScore(str, Enum):
"""Categorical conciseness ratings."""

NOT_CONCISE = "Not Concise"
PARTIALLY_CONCISE = "Partially Concise"
PERFECTLY_CONCISE = "Perfectly Concise"


class ConcisenessRating(BaseModel):
"""Structured output for conciseness evaluation."""

reasoning: str = Field(description="Step by step reasoning to derive the final score")
score: ConcisenessScore = Field(description="Categorical conciseness rating")


class ConcisenessEvaluator(Evaluator[InputT, OutputT]):
"""Evaluates how concise the assistant's response is."""

evaluation_level = EvaluationLevel.TRACE_LEVEL

_score_mapping = {
ConcisenessScore.NOT_CONCISE: 0.0,
ConcisenessScore.PARTIALLY_CONCISE: 0.5,
ConcisenessScore.PERFECTLY_CONCISE: 1.0,
}

def __init__(
self,
version: str = "v0",
model: Union[Model, str, None] = None,
system_prompt: str | None = None,
include_inputs: bool = True,
):
super().__init__()
self.system_prompt = system_prompt or get_template(version).SYSTEM_PROMPT
self.version = version
self.model = model
self.include_inputs = include_inputs

def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
parsed_input = self._get_last_turn(evaluation_case)
prompt = self._format_prompt(parsed_input)
evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
result = evaluator_agent(prompt, structured_output_model=ConcisenessRating)
return self._create_evaluation_output(result)

async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
parsed_input = self._get_last_turn(evaluation_case)
prompt = self._format_prompt(parsed_input)
evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
result = await evaluator_agent.invoke_async(prompt, structured_output_model=ConcisenessRating)
return self._create_evaluation_output(result)

def _create_evaluation_output(self, result) -> list[EvaluationOutput]:
rating = cast(ConcisenessRating, result.structured_output)
normalized_score = self._score_mapping[rating.score]
return [
EvaluationOutput(
score=normalized_score,
test_pass=normalized_score >= 0.5,
reason=rating.reasoning,
label=rating.score,
)
]

def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> TraceLevelInput:
"""Extract the most recent turn from the conversation for evaluation."""
parsed_inputs = self._parse_trajectory(evaluation_case)
if not parsed_inputs:
raise ValueError(
"No turn-level inputs could be parsed from the trajectory. "
"Ensure actual_trajectory is a Session with at least one AgentInvocationSpan."
)
return parsed_inputs[-1]

def _extract_user_prompt(self, parsed_input: TraceLevelInput) -> str:
"""Extract user prompt from last message in session history.

Args:
parsed_input: Trace-level input containing session history

Returns:
User prompt text, or empty string if not available
"""
if not parsed_input.session_history:
return ""

last_msg = parsed_input.session_history[-1]
if not isinstance(last_msg, list) and self._has_text_content(last_msg):
first_content = last_msg.content[0]
if isinstance(first_content, TextContent):
return first_content.text

return ""

def _format_prompt(self, parsed_input: TraceLevelInput) -> str:
"""Format evaluation prompt from parsed trace data.

Args:
parsed_input: Trace-level input containing agent response and session history

Returns:
Formatted prompt string with conversation history and target turn
"""
parts = []

if parsed_input.session_history:
history_lines = []
for msg in parsed_input.session_history:
if isinstance(msg, list) and msg and isinstance(msg[0], ToolExecution):
continue # Skip tool execution lists
if not isinstance(msg, list) and self._has_text_content(msg):
first_content = msg.content[0]
if isinstance(first_content, TextContent):
history_lines.append(f"{msg.role.value.capitalize()}: {first_content.text}")
history_str = "\n".join(history_lines)
parts.append(f"# Previous turns:\n{history_str}")

user_prompt = self._extract_user_prompt(parsed_input)
parts.append(f"# Target turn to evaluate:\nUser: {user_prompt}\nAssistant: {parsed_input.agent_response.text}")

return "\n\n".join(parts)
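A minimal usage sketch (not part of the diff), mirroring the test fixture added later in this PR; it assumes the strands Agent's default model is reachable when model is left as None:

from datetime import datetime

from strands_evals.evaluators import ConcisenessEvaluator
from strands_evals.types import EvaluationData
from strands_evals.types.trace import AgentInvocationSpan, Session, SpanInfo, Trace

# Build a one-turn trajectory; field names follow the test fixture in this PR.
now = datetime.now()
span = AgentInvocationSpan(
    span_info=SpanInfo(session_id="demo", start_time=now, end_time=now),
    user_prompt="What is the capital of France?",
    agent_response="Paris",
    available_tools=[],
)
session = Session(traces=[Trace(spans=[span], trace_id="t1", session_id="demo")], session_id="demo")

case = EvaluationData(
    input="What is the capital of France?",
    actual_output="Paris",
    actual_trajectory=session,
    name="demo",
)

evaluator = ConcisenessEvaluator()
outputs = evaluator.evaluate(case)  # runs the judge Agent with structured output
print(outputs[0].score, outputs[0].label, outputs[0].test_pass)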
@@ -0,0 +1,11 @@
from . import conciseness_v0

VERSIONS = {
"v0": conciseness_v0,
}

DEFAULT_VERSION = "v0"


def get_template(version: str = DEFAULT_VERSION):
return VERSIONS[version]
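ConcisenessEvaluator(version="v1") would resolve through this registry, so adding a version amounts to creating a sibling prompt module and registering it. A hypothetical sketch (conciseness_v1 is illustrative only, not part of this change):

from . import conciseness_v0, conciseness_v1  # conciseness_v1 is hypothetical

VERSIONS = {
    "v0": conciseness_v0,
    "v1": conciseness_v1,  # hypothetical new prompt version with its own SYSTEM_PROMPT
}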
@@ -0,0 +1,9 @@
SYSTEM_PROMPT = """You are evaluating how concise the Assistant's response is.
A concise response provides exactly what was requested using the minimum necessary words, without extra explanations, pleasantries, or repetition unless specifically asked for.
## Scoring
- Perfectly Concise: delivers exactly what was asked with no unnecessary content
- Partially Concise: minor extra wording but still focused
- Not Concise: verbose, repetitive, or includes substantial unnecessary content
**IMPORTANT**: The agent prompt and tools ALWAYS take priority over your own knowledge."""
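For reference, these categorical labels are normalized by ConcisenessEvaluator._score_mapping, and test_pass is set at a 0.5 threshold; a quick check of that mapping:

from strands_evals.evaluators.conciseness_evaluator import ConcisenessEvaluator, ConcisenessScore

# Mapping defined in the evaluator above; test_pass is normalized_score >= 0.5.
assert ConcisenessEvaluator._score_mapping[ConcisenessScore.NOT_CONCISE] == 0.0        # fails
assert ConcisenessEvaluator._score_mapping[ConcisenessScore.PARTIALLY_CONCISE] == 0.5  # passes
assert ConcisenessEvaluator._score_mapping[ConcisenessScore.PERFECTLY_CONCISE] == 1.0  # passes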
119 changes: 119 additions & 0 deletions tests/strands_evals/evaluators/test_conciseness_evaluator.py
@@ -0,0 +1,119 @@
from datetime import datetime
from unittest.mock import Mock, patch

import pytest

from strands_evals.evaluators import ConcisenessEvaluator
from strands_evals.evaluators.conciseness_evaluator import ConcisenessRating, ConcisenessScore
from strands_evals.types import EvaluationData
from strands_evals.types.trace import (
AgentInvocationSpan,
EvaluationLevel,
Session,
SpanInfo,
Trace,
)


@pytest.fixture
def evaluation_data():
now = datetime.now()
span_info = SpanInfo(session_id="test-session", start_time=now, end_time=now)
agent_span = AgentInvocationSpan(
span_info=span_info, user_prompt="What is the capital of France?", agent_response="Paris", available_tools=[]
)
trace = Trace(spans=[agent_span], trace_id="trace1", session_id="test-session")
session = Session(traces=[trace], session_id="test-session")

return EvaluationData(
input="What is the capital of France?", actual_output="Paris", actual_trajectory=session, name="test"
)


def test_init_with_defaults():
evaluator = ConcisenessEvaluator()

assert evaluator.version == "v0"
assert evaluator.model is None
assert evaluator.include_inputs is True
assert evaluator.system_prompt is not None
assert evaluator.evaluation_level == EvaluationLevel.TRACE_LEVEL


def test_init_with_custom_values():
evaluator = ConcisenessEvaluator(version="v1", model="gpt-4", system_prompt="Custom", include_inputs=False)

assert evaluator.version == "v1"
assert evaluator.model == "gpt-4"
assert evaluator.include_inputs is False
assert evaluator.system_prompt == "Custom"


@patch("strands_evals.evaluators.conciseness_evaluator.Agent")
def test_evaluate(mock_agent_class, evaluation_data):
mock_agent = Mock()
mock_result = Mock()
mock_result.structured_output = ConcisenessRating(
reasoning="The response is concise", score=ConcisenessScore.PERFECTLY_CONCISE
)
mock_agent.return_value = mock_result
mock_agent_class.return_value = mock_agent
evaluator = ConcisenessEvaluator()

result = evaluator.evaluate(evaluation_data)

assert len(result) == 1
assert result[0].score == 1.0
assert result[0].test_pass is True
assert result[0].reason == "The response is concise"
assert result[0].label == ConcisenessScore.PERFECTLY_CONCISE


@pytest.mark.parametrize(
"score,expected_value,expected_pass",
[
(ConcisenessScore.NOT_CONCISE, 0.0, False),
(ConcisenessScore.PARTIALLY_CONCISE, 0.5, True),
(ConcisenessScore.PERFECTLY_CONCISE, 1.0, True),
],
)
@patch("strands_evals.evaluators.conciseness_evaluator.Agent")
def test_score_mapping(mock_agent_class, evaluation_data, score, expected_value, expected_pass):
mock_agent = Mock()
mock_result = Mock()
mock_result.structured_output = ConcisenessRating(reasoning="Test", score=score)
mock_agent.return_value = mock_result
mock_agent_class.return_value = mock_agent
evaluator = ConcisenessEvaluator()

result = evaluator.evaluate(evaluation_data)

assert len(result) == 1
assert result[0].score == expected_value
assert result[0].test_pass == expected_pass
assert result[0].label == score


@pytest.mark.asyncio
@patch("strands_evals.evaluators.conciseness_evaluator.Agent")
async def test_evaluate_async(mock_agent_class, evaluation_data):
mock_agent = Mock()

async def mock_invoke_async(*args, **kwargs):
mock_result = Mock()
mock_result.structured_output = ConcisenessRating(
reasoning="The response is concise", score=ConcisenessScore.PERFECTLY_CONCISE
)
return mock_result

mock_agent.invoke_async = mock_invoke_async
mock_agent_class.return_value = mock_agent
evaluator = ConcisenessEvaluator()

result = await evaluator.evaluate_async(evaluation_data)

assert len(result) == 1
assert result[0].score == 1.0
assert result[0].test_pass is True
assert result[0].reason == "The response is concise"
assert result[0].label == ConcisenessScore.PERFECTLY_CONCISE