From a2b286b8cc8178063a96dd22ab3b30c975b84809 Mon Sep 17 00:00:00 2001
From: poshinchen
Date: Tue, 3 Feb 2026 15:59:51 -0500
Subject: [PATCH] feat: add ConcisenessEvaluator

---
 src/strands_evals/evaluators/__init__.py      |   2 +
 .../evaluators/conciseness_evaluator.py       | 139 ++++++++++++++++++
 .../prompt_templates/conciseness/__init__.py  |  11 ++
 .../conciseness/conciseness_v0.py             |   9 ++
 .../evaluators/test_conciseness_evaluator.py  | 119 +++++++++++++++
 5 files changed, 280 insertions(+)
 create mode 100644 src/strands_evals/evaluators/conciseness_evaluator.py
 create mode 100644 src/strands_evals/evaluators/prompt_templates/conciseness/__init__.py
 create mode 100644 src/strands_evals/evaluators/prompt_templates/conciseness/conciseness_v0.py
 create mode 100644 tests/strands_evals/evaluators/test_conciseness_evaluator.py

diff --git a/src/strands_evals/evaluators/__init__.py b/src/strands_evals/evaluators/__init__.py
index 677f138..a355b6a 100644
--- a/src/strands_evals/evaluators/__init__.py
+++ b/src/strands_evals/evaluators/__init__.py
@@ -1,3 +1,4 @@
+from .conciseness_evaluator import ConcisenessEvaluator
 from .evaluator import Evaluator
 from .faithfulness_evaluator import FaithfulnessEvaluator
 from .goal_success_rate_evaluator import GoalSuccessRateEvaluator
@@ -22,4 +23,5 @@
     "ResponseRelevanceEvaluator",
     "ToolSelectionAccuracyEvaluator",
     "ToolParameterAccuracyEvaluator",
+    "ConcisenessEvaluator",
 ]
diff --git a/src/strands_evals/evaluators/conciseness_evaluator.py b/src/strands_evals/evaluators/conciseness_evaluator.py
new file mode 100644
index 0000000..224c39b
--- /dev/null
+++ b/src/strands_evals/evaluators/conciseness_evaluator.py
@@ -0,0 +1,139 @@
+from enum import Enum
+from typing import cast
+
+from pydantic import BaseModel, Field
+from strands import Agent
+from strands.models.model import Model
+from typing_extensions import TypeVar, Union
+
+from ..types.evaluation import EvaluationData, EvaluationOutput
+from ..types.trace import EvaluationLevel, TextContent, ToolExecution, TraceLevelInput
+from .evaluator import Evaluator
+from .prompt_templates.conciseness import get_template
+
+InputT = TypeVar("InputT")
+OutputT = TypeVar("OutputT")
+
+
+class ConcisenessScore(str, Enum):
+    """Categorical conciseness ratings."""
+
+    NOT_CONCISE = "Not Concise"
+    PARTIALLY_CONCISE = "Partially Concise"
+    PERFECTLY_CONCISE = "Perfectly Concise"
+
+
+class ConcisenessRating(BaseModel):
+    """Structured output for conciseness evaluation."""
+
+    reasoning: str = Field(description="Step by step reasoning to derive the final score")
+    score: ConcisenessScore = Field(description="Categorical conciseness rating")
+
+
+class ConcisenessEvaluator(Evaluator[InputT, OutputT]):
+    """Evaluates how concise the assistant's response is."""
+
+    evaluation_level = EvaluationLevel.TRACE_LEVEL
+
+    _score_mapping = {
+        ConcisenessScore.NOT_CONCISE: 0.0,
+        ConcisenessScore.PARTIALLY_CONCISE: 0.5,
+        ConcisenessScore.PERFECTLY_CONCISE: 1.0,
+    }
+
+    def __init__(
+        self,
+        version: str = "v0",
+        model: Union[Model, str, None] = None,
+        system_prompt: str | None = None,
+        include_inputs: bool = True,
+    ):
+        super().__init__()
+        self.system_prompt = system_prompt or get_template(version).SYSTEM_PROMPT
+        self.version = version
+        self.model = model
+        self.include_inputs = include_inputs
+
+    def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
+        parsed_input = self._get_last_turn(evaluation_case)
+        prompt = self._format_prompt(parsed_input)
+        evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
+        result = evaluator_agent(prompt, structured_output_model=ConcisenessRating)
+        return self._create_evaluation_output(result)
+
+    async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
+        parsed_input = self._get_last_turn(evaluation_case)
+        prompt = self._format_prompt(parsed_input)
+        evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
+        result = await evaluator_agent.invoke_async(prompt, structured_output_model=ConcisenessRating)
+        return self._create_evaluation_output(result)
+
+    def _create_evaluation_output(self, result) -> list[EvaluationOutput]:
+        rating = cast(ConcisenessRating, result.structured_output)
+        normalized_score = self._score_mapping[rating.score]
+        return [
+            EvaluationOutput(
+                score=normalized_score,
+                test_pass=normalized_score >= 0.5,
+                reason=rating.reasoning,
+                label=rating.score,
+            )
+        ]
+
+    def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> TraceLevelInput:
+        """Extract the most recent turn from the conversation for evaluation."""
+        parsed_inputs = self._parse_trajectory(evaluation_case)
+        if not parsed_inputs:
+            raise ValueError(
+                "No turn-level inputs could be parsed from the trajectory. "
+                "Ensure actual_trajectory is a Session with at least one AgentInvocationSpan."
+            )
+        return parsed_inputs[-1]
+
+    def _extract_user_prompt(self, parsed_input: TraceLevelInput) -> str:
+        """Extract user prompt from last message in session history.
+
+        Args:
+            parsed_input: Trace-level input containing session history
+
+        Returns:
+            User prompt text, or empty string if not available
+        """
+        if not parsed_input.session_history:
+            return ""
+
+        last_msg = parsed_input.session_history[-1]
+        if not isinstance(last_msg, list) and self._has_text_content(last_msg):
+            first_content = last_msg.content[0]
+            if isinstance(first_content, TextContent):
+                return first_content.text
+
+        return ""
+
+    def _format_prompt(self, parsed_input: TraceLevelInput) -> str:
+        """Format evaluation prompt from parsed trace data.
+
+        Args:
+            parsed_input: Trace-level input containing agent response and session history
+
+        Returns:
+            Formatted prompt string with conversation history and target turn
+        """
+        parts = []
+
+        if parsed_input.session_history:
+            history_lines = []
+            for msg in parsed_input.session_history:
+                if isinstance(msg, list) and msg and isinstance(msg[0], ToolExecution):
+                    continue  # Skip tool execution lists
+                if not isinstance(msg, list) and self._has_text_content(msg):
+                    first_content = msg.content[0]
+                    if isinstance(first_content, TextContent):
+                        history_lines.append(f"{msg.role.value.capitalize()}: {first_content.text}")
+            history_str = "\n".join(history_lines)
+            parts.append(f"# Previous turns:\n{history_str}")
+
+        user_prompt = self._extract_user_prompt(parsed_input)
+        parts.append(f"# Target turn to evaluate:\nUser: {user_prompt}\nAssistant: {parsed_input.agent_response.text}")
+
+        return "\n\n".join(parts)
diff --git a/src/strands_evals/evaluators/prompt_templates/conciseness/__init__.py b/src/strands_evals/evaluators/prompt_templates/conciseness/__init__.py
new file mode 100644
index 0000000..7246b29
--- /dev/null
+++ b/src/strands_evals/evaluators/prompt_templates/conciseness/__init__.py
@@ -0,0 +1,11 @@
+from . import conciseness_v0
+
+VERSIONS = {
+    "v0": conciseness_v0,
+}
+
+DEFAULT_VERSION = "v0"
+
+
+def get_template(version: str = DEFAULT_VERSION):
+    return VERSIONS[version]
diff --git a/src/strands_evals/evaluators/prompt_templates/conciseness/conciseness_v0.py b/src/strands_evals/evaluators/prompt_templates/conciseness/conciseness_v0.py
new file mode 100644
index 0000000..00c172d
--- /dev/null
+++ b/src/strands_evals/evaluators/prompt_templates/conciseness/conciseness_v0.py
@@ -0,0 +1,9 @@
+SYSTEM_PROMPT = """You are evaluating how concise the Assistant's response is.
+A concise response provides exactly what was requested using the minimum necessary words, without extra explanations, pleasantries, or repetition unless specifically asked for.
+
+## Scoring
+- Perfectly Concise: delivers exactly what was asked with no unnecessary content
+- Partially Concise: minor extra wording but still focused
+- Not Concise: verbose, repetitive, or includes substantial unnecessary content
+
+**IMPORTANT**: The agent prompt and tools ALWAYS take priority over your own knowledge."""
diff --git a/tests/strands_evals/evaluators/test_conciseness_evaluator.py b/tests/strands_evals/evaluators/test_conciseness_evaluator.py
new file mode 100644
index 0000000..c4e23c0
--- /dev/null
+++ b/tests/strands_evals/evaluators/test_conciseness_evaluator.py
@@ -0,0 +1,119 @@
+from datetime import datetime
+from unittest.mock import Mock, patch
+
+import pytest
+
+from strands_evals.evaluators import ConcisenessEvaluator
+from strands_evals.evaluators.conciseness_evaluator import ConcisenessRating, ConcisenessScore
+from strands_evals.types import EvaluationData
+from strands_evals.types.trace import (
+    AgentInvocationSpan,
+    EvaluationLevel,
+    Session,
+    SpanInfo,
+    Trace,
+)
+
+
+@pytest.fixture
+def evaluation_data():
+    now = datetime.now()
+    span_info = SpanInfo(session_id="test-session", start_time=now, end_time=now)
+    agent_span = AgentInvocationSpan(
+        span_info=span_info, user_prompt="What is the capital of France?", agent_response="Paris", available_tools=[]
+    )
+    trace = Trace(spans=[agent_span], trace_id="trace1", session_id="test-session")
+    session = Session(traces=[trace], session_id="test-session")
+
+    return EvaluationData(
+        input="What is the capital of France?", actual_output="Paris", actual_trajectory=session, name="test"
+    )
+
+
+def test_init_with_defaults():
+    evaluator = ConcisenessEvaluator()
+
+    assert evaluator.version == "v0"
+    assert evaluator.model is None
+    assert evaluator.include_inputs is True
+    assert evaluator.system_prompt is not None
+    assert evaluator.evaluation_level == EvaluationLevel.TRACE_LEVEL
+
+
+def test_init_with_custom_values():
+    evaluator = ConcisenessEvaluator(version="v1", model="gpt-4", system_prompt="Custom", include_inputs=False)
+
+    assert evaluator.version == "v1"
+    assert evaluator.model == "gpt-4"
+    assert evaluator.include_inputs is False
+    assert evaluator.system_prompt == "Custom"
+
+
+@patch("strands_evals.evaluators.conciseness_evaluator.Agent")
+def test_evaluate(mock_agent_class, evaluation_data):
+    mock_agent = Mock()
+    mock_result = Mock()
+    mock_result.structured_output = ConcisenessRating(
+        reasoning="The response is concise", score=ConcisenessScore.PERFECTLY_CONCISE
+    )
+    mock_agent.return_value = mock_result
+    mock_agent_class.return_value = mock_agent
+    evaluator = ConcisenessEvaluator()
+
+    result = evaluator.evaluate(evaluation_data)
+
+    assert len(result) == 1
+    assert result[0].score == 1.0
+    assert result[0].test_pass is True
+    assert result[0].reason == "The response is concise"
+    assert result[0].label == ConcisenessScore.PERFECTLY_CONCISE
+
+
+@pytest.mark.parametrize(
+    "score,expected_value,expected_pass",
+    [
+        (ConcisenessScore.NOT_CONCISE, 0.0, False),
+        (ConcisenessScore.PARTIALLY_CONCISE, 0.5, True),
+        (ConcisenessScore.PERFECTLY_CONCISE, 1.0, True),
+    ],
+)
+@patch("strands_evals.evaluators.conciseness_evaluator.Agent")
+def test_score_mapping(mock_agent_class, evaluation_data, score, expected_value, expected_pass):
+    mock_agent = Mock()
+    mock_result = Mock()
+    mock_result.structured_output = ConcisenessRating(reasoning="Test", score=score)
+    mock_agent.return_value = mock_result
+    mock_agent_class.return_value = mock_agent
+    evaluator = ConcisenessEvaluator()
+
+    result = evaluator.evaluate(evaluation_data)
+
+    assert len(result) == 1
+    assert result[0].score == expected_value
+    assert result[0].test_pass == expected_pass
+    assert result[0].label == score
+
+
+@pytest.mark.asyncio
+@patch("strands_evals.evaluators.conciseness_evaluator.Agent")
+async def test_evaluate_async(mock_agent_class, evaluation_data):
+    mock_agent = Mock()
+
+    async def mock_invoke_async(*args, **kwargs):
+        mock_result = Mock()
+        mock_result.structured_output = ConcisenessRating(
+            reasoning="The response is concise", score=ConcisenessScore.PERFECTLY_CONCISE
+        )
+        return mock_result
+
+    mock_agent.invoke_async = mock_invoke_async
+    mock_agent_class.return_value = mock_agent
+    evaluator = ConcisenessEvaluator()
+
+    result = await evaluator.evaluate_async(evaluation_data)
+
+    assert len(result) == 1
+    assert result[0].score == 1.0
+    assert result[0].test_pass is True
+    assert result[0].reason == "The response is concise"
+    assert result[0].label == ConcisenessScore.PERFECTLY_CONCISE
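Usage sketch (illustrative, not part of the patch): the snippet below shows one way the new ConcisenessEvaluator could be exercised end to end. It mirrors the shape of the test fixture above; the "demo-*" identifiers and the deliberately chatty agent_response are invented for illustration, and actually running it assumes the default judge model used by the underlying strands Agent is reachable with valid credentials.

from datetime import datetime

from strands_evals.evaluators import ConcisenessEvaluator
from strands_evals.types import EvaluationData
from strands_evals.types.trace import AgentInvocationSpan, Session, SpanInfo, Trace

# Build a minimal single-turn trajectory, following the shape of the test fixture.
now = datetime.now()
span_info = SpanInfo(session_id="demo-session", start_time=now, end_time=now)
agent_span = AgentInvocationSpan(
    span_info=span_info,
    user_prompt="What is the capital of France?",
    agent_response="Great question! The capital of France is Paris. Let me know if you need anything else!",
    available_tools=[],
)
trace = Trace(spans=[agent_span], trace_id="demo-trace", session_id="demo-session")
session = Session(traces=[trace], session_id="demo-session")

case = EvaluationData(
    input="What is the capital of France?",
    actual_output="Great question! The capital of France is Paris. Let me know if you need anything else!",
    actual_trajectory=session,
    name="conciseness-demo",
)

# Leaving model unset defers judge-model selection to the strands Agent defaults
# (an assumption about the runtime environment); a Model instance or model id
# string can be passed instead.
evaluator = ConcisenessEvaluator()
for output in evaluator.evaluate(case):
    print(output.label, output.score, output.test_pass)
    print(output.reason)

Because test_pass applies a >= 0.5 threshold to the mapped score, both Partially Concise (0.5) and Perfectly Concise (1.0) count as passing; only Not Concise (0.0) fails.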