40 changes: 40 additions & 0 deletions README.md
@@ -278,14 +278,54 @@ tool_parameter_evaluator = ToolParameterAccuracyEvaluator(
)
```

### RAG Evaluation with Contextual Faithfulness

Evaluate whether RAG (Retrieval-Augmented Generation) responses are grounded in the retrieved context:

```python
from strands_evals import Case, Experiment
from strands_evals.evaluators import ContextualFaithfulnessEvaluator

# Create test cases with retrieval context
test_cases = [
    Case(
        name="refund-policy",
        input="What is the refund policy?",
        retrieval_context=[
            "Refunds are available within 30 days of purchase.",
            "Items must be unopened and in original packaging for a full refund.",
            "Opened items may be eligible for store credit only."
        ]
    )
]

# Evaluator checks if response claims are supported by the context
evaluator = ContextualFaithfulnessEvaluator()

experiment = Experiment(cases=test_cases, evaluators=[evaluator])

def rag_pipeline(case: Case) -> str:
    # Your RAG implementation here
    # Returns the generated response
    return "You can get a full refund within 30 days if the item is unopened."

reports = experiment.run_evaluations(rag_pipeline)
reports[0].run_display()

# Scoring: Fully Faithful (1.0), Mostly Faithful (0.67),
# Partially Faithful (0.33), Not Faithful (0.0)
```
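
Under the hood, each categorical label is normalized to the numeric score shown above, and a case passes when its score is at least 0.67 (Mostly Faithful or better). The evaluator also takes a few optional constructor arguments; the sketch below simply spells out the defaults and is illustrative rather than prescriptive:

```python
from strands_evals.evaluators import ContextualFaithfulnessEvaluator

# All arguments are optional; the values shown are the defaults.
evaluator = ContextualFaithfulnessEvaluator(
    version="v0",         # prompt-template version
    model=None,           # Bedrock model-id string or a strands Model instance
    system_prompt=None,   # custom judge prompt; None uses the built-in template
    include_input=True,   # include the user query in the evaluation prompt
)
```

Note that the evaluator raises a `ValueError` if a case has no `retrieval_context` or if the task function returns no output.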

## Available Evaluators

### Core Evaluators
- **OutputEvaluator**: Flexible LLM-based evaluation with custom rubrics
- **TrajectoryEvaluator**: Action sequence evaluation with built-in scoring tools
- **HelpfulnessEvaluator**: Seven-level helpfulness assessment from the user's perspective
- **FaithfulnessEvaluator**: Evaluates whether responses are grounded in the conversation history
- **ContextualFaithfulnessEvaluator**: Evaluates whether RAG responses are grounded in the retrieval context (detects hallucinations)
- **GoalSuccessRateEvaluator**: Measures if user goals were achieved
- **HarmfulnessEvaluator**: Binary safety evaluation for harmful content

### Specialized Evaluators
- **ToolSelectionAccuracyEvaluator**: Evaluates appropriateness of tool choices
7 changes: 7 additions & 0 deletions src/strands_evals/case.py
@@ -24,6 +24,7 @@ class Case(BaseModel, Generic[InputT, OutputT]):
        expected_output: The expected response given the input, e.g. the agent's response
        expected_trajectory: The expected trajectory of a task given the input, e.g. the sequence of tools
        expected_interactions: The expected interaction sequence given the input (ideal for multi-agent systems).
        retrieval_context: The retrieved context for RAG evaluation. Used by ContextualFaithfulnessEvaluator.
        metadata: Additional information about the test case.

    Example:
@@ -42,6 +43,11 @@ class Case(BaseModel, Generic[InputT, OutputT]):
{"agent_2":"What is 2x2?"}
]
)

rag_case = Case(
input="What is the company's return policy?",
retrieval_context=["Returns accepted within 30 days.", "Full refund for unopened items."]
)
"""

name: str | None = None
@@ -50,4 +56,5 @@
    expected_output: OutputT | None = None
    expected_trajectory: list[Any] | None = None
    expected_interactions: list[Interaction] | None = None
    retrieval_context: list[str] | None = None
    metadata: dict[str, Any] | None = None
2 changes: 2 additions & 0 deletions src/strands_evals/evaluators/__init__.py
@@ -1,3 +1,4 @@
from .contextual_faithfulness_evaluator import ContextualFaithfulnessEvaluator
from .evaluator import Evaluator
from .faithfulness_evaluator import FaithfulnessEvaluator
from .goal_success_rate_evaluator import GoalSuccessRateEvaluator
@@ -10,6 +11,7 @@
from .trajectory_evaluator import TrajectoryEvaluator

__all__ = [
"ContextualFaithfulnessEvaluator",
"Evaluator",
"OutputEvaluator",
"TrajectoryEvaluator",
155 changes: 155 additions & 0 deletions src/strands_evals/evaluators/contextual_faithfulness_evaluator.py
@@ -0,0 +1,155 @@
from enum import Enum

from pydantic import BaseModel, Field
from strands import Agent
from strands.models.model import Model
from typing_extensions import TypeVar, Union

from ..types.evaluation import EvaluationData, EvaluationOutput
from .evaluator import Evaluator
from .prompt_templates.contextual_faithfulness import get_template

InputT = TypeVar("InputT")
OutputT = TypeVar("OutputT")


class ContextualFaithfulnessScore(str, Enum):
"""Categorical contextual faithfulness ratings for RAG evaluation."""

    NOT_FAITHFUL = "Not Faithful"
    PARTIALLY_FAITHFUL = "Partially Faithful"
    MOSTLY_FAITHFUL = "Mostly Faithful"
    FULLY_FAITHFUL = "Fully Faithful"


class ContextualFaithfulnessRating(BaseModel):
"""Structured output for contextual faithfulness evaluation."""

    reasoning: str = Field(description="Step-by-step reasoning analyzing each claim against the retrieval context")
    score: ContextualFaithfulnessScore = Field(description="Categorical faithfulness rating")


class ContextualFaithfulnessEvaluator(Evaluator[InputT, OutputT]):
"""Evaluates whether an LLM response is faithful to the provided retrieval context.

This evaluator is designed for RAG (Retrieval-Augmented Generation) systems.
It checks if the claims in the response are grounded in the retrieved documents,
helping detect hallucinations where the model generates information not present
in the context.

Unlike FaithfulnessEvaluator which checks against conversation history,
this evaluator specifically validates against retrieval context provided
in the test case.

Attributes:
version: The version of the prompt template to use.
model: A string representing the model-id for Bedrock to use, or a Model instance.
system_prompt: System prompt to guide model behavior.
include_input: Whether to include the user's input query in the evaluation prompt.

Example:
evaluator = ContextualFaithfulnessEvaluator()
case = Case(
input="What is the refund policy?",
retrieval_context=[
"Refunds are available within 30 days of purchase.",
"Items must be unopened for a full refund."
]
)
# Run with experiment or evaluate directly
"""

    _score_mapping = {
        ContextualFaithfulnessScore.NOT_FAITHFUL: 0.0,
        ContextualFaithfulnessScore.PARTIALLY_FAITHFUL: 0.33,
        ContextualFaithfulnessScore.MOSTLY_FAITHFUL: 0.67,
        ContextualFaithfulnessScore.FULLY_FAITHFUL: 1.0,
    }

    def __init__(
        self,
        version: str = "v0",
        model: Union[Model, str, None] = None,
        system_prompt: str | None = None,
        include_input: bool = True,
    ):
        super().__init__()
        self.system_prompt = system_prompt if system_prompt is not None else get_template(version).SYSTEM_PROMPT
        self.version = version
        self.model = model
        self.include_input = include_input

    def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
        """Evaluate the contextual faithfulness of the response.

        Args:
            evaluation_case: The test case containing the response and retrieval context.

        Returns:
            A list containing a single EvaluationOutput with the faithfulness score.

        Raises:
            ValueError: If retrieval_context is not provided in the evaluation case.
        """
        self._validate_evaluation_case(evaluation_case)
        prompt = self._format_prompt(evaluation_case)
        evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
        rating = evaluator_agent.structured_output(ContextualFaithfulnessRating, prompt)
        return [self._create_output(rating)]

    async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
        """Evaluate the contextual faithfulness of the response asynchronously.

        Args:
            evaluation_case: The test case containing the response and retrieval context.

        Returns:
            A list containing a single EvaluationOutput with the faithfulness score.

        Raises:
            ValueError: If retrieval_context is not provided in the evaluation case.
        """
        self._validate_evaluation_case(evaluation_case)
        prompt = self._format_prompt(evaluation_case)
        evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
        rating = await evaluator_agent.structured_output_async(ContextualFaithfulnessRating, prompt)
        return [self._create_output(rating)]

    def _validate_evaluation_case(self, evaluation_case: EvaluationData[InputT, OutputT]) -> None:
        """Validate that the evaluation case has required fields."""
        if not evaluation_case.retrieval_context:
            raise ValueError(
                "retrieval_context is required for ContextualFaithfulnessEvaluator. "
                "Please provide retrieval_context in your Case."
            )
        if evaluation_case.actual_output is None:
            raise ValueError(
                "actual_output is required for ContextualFaithfulnessEvaluator. "
                "Please make sure the task function returns the output."
            )

    def _format_prompt(self, evaluation_case: EvaluationData[InputT, OutputT]) -> str:
        """Format the evaluation prompt with context and response."""
        parts = []

        if self.include_input:
            parts.append(f"# User Query:\n{evaluation_case.input}")

        context_str = "\n\n".join(
            f"[Document {i + 1}]\n{doc}" for i, doc in enumerate(evaluation_case.retrieval_context or [])
        )
        parts.append(f"# Retrieval Context:\n{context_str}")

        parts.append(f"# Assistant's Response:\n{evaluation_case.actual_output}")

        return "\n\n".join(parts)

    def _create_output(self, rating: ContextualFaithfulnessRating) -> EvaluationOutput:
        """Create an EvaluationOutput from the rating."""
        normalized_score = self._score_mapping[rating.score]
        return EvaluationOutput(
            score=normalized_score,
            test_pass=normalized_score >= 0.67,
            reason=rating.reasoning,
            label=rating.score,
        )
@@ -0,0 +1,11 @@
from . import contextual_faithfulness_v0

VERSIONS = {
"v0": contextual_faithfulness_v0,
}

DEFAULT_VERSION = "v0"


def get_template(version: str = DEFAULT_VERSION):
    return VERSIONS[version]
@@ -0,0 +1,36 @@
SYSTEM_PROMPT = """You are an objective judge evaluating whether an AI assistant's response is faithful to the provided retrieval context. Your task is to determine if the claims and information in the response are supported by the retrieved documents.

# Evaluation Task
Assess whether each factual claim in the assistant's response can be verified from the retrieval context. A response is faithful if all its factual claims are supported by the context.

# Evaluation Guidelines
Rate the contextual faithfulness using this scale:

1. Not Faithful
- The response contains significant claims that directly contradict the retrieval context
- The response includes fabricated information not present in the context
- Major factual errors that could mislead the user

2. Partially Faithful
- Some claims in the response are supported by the context, but others are not
- The response extrapolates beyond what the context supports
- Minor inaccuracies or unsupported details mixed with accurate information

3. Mostly Faithful
- Most claims in the response are supported by the retrieval context
- Only minor details may lack explicit support
- No contradictions with the context

4. Fully Faithful
- All factual claims in the response are directly supported by the retrieval context
- The response accurately represents information from the context
- No fabricated or contradictory information
- If the response appropriately states it cannot answer due to insufficient context, it is "Fully Faithful"

# Important Notes
- Focus only on factual claims, not opinions or subjective statements
- Generic statements that don't require context support (e.g., greetings) should not be penalized
- If the context is empty or irrelevant and the response acknowledges this, consider it faithful
- Pay attention to nuance: a claim may be partially supported but misleadingly presented

Please provide step-by-step reasoning before giving your final score."""
2 changes: 2 additions & 0 deletions src/strands_evals/experiment.py
@@ -162,6 +162,7 @@ def _run_task(
            expected_output=case.expected_output,
            expected_trajectory=case.expected_trajectory,
            expected_interactions=case.expected_interactions,
            retrieval_context=case.retrieval_context,
            metadata=case.metadata,
        )
        task_output = task(case)
@@ -198,6 +199,7 @@ async def _run_task_async(
            expected_output=case.expected_output,
            expected_trajectory=case.expected_trajectory,
            expected_interactions=case.expected_interactions,
            retrieval_context=case.retrieval_context,
            metadata=case.metadata,
        )

2 changes: 2 additions & 0 deletions src/strands_evals/types/evaluation.py
@@ -75,6 +75,7 @@ class EvaluationData(BaseModel, Generic[InputT, OutputT]):
        metadata: Additional information about the test case.
        actual_interactions: The actual interaction sequence given the input.
        expected_interactions: The expected interaction sequence given the input.
        retrieval_context: The retrieved context for RAG evaluation (e.g., documents from vector store).
    """

    input: InputT
@@ -86,6 +87,7 @@
    metadata: dict[str, Any] | None = None
    actual_interactions: list[Interaction] | None = None
    expected_interactions: list[Interaction] | None = None
    retrieval_context: list[str] | None = None


class EvaluationOutput(BaseModel):