From 8bf70ae870f89a589448a6dc776e36dab94574c0 Mon Sep 17 00:00:00 2001 From: Mackenzie Zastrow Date: Wed, 31 Dec 2025 09:46:07 -0500 Subject: [PATCH 01/17] Add more guidelines about validating tests --- .github/agent-sops/task-release-notes.sop.md | 125 +++++++++++++++---- 1 file changed, 99 insertions(+), 26 deletions(-) diff --git a/.github/agent-sops/task-release-notes.sop.md b/.github/agent-sops/task-release-notes.sop.md index 5f024da82..81fcee67b 100644 --- a/.github/agent-sops/task-release-notes.sop.md +++ b/.github/agent-sops/task-release-notes.sop.md @@ -210,42 +210,102 @@ When existing examples are insufficient, generate new code snippets. **Note**: This phase is REQUIRED for all code snippets (extracted or generated) that will appear in Major Features sections. Validation must occur AFTER snippets have been extracted or generated in Step 3. +**Critical**: Validation tests MUST verify the actual behavior of the feature, not just syntax correctness. A test that only checks whether code parses or imports succeed is NOT valid validation. The purpose of validation is to prove the code example actually works and demonstrates the feature as intended. + #### 4.1 Create Temporary Test Files -Create temporary test files to validate the code snippets. +Create temporary test files that verify the feature's behavior. **Constraints:** - You MUST create a temporary test file for each code snippet - You MUST place test files in an appropriate test directory based on the project structure - You MUST include all necessary imports and setup code in the test file - You MUST wrap the snippet in a proper test case +- You MUST include assertions that verify the feature's actual behavior: + - Assert that outputs match expected values + - Assert that state changes occur as expected + - Assert that callbacks/hooks are invoked correctly + - Assert that return types and structures are correct +- You MUST NOT write tests that only verify: + - Code parses without syntax errors + - Imports succeed + - Objects can be instantiated without checking behavior + - Functions can be called without checking results - You SHOULD use the project's testing framework -- You MAY need to mock dependencies or setup test fixtures +- You SHOULD mock external dependencies (APIs, databases) but still verify behavior with mocks +- You MAY need to setup test fixtures that enable behavioral verification - You MAY include additional test code that doesn't appear in the release notes -**Example test file structure** (language-specific format will vary): +**Example of GOOD validation** (verifies behavior): +```python +def test_structured_output_validation(): + """Verify that structured output actually validates against the schema.""" + from pydantic import BaseModel + + class UserResponse(BaseModel): + name: str + age: int + + agent = Agent(model=mock_model, output_schema=UserResponse) + result = agent("Get user info") + + # Behavioral assertions - verify the feature works + assert isinstance(result.output, UserResponse) + assert hasattr(result.output, 'name') + assert hasattr(result.output, 'age') + assert isinstance(result.output.age, int) ``` -# Test structure depends on the project's testing framework -# Include necessary imports, setup, and the snippet being validated -# Add assertions to verify the code works correctly + +**Example of BAD validation** (only verifies syntax): +```python +def test_structured_output_syntax(): + """BAD: This only verifies the code runs without errors.""" + from pydantic import BaseModel + + class 
UserResponse(BaseModel): + name: str + age: int + + # BAD: No assertions about behavior + agent = Agent(model=mock_model, output_schema=UserResponse) + # BAD: Just calling without checking results proves nothing + agent("Get user info") ``` #### 4.2 Run Validation Tests -Execute tests to ensure code snippets are valid and functional. +Execute tests to ensure code snippets demonstrate working feature behavior. **Constraints:** - You MUST run the appropriate test command for the project (e.g., `npm test`, `pytest`, `go test`) - You MUST verify that the test passes successfully +- You MUST verify that assertions actually executed (not skipped or short-circuited) - You MUST check that the code compiles without errors in compiled languages +- You MUST ensure tests include meaningful assertions about feature behavior - You SHOULD run type checking if applicable (e.g., `npm run type-check`, `mypy`) +- You SHOULD review test output to confirm behavioral assertions passed, not just that the test didn't error - You MAY need to adjust imports or setup code if tests fail - You MAY need to install additional dependencies if required -**Fallback validation** (if test execution fails or is not possible): -- You MUST at minimum validate syntax using the appropriate language tools -- You MUST ensure the code is syntactically correct -- You MUST verify all referenced types and modules exist +**What constitutes valid behavioral verification:** +- Testing that a new API returns expected data structures +- Testing that a new option/parameter changes behavior as documented +- Testing that callbacks are invoked with correct arguments +- Testing that error handling works as described +- Testing that integrations connect and exchange data correctly (with mocks if needed) + +**What does NOT constitute valid verification:** +- Code executes without raising exceptions +- Objects can be constructed +- Functions can be called +- Imports resolve successfully +- Type hints are valid + +**Fallback validation** (if full behavioral test execution is not possible): +- You MUST still write assertions about expected behavior, even if mocked +- You MUST document why full behavioral testing wasn't possible +- You SHOULD use mocks that verify interaction patterns (e.g., mock.assert_called_with) +- You MAY mark the example as "partially validated" in the validation comment if behavioral testing is limited #### 4.3 Handle Validation Failures @@ -253,13 +313,17 @@ Address any validation failures before including snippets in release notes. 
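When a snippet fails only because it depends on a live service, it can usually be rescued with an interaction-style test rather than dropped. The sketch below is a minimal illustration using Python's `unittest.mock`; the `publish_release` function and the client's `create_release` method are hypothetical stand-ins, not part of any real SDK:

```python
from unittest.mock import Mock

def test_publish_release_sends_expected_payload():
    """Behavioral check: the client receives the formatted notes and the returned URL is surfaced."""
    # Hypothetical collaborator standing in for a real release API client.
    client = Mock()
    client.create_release.return_value = {"html_url": "https://example.test/releases/1"}

    # Hypothetical function under test: publishes notes through the injected client.
    def publish_release(api, version, notes):
        response = api.create_release(tag=version, body=notes)
        return response["html_url"]

    url = publish_release(client, "v1.2.0", "## Major Features\n...")

    # Verify the interaction pattern and the result, not just that nothing raised.
    client.create_release.assert_called_once_with(tag="v1.2.0", body="## Major Features\n...")
    assert url == "https://example.test/releases/1"
```

The same pattern applies when the external call is an LLM provider: mock the client, drive the snippet, and assert on both the arguments it sends and the structure it returns.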
**Constraints:** - You MUST NOT include unvalidated code snippets in release notes +- You MUST NOT consider syntax-only tests as valid validation - You MUST revise the code snippet if validation fails - You MUST re-run validation after making changes +- You MUST ensure revised tests include behavioral assertions - You SHOULD examine the actual implementation in the PR if generated code fails -- You SHOULD simplify the example if complexity is causing validation issues -- You MAY extract a different example from the PR if the current one cannot be validated -- You MAY seek clarification if you cannot create a valid example -- You MUST preserve the test file content to include in the GitHub issue comment (Step 6.2) +- You SHOULD examine existing tests in the PR for patterns that verify behavior +- You SHOULD simplify the example if complexity is causing validation issues, but maintain behavioral assertions +- You MAY extract a different example from the PR if the current one cannot be behaviorally validated +- You MAY seek clarification if you cannot create a valid example with behavioral verification +- You MUST preserve the test file content to include in the GitHub issue comment (Step 6.1) +- You MUST note in the validation comment what specific behavior each test verifies - You MAY delete temporary test files after capturing their content, as the environment is ephemeral ### 5. Release Notes Formatting @@ -365,10 +429,12 @@ Batch all validation code into a single GitHub issue comment. - You MUST NOT post separate comments for each feature's validation - You MUST post this comment BEFORE the release notes comment - You MUST include all test files created during validation (Step 4) in this single comment +- You MUST document what specific behavior each test verifies (not just "validates the code works") - You MUST NOT reference local file pathsβ€”the ephemeral environment will be destroyed - You MUST clearly label this comment as "Code Validation Tests" - You MUST include a note explaining that this code was used to validate the snippets in the release notes -- You SHOULD use collapsible `
<details>` sections to organize validation code by feature:
+- You SHOULD use collapsible `<details>
` sections to organize validation code by feature +- You SHOULD include a brief description of what behavior is being verified for each test: ```markdown ## Code Validation Tests @@ -377,8 +443,10 @@ Batch all validation code into a single GitHub issue comment.
   <details>
   <summary>Validation: Feature Name 1</summary>
 
-  \`\`\`typescript
-  [Full test file for feature 1]
+  **Behavior verified:** This test confirms that the new `output_schema` parameter causes the agent to return a validated Pydantic model instance with the correct field types.
+  
+  \`\`\`python
+  [Full test file for feature 1 with behavioral assertions]
   \`\`\`
   
   </details>
@@ -386,13 +454,15 @@ Batch all validation code into a single GitHub issue comment.
   <details>
   <summary>Validation: Feature Name 2</summary>
 
-  \`\`\`typescript
-  [Full test file for feature 2]
+  **Behavior verified:** This test confirms that async streaming yields events in real-time and that the final result contains all streamed content.
+  
+  \`\`\`python
+  [Full test file for feature 2 with behavioral assertions]
   \`\`\`
   
   </details>
``` -- This allows reviewers to copy and run the validation code themselves +- This allows reviewers to understand what was validated and copy/run the validation code themselves #### 6.2 Post Release Notes Comment @@ -523,10 +593,13 @@ If code validation fails for a snippet: 1. Review the test output to understand the failure reason 2. Check if the feature requires additional dependencies or setup 3. Examine the actual implementation in the PR to understand correct usage -4. Try simplifying the example to focus on core functionality -5. Consider using a different example from the PR -6. If unable to validate, note the issue in the release notes comment and skip the code example for that feature -7. Leave a comment on the issue noting which features couldn't include validated code examples +4. Look at existing tests in the PR for patterns that verify behavior correctly +5. Try simplifying the example to focus on core functionality while maintaining behavioral assertions +6. Consider using a different example from the PR +7. If unable to validate behaviorally, note the issue in the release notes comment and skip the code example for that feature +8. Leave a comment on the issue noting which features couldn't include validated code examples + +**Important**: If your tests only verify syntax (code runs without errors) but don't verify behavior (assertions about outputs, state changes, or interactions), the validation is incomplete. Revisit Step 4 and add meaningful behavioral assertions. ### Large PR Sets (>100 PRs) @@ -574,7 +647,7 @@ If no suitable code examples can be found or generated for a feature: ## Desired Outcome * Focused release notes highlighting Major Features and Major Bug Fixes with concise descriptions (2-3 sentences, no bullet points) -* Working, validated code examples for all major features +* Working, behaviorally-validated code examples for all major features (tests must verify feature behavior, not just syntax) * Well-formatted markdown that renders properly on GitHub * Release notes posted as a comment on the GitHub issue for review From 88140851e97d563ce349b113ccc7568cb641ac1d Mon Sep 17 00:00:00 2001 From: Mackenzie Zastrow Date: Wed, 31 Dec 2025 09:49:37 -0500 Subject: [PATCH 02/17] Update guidelines to reflect possible PR description staleness --- .github/agent-sops/task-release-notes.sop.md | 44 ++++++++++++++++---- 1 file changed, 37 insertions(+), 7 deletions(-) diff --git a/.github/agent-sops/task-release-notes.sop.md b/.github/agent-sops/task-release-notes.sop.md index 81fcee67b..9a4cd2350 100644 --- a/.github/agent-sops/task-release-notes.sop.md +++ b/.github/agent-sops/task-release-notes.sop.md @@ -62,10 +62,13 @@ For each PR identified (from release or API query), fetch additional metadata ne - You MUST retrieve additional metadata for PRs being considered for Major Features or Major Bug Fixes: - PR description/body (essential for understanding the change) - PR labels (if any) + - PR review comments and conversation threads (to identify post-description changes) - You SHOULD retrieve for Major Feature candidates: - Files changed in the PR (to find code examples) -- You MAY retrieve: - - PR review comments if helpful for understanding the change +- You MUST retrieve PR review comments for Major Feature and Major Bug Fix candidates: + - Review comments often contain important context about changes made after the initial description + - Look for reviewer requests that resulted in structural changes to the implementation + - Check for author responses indicating 
significant modifications - You SHOULD minimize API calls by only fetching detailed metadata for PRs that appear significant based on title/prefix - You MUST track this data for use in categorization and release notes generation @@ -89,18 +92,27 @@ Extract categorization signals from PR titles using conventional commit prefixes - You SHOULD record the prefix-based category for each PR - You MAY encounter PRs without conventional commit prefixes -#### 2.2 Analyze PR Descriptions +#### 2.2 Analyze PR Descriptions and Review Comments Use LLM analysis to understand the significance and user impact of each change. +**Critical Warning - Stale PR Descriptions:** +PR descriptions are written at the time of PR creation and may become outdated after code review. Reviewers often request structural changes, API modifications, or feature adjustments that are implemented but NOT reflected in the original description. You MUST cross-reference the description with review comments to get an accurate understanding of the final merged code. + **Constraints:** - You MUST read and analyze the PR description for each PR +- You MUST also review PR comments and review threads to identify changes made after the initial description: + - Look for reviewer comments requesting changes to the implementation + - Look for author responses confirming changes were made + - Look for "LGTM" or approval comments that reference specific modifications + - Pay special attention to comments about API changes, renamed methods, or restructured code +- You MUST treat the actual merged code as the source of truth when descriptions conflict with review feedback - You MUST assess the user-facing impact of the change: - Does it introduce new functionality users will interact with? - Does it fix a bug that users experienced? - Is it purely internal with no user-visible changes? - You MUST identify if the change introduces breaking changes -- You SHOULD identify if the PR includes code examples in its description +- You SHOULD identify if the PR includes code examples in its description (but verify they match the final implementation) - You SHOULD note any links to documentation or related issues - You MAY consider the size and complexity of the change @@ -161,14 +173,21 @@ Present the categorized PRs to the user for review and confirmation. Search merged PRs for existing code that demonstrates the new feature. +**Critical Warning - Verify Examples Against Final Implementation:** +Code examples in PR descriptions may be outdated if the implementation changed during review. Always verify that examples match the actual merged code by checking review comments for requested changes and examining the final implementation. 
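One lightweight way to perform this cross-check is to pull the review discussion and the list of changed files straight from the GitHub REST API. The sketch below is illustrative only: it assumes a `GITHUB_TOKEN` environment variable is available, uses placeholder owner/repo/PR values, and the "stale signal" keywords at the end are just an example heuristic:

```python
import os

import requests

def fetch_pr_review_context(owner: str, repo: str, number: int) -> dict:
    """Collect review comments, conversation, and changed files for cross-checking a PR description."""
    headers = {
        "Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}",
        "Accept": "application/vnd.github+json",
    }
    base = f"https://api.github.com/repos/{owner}/{repo}"
    review_comments = requests.get(f"{base}/pulls/{number}/comments", headers=headers).json()
    conversation = requests.get(f"{base}/issues/{number}/comments", headers=headers).json()
    files = requests.get(f"{base}/pulls/{number}/files", headers=headers).json()
    return {
        "review_comments": [c["body"] for c in review_comments],
        "conversation": [c["body"] for c in conversation],
        "changed_files": [f["filename"] for f in files],
    }

# Example usage: surface review comments that hint the description may be stale.
context = fetch_pr_review_context("example-org", "example-repo", 123)
stale_signals = [c for c in context["review_comments"] if "rename" in c.lower() or "instead" in c.lower()]
```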
+ **Constraints:** - You MUST search each Major Feature PR for existing code examples in: - - Test files (especially integration tests or example tests) + - Test files (especially integration tests or example tests) - these are most reliable as they reflect the final implementation - Example applications or scripts in `examples/` directory - - Code snippets in the PR description + - Code snippets in the PR description (but verify against review comments and final code) - Documentation updates that include code examples - README updates with usage examples -- You MUST prioritize test files that show real usage of the feature +- You MUST cross-reference any examples from PR descriptions with: + - Review comments that may have requested API changes + - The actual merged code to ensure the example is still accurate + - Test files which reflect the working implementation +- You MUST prioritize test files that show real usage of the feature (these are validated against the final code) - You SHOULD look for the simplest, most focused examples - You SHOULD prefer examples that are already validated (from test files) - You MAY examine multiple PRs if a feature spans several PRs @@ -644,6 +663,17 @@ If no suitable code examples can be found or generated for a feature: 5. Mark the example as "conceptual" if validation isn't possible 6. Consider omitting the code example if it would be misleading +### Stale or Inaccurate PR Descriptions + +If you discover that a PR description doesn't match the actual implementation: +1. Review the PR comment thread and review comments for context on what changed +2. Look for reviewer requests that led to structural changes +3. Check the author's responses to understand what modifications were made +4. Examine the actual merged code (especially test files) to understand the true implementation +5. Use test files as the authoritative source for code examples, not the PR description +6. If the feature's scope changed significantly during review, update your categorization accordingly +7. Note in your analysis when you relied on review comments rather than the description + ## Desired Outcome * Focused release notes highlighting Major Features and Major Bug Fixes with concise descriptions (2-3 sentences, no bullet points) From a57b5396b5a3d73e835cbb3cabe92bf3531fa98f Mon Sep 17 00:00:00 2001 From: Mackenzie Zastrow Date: Wed, 31 Dec 2025 10:24:22 -0500 Subject: [PATCH 03/17] Indicate that exclusions should be called out --- .github/agent-sops/task-release-notes.sop.md | 42 +++++++++++++++++++- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/.github/agent-sops/task-release-notes.sop.md b/.github/agent-sops/task-release-notes.sop.md index 9a4cd2350..5b2d291ca 100644 --- a/.github/agent-sops/task-release-notes.sop.md +++ b/.github/agent-sops/task-release-notes.sop.md @@ -433,11 +433,12 @@ Add a horizontal rule to separate your content from GitHub's auto-generated sect **Critical**: You are running in an ephemeral environment. All files created during execution (test files, temporary notes, etc.) will be deleted when the workflow completes. You MUST post all deliverables as GitHub issue commentsβ€”this is the only way to preserve your work and make it accessible to reviewers. -**Comment Structure**: Post exactly two comments on the GitHub issue: +**Comment Structure**: Post exactly three comments on the GitHub issue: 1. **Validation Comment** (first): Contains all validation code for all features in one batched comment 2. 
**Release Notes Comment** (second): Contains the final formatted release notes +3. **Exclusions Comment** (third): Documents any features that were excluded and why -This ordering allows reviewers to see the validation evidence before reviewing the release notes. +This ordering allows reviewers to see the validation evidence, review the release notes, and understand any exclusion decisions. #### 6.1 Post Validation Code Comment @@ -497,6 +498,42 @@ Post the formatted release notes as a single GitHub issue comment. - You MAY use markdown formatting in the comment - If comment posting is deferred, continue with the workflow and note the deferred status +#### 6.3 Post Exclusions Comment + +Document any features or bug fixes that were considered but excluded from the release notes. + +**Critical**: This comment is REQUIRED whenever you decide that a categorized Major Feature or Major Bug Fix does not warrant inclusion in the final release notes, does not need a code example, or was downgraded to Minor Changes during the process. + +**Constraints:** +- You MUST post this comment as the FINAL comment on the GitHub issue +- You MUST include this comment if ANY of the following occurred: + - A PR initially categorized as Major Feature was excluded from release notes + - A PR initially categorized as Major Bug Fix was excluded from release notes + - A Major Feature was included without a code example + - A feature's scope or description was significantly different from the PR description + - You relied on review comments rather than the PR description to understand a feature +- You MUST clearly explain the reasoning for each exclusion or modification +- You MUST format the comment with clear sections: + ```markdown + ## Release Notes Exclusions & Notes + + The following decisions were made during release notes generation: + + ### Excluded from Major Features + - **PR#123 - Feature Title**: Excluded because [specific reason - e.g., "internal refactoring with no user-facing API changes", "feature was reverted in a later PR", "scope reduced during review to minor enhancement"] + + ### Excluded from Major Bug Fixes + - **PR#456 - Fix Title**: Excluded because [specific reason] + + ### Features Without Code Examples + - **PR#789 - Feature Title**: No code example provided because [specific reason - e.g., "feature is configuration-only", "existing documentation covers usage adequately", "unable to create a validated example"] + + ### Description vs. Implementation Discrepancies + - **PR#101 - Feature Title**: PR description stated [X] but review comments and final implementation show [Y]. Release notes reflect the actual merged behavior. + ``` +- You SHOULD include this comment even if there are no exclusions, with a simple note: "No features or bug fixes were excluded from this release notes draft." +- You MUST NOT skip this commentβ€”it provides critical transparency for reviewers + ## Examples ### Example 1: Major Features Section with Code @@ -680,6 +717,7 @@ If you discover that a PR description doesn't match the actual implementation: * Working, behaviorally-validated code examples for all major features (tests must verify feature behavior, not just syntax) * Well-formatted markdown that renders properly on GitHub * Release notes posted as a comment on the GitHub issue for review +* Exclusions comment documenting any features excluded or modified, with clear reasoning for each decision **Important**: Your generated release notes will be prepended to GitHub's auto-generated release notes. 
GitHub automatically generates: - "What's Changed" section listing all PRs with authors and links From d20ef7989b413a0a88ea6233691bb88841b76a5d Mon Sep 17 00:00:00 2001 From: Mackenzie Zastrow Date: Wed, 31 Dec 2025 10:34:33 -0500 Subject: [PATCH 04/17] Take trigger feedback into account --- .../actions/strands-agent-runner/action.yml | 2 +- .github/scripts/javascript/process-input.cjs | 21 ++++++++++++++++++- .github/scripts/python/agent_runner.py | 13 ++++++------ 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/.github/actions/strands-agent-runner/action.yml b/.github/actions/strands-agent-runner/action.yml index 6d4c2d7fb..d0e93effe 100644 --- a/.github/actions/strands-agent-runner/action.yml +++ b/.github/actions/strands-agent-runner/action.yml @@ -149,7 +149,7 @@ runs: STRANDS_TOOL_CONSOLE_MODE: 'enabled' BYPASS_TOOL_CONSENT: 'true' run: | - uv run --no-project ${{ runner.temp }}/strands-agent-runner/.github/scripts/python/agent_runner.py "$INPUT_TASK" + uv run --no-project ${{ runner.temp }}/strands-agent-runner/.github/scripts/python/agent_runner.py - name: Capture repository state shell: bash diff --git a/.github/scripts/javascript/process-input.cjs b/.github/scripts/javascript/process-input.cjs index b7ed29263..2aa770b91 100644 --- a/.github/scripts/javascript/process-input.cjs +++ b/.github/scripts/javascript/process-input.cjs @@ -76,10 +76,29 @@ function buildPrompts(mode, issueId, isPullRequest, command, branchName, inputs) const scriptFile = scriptFiles[mode] || scriptFiles['refiner']; const systemPrompt = fs.readFileSync(scriptFile, 'utf8'); + // Extract the user's feedback/instructions after the mode keyword + // e.g., "release-notes Move #123 to Major Features" -> "Move #123 to Major Features" + const modeKeywords = { + 'release-notes': /^(?:release-notes|release notes)\s*/i, + 'implementer': /^implement\s*/i, + 'refiner': /^refine\s*/i + }; + + const modePattern = modeKeywords[mode]; + const userFeedback = modePattern ? command.replace(modePattern, '').trim() : command.trim(); + let prompt = (isPullRequest) ? 
'The pull request id is:' : 'The issue id is:'; - prompt += `${issueId}\n${command}\nreview and continue`; + prompt += `${issueId}\n`; + + // If there's substantial user feedback beyond just the command keyword, include it as the main instruction + // Otherwise, use the default "review and continue" for initial triggers + if (userFeedback && userFeedback.length > 0) { + prompt += userFeedback; + } else { + prompt += 'review and continue'; + } return { sessionId, systemPrompt, prompt }; } diff --git a/.github/scripts/python/agent_runner.py b/.github/scripts/python/agent_runner.py index db10ceadb..9d92c2ac4 100644 --- a/.github/scripts/python/agent_runner.py +++ b/.github/scripts/python/agent_runner.py @@ -142,13 +142,12 @@ def run_agent(query: str): def main() -> None: """Main entry point for the agent runner.""" try: - # Read task from command line arguments - if len(sys.argv) < 2: - raise ValueError("Task argument is required") - - task = " ".join(sys.argv[1:]) - if not task.strip(): - raise ValueError("Task cannot be empty") + # Prefer INPUT_TASK env var (avoids shell escaping issues), fall back to CLI args + task = os.getenv("INPUT_TASK", "").strip() + if not task and len(sys.argv) > 1: + task = " ".join(sys.argv[1:]).strip() + if not task: + raise ValueError("Task is required (via INPUT_TASK env var or CLI argument)") print(f"πŸ€– Running agent with task: {task}") run_agent(task) From bfefbc9fa78a22344753de64463428ae0e036570 Mon Sep 17 00:00:00 2001 From: Mackenzie Zastrow Date: Fri, 2 Jan 2026 10:55:46 -0500 Subject: [PATCH 05/17] Simplify command extraction --- .github/scripts/javascript/process-input.cjs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/scripts/javascript/process-input.cjs b/.github/scripts/javascript/process-input.cjs index 2aa770b91..d0380a97b 100644 --- a/.github/scripts/javascript/process-input.cjs +++ b/.github/scripts/javascript/process-input.cjs @@ -8,9 +8,10 @@ async function getIssueInfo(github, context, inputs) { const issueId = context.eventName === 'workflow_dispatch' ? inputs.issue_id : context.payload.issue.number.toString(); + const commentBody = context.payload.comment?.body || ''; const command = context.eventName === 'workflow_dispatch' ? inputs.command - : (context.payload.comment.body.match(/^\/strands\s*(.*?)$/m)?.[1]?.trim() || ''); + : (commentBody.startsWith('/strands') ? commentBody.slice('/strands'.length).trim() : ''); console.log(`Event: ${context.eventName}, Issue ID: ${issueId}, Command: "${command}"`); From fe927022ca50ea6724ef95e1e817385286b31494 Mon Sep 17 00:00:00 2001 From: Mackenzie Zastrow Date: Fri, 2 Jan 2026 11:00:12 -0500 Subject: [PATCH 06/17] Update guidelines to take into account re-validation --- .github/agent-sops/task-release-notes.sop.md | 36 ++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/.github/agent-sops/task-release-notes.sop.md b/.github/agent-sops/task-release-notes.sop.md index 5b2d291ca..5e293b072 100644 --- a/.github/agent-sops/task-release-notes.sop.md +++ b/.github/agent-sops/task-release-notes.sop.md @@ -165,6 +165,13 @@ Present the categorized PRs to the user for review and confirmation. 
- You SHOULD update your categorization based on user feedback - You MAY iterate on categorization if the user requests changes +**Critical - Re-validation After Recategorization:** +When the user promotes a PR to "Major Features" that was not previously in that category: +- You MUST perform Step 3 (Code Snippet Extraction) for the newly promoted PR +- You MUST perform Step 4 (Code Validation) for any code snippets extracted or generated +- You MUST NOT skip validation just because the user requested the change +- You MUST include the validation code for newly promoted features in the Validation Comment (Step 6.1) + ### 3. Code Snippet Extraction and Generation **Note**: This phase applies only to PRs categorized as "Major Features". Bug fixes typically do not require code examples. @@ -440,6 +447,11 @@ Add a horizontal rule to separate your content from GitHub's auto-generated sect This ordering allows reviewers to see the validation evidence, review the release notes, and understand any exclusion decisions. +**Iteration Comments**: If the user requests changes after the initial comments are posted: +- Post additional validation comments for any re-validated code +- Post updated release notes as new comments (do not edit previous comments) +- This creates an audit trail of changes and validations + #### 6.1 Post Validation Code Comment Batch all validation code into a single GitHub issue comment. @@ -534,6 +546,30 @@ Document any features or bug fixes that were considered but excluded from the re - You SHOULD include this comment even if there are no exclusions, with a simple note: "No features or bug fixes were excluded from this release notes draft." - You MUST NOT skip this commentβ€”it provides critical transparency for reviewers +#### 6.4 Handle User Feedback on Release Notes + +When the user requests changes to the release notes after they have been posted, re-validate as needed. + +**Critical**: User feedback does NOT exempt you from validation requirements. Any changes to code examples or newly added features must be validated. 
+ +**Constraints:** +- You MUST re-run validation (Step 4) when the user requests changes that affect code examples: + - Modified code snippets + - New code examples for features that previously had none + - Replacement examples for features +- You MUST perform full extraction (Step 3) and validation (Step 4) when the user requests: + - Adding a new feature to the release notes that wasn't previously included + - Promoting a bug fix to include a code example +- You MUST NOT make changes to code examples without re-validating them +- You MUST post updated validation code as a new comment when re-validation occurs +- You MUST post the revised release notes as a new comment (do not edit previous comments) +- You SHOULD note in the updated release notes comment what changed from the previous version +- You MAY skip re-validation only for changes that do not affect code: + - Wording changes to descriptions + - Fixing typos + - Reordering features + - Removing features (no validation needed for removal) + ## Examples ### Example 1: Major Features Section with Code From 522c023bb068388fbb1054b7bb412ae456cef179 Mon Sep 17 00:00:00 2001 From: Mackenzie Zastrow Date: Fri, 2 Jan 2026 12:39:04 -0500 Subject: [PATCH 07/17] Add stronger requests for code samples --- .github/agent-sops/task-release-notes.sop.md | 84 ++++++++++++++++++-- 1 file changed, 77 insertions(+), 7 deletions(-) diff --git a/.github/agent-sops/task-release-notes.sop.md b/.github/agent-sops/task-release-notes.sop.md index 5e293b072..005b8bf8b 100644 --- a/.github/agent-sops/task-release-notes.sop.md +++ b/.github/agent-sops/task-release-notes.sop.md @@ -311,7 +311,18 @@ Execute tests to ensure code snippets demonstrate working feature behavior. - You SHOULD run type checking if applicable (e.g., `npm run type-check`, `mypy`) - You SHOULD review test output to confirm behavioral assertions passed, not just that the test didn't error - You MAY need to adjust imports or setup code if tests fail -- You MAY need to install additional dependencies if required + +**Installing Dependencies for Validation:** +- You MUST attempt to install missing dependencies when tests fail due to import errors +- You SHOULD check the project's `pyproject.toml`, `package.json`, or equivalent for optional dependency groups +- You SHOULD use the project's package manager to install dependencies (e.g., `pip install`, `npm install`, `hatch`) +- For Python projects with optional extras, try: `pip install -e ".[extra_name]"` or `pip install package_name` +- You MUST NOT skip validation simply because a dependency is missing - attempt installation first +- You SHOULD only fall back to mocking if the dependency cannot be installed (e.g., requires paid API keys, proprietary software) +- Common optional dependencies to check for: + - Model provider SDKs: `openai`, `anthropic`, `google-generativeai`, `boto3` + - Testing utilities: `pytest`, `pytest-asyncio`, `moto` + - Type checking: `mypy`, `types-*` packages **What constitutes valid behavioral verification:** - Testing that a new API returns expected data structures @@ -326,12 +337,57 @@ Execute tests to ensure code snippets demonstrate working feature behavior. 
- Functions can be called - Imports resolve successfully - Type hints are valid +- "Source verification" or reviewing PR descriptions +- "API structure validation" or checking import paths exist +- "Conceptual correctness" or any form of manual review +- Claiming external dependencies prevent testing (use mocks instead) + +**Handling External Dependencies:** +When a feature requires external SDKs or services (e.g., OpenAI SDK, Google Gemini SDK, AWS services): +1. **First, attempt to install the dependency** - Many SDKs can be installed and used for validation +2. **If installation succeeds**, write tests that use the real SDK with mocked API responses +3. **If installation fails or requires paid credentials**, mock the dependency entirely +- You MUST still write and execute tests using mocks +- You MUST mock the external dependency and verify the integration code works correctly +- You MUST NOT skip validation because "external dependencies were not installed" without first attempting installation +- You SHOULD use the project's existing test patterns for mocking external services +- You SHOULD examine how the project's existing tests handle similar dependencies + +**Example of mocking external dependencies:** +```python +def test_custom_http_client(): + """Verify custom HTTP client is passed to the provider.""" + from unittest.mock import Mock, patch + + custom_client = Mock() + + with patch('strands.models.openai.OpenAI') as mock_openai: + from strands.models.openai import OpenAIModel + model = OpenAIModel(http_client=custom_client) + + # Verify the custom client was passed + mock_openai.assert_called_once() + call_kwargs = mock_openai.call_args[1] + assert call_kwargs.get('http_client') == custom_client +``` + +**Fallback: Excluding Features Without Validated Examples** +If you genuinely cannot create a working test for a feature (after attempting mocks): +- You MUST NOT include a code example for that feature in the release notes +- You MUST document the feature in the Exclusions Comment (Step 6.3) explaining why validation failed +- You MUST still include the feature in release notes with a placeholder indicating manual sample creation is needed: + ```markdown + ### Feature Name - [PR#123](link) -**Fallback validation** (if full behavioral test execution is not possible): -- You MUST still write assertions about expected behavior, even if mocked -- You MUST document why full behavioral testing wasn't possible -- You SHOULD use mocks that verify interaction patterns (e.g., mock.assert_called_with) -- You MAY mark the example as "partially validated" in the validation comment if behavioral testing is limited + Description of the feature and its impact. + + \`\`\` + # TODO: Could not verify a code sample successfully because [specific reason, e.g., "feature requires live AWS credentials", "complex integration with external service that cannot be mocked"] + # Manual code sample creation required + \`\`\` + ``` +- You MUST NOT invent alternative "validation" methods like source verification or API review +- You MUST NOT include unvalidated code examples - use the placeholder instead #### 4.3 Handle Validation Failures @@ -340,6 +396,7 @@ Address any validation failures before including snippets in release notes. **Constraints:** - You MUST NOT include unvalidated code snippets in release notes - You MUST NOT consider syntax-only tests as valid validation +- You MUST NOT invent alternative validation methods (source verification, API review, conceptual correctness, etc.) 
- You MUST revise the code snippet if validation fails - You MUST re-run validation after making changes - You MUST ensure revised tests include behavioral assertions @@ -352,6 +409,16 @@ Address any validation failures before including snippets in release notes. - You MUST note in the validation comment what specific behavior each test verifies - You MAY delete temporary test files after capturing their content, as the environment is ephemeral +**If validation is not possible after reasonable attempts:** +- You MUST use a placeholder code block instead of an unvalidated example: + ``` + # TODO: Could not verify a code sample successfully because [reason] + # Manual code sample creation required + ``` +- You MUST document the exclusion in the Exclusions Comment (Step 6.3) +- You MUST NOT include unvalidated code with a note that it was "validated through source review" or similar +- The feature still appears in release notes with the placeholder, signaling that manual review is needed + ### 5. Release Notes Formatting #### 5.1 Format Major Features Section @@ -457,7 +524,7 @@ This ordering allows reviewers to see the validation evidence, review the releas Batch all validation code into a single GitHub issue comment. **Constraints:** -- You MUST post ONE comment containing ALL validation code for ALL features +- You MUST post ONE comment containing ALL validation code for ALL features that have code examples - You MUST NOT post separate comments for each feature's validation - You MUST post this comment BEFORE the release notes comment - You MUST include all test files created during validation (Step 4) in this single comment @@ -465,6 +532,9 @@ Batch all validation code into a single GitHub issue comment. - You MUST NOT reference local file pathsβ€”the ephemeral environment will be destroyed - You MUST clearly label this comment as "Code Validation Tests" - You MUST include a note explaining that this code was used to validate the snippets in the release notes +- You MUST NOT include "batch validation notes" claiming features were validated through source review, API structure validation, or conceptual correctness +- Every feature with a code example in the release notes MUST have a corresponding executed test in this comment +- Features without executed tests MUST NOT have code examples in the release notes - You SHOULD use collapsible `
` sections to organize validation code by feature - You SHOULD include a brief description of what behavior is being verified for each test: ```markdown From f83cdfab41dee37c98b0508bdbd40ab2a56c5698 Mon Sep 17 00:00:00 2001 From: Mackenzie Zastrow Date: Fri, 2 Jan 2026 13:24:01 -0500 Subject: [PATCH 08/17] Reinforce the need to generate examples --- .github/agent-sops/task-release-notes.sop.md | 52 ++++++++++++++++---- 1 file changed, 42 insertions(+), 10 deletions(-) diff --git a/.github/agent-sops/task-release-notes.sop.md b/.github/agent-sops/task-release-notes.sop.md index 005b8bf8b..b741e65ec 100644 --- a/.github/agent-sops/task-release-notes.sop.md +++ b/.github/agent-sops/task-release-notes.sop.md @@ -236,8 +236,15 @@ When existing examples are insufficient, generate new code snippets. **Note**: This phase is REQUIRED for all code snippets (extracted or generated) that will appear in Major Features sections. Validation must occur AFTER snippets have been extracted or generated in Step 3. +**PRIMARY GOAL: Every Major Feature MUST have a validated code example.** The placeholder comment is an absolute last resort, not a convenient escape hatch. You must try extremely hard to create working, validated examples for every feature. Users rely on these examples to understand how to use new features. + **Critical**: Validation tests MUST verify the actual behavior of the feature, not just syntax correctness. A test that only checks whether code parses or imports succeed is NOT valid validation. The purpose of validation is to prove the code example actually works and demonstrates the feature as intended. +**Available Testing Resources:** +- **Amazon Bedrock**: You have access to Bedrock models for testing. Use Bedrock when a feature requires a real model provider. +- **Project test fixtures**: The project includes mocked model providers and test utilities in `tests/fixtures/` +- **Integration test patterns**: Examine `tests_integ/` for patterns that test real model interactions + #### 4.1 Create Temporary Test Files Create temporary test files that verify the feature's behavior. @@ -344,14 +351,23 @@ Execute tests to ensure code snippets demonstrate working feature behavior. **Handling External Dependencies:** When a feature requires external SDKs or services (e.g., OpenAI SDK, Google Gemini SDK, AWS services): + +**TRY EXTREMELY HARD to create working examples. Exhaust all options before using a placeholder.** + 1. **First, attempt to install the dependency** - Many SDKs can be installed and used for validation 2. **If installation succeeds**, write tests that use the real SDK with mocked API responses -3. **If installation fails or requires paid credentials**, mock the dependency entirely -- You MUST still write and execute tests using mocks -- You MUST mock the external dependency and verify the integration code works correctly +3. **For model provider features, USE BEDROCK** - You have access to Amazon Bedrock. If a feature works with any model provider, test it with Bedrock instead of skipping validation. +4. **If the feature is provider-specific** (e.g., OpenAI-only feature), install that provider's SDK and mock the API responses +5. 
**Only as an absolute last resort**, if you have exhausted all options and still cannot validate, use the placeholder + +- You MUST use Bedrock for testing when a feature works with multiple model providers +- You MUST install and use provider SDKs when testing provider-specific features +- You MUST mock API responses when you cannot make real API calls - You MUST NOT skip validation because "external dependencies were not installed" without first attempting installation +- You MUST NOT use the placeholder if Bedrock or mocking could work - You SHOULD use the project's existing test patterns for mocking external services - You SHOULD examine how the project's existing tests handle similar dependencies +- You SHOULD check `tests_integ/models/` for examples of testing with real model providers **Example of mocking external dependencies:** ```python @@ -371,27 +387,37 @@ def test_custom_http_client(): assert call_kwargs.get('http_client') == custom_client ``` -**Fallback: Excluding Features Without Validated Examples** -If you genuinely cannot create a working test for a feature (after attempting mocks): -- You MUST NOT include a code example for that feature in the release notes -- You MUST document the feature in the Exclusions Comment (Step 6.3) explaining why validation failed -- You MUST still include the feature in release notes with a placeholder indicating manual sample creation is needed: +**Fallback: Placeholder for Truly Impossible Validation (LAST RESORT ONLY)** +The placeholder is for situations where validation is genuinely impossible, NOT inconvenient. Before using a placeholder, you MUST have attempted ALL of the following: + +1. βœ… Tried using Bedrock as the model provider (if feature works with multiple providers) +2. βœ… Tried installing the required SDK/dependency +3. βœ… Tried mocking the external service +4. βœ… Tried using the project's test fixtures (`tests/fixtures/mocked_model_provider.py`) +5. βœ… Tried adapting patterns from `tests_integ/` integration tests +6. βœ… Tried simplifying the example to remove external dependencies + +Only if ALL applicable approaches above have been attempted and failed should you use a placeholder: +- You MUST document which approaches you tried and why they failed in the Exclusions Comment (Step 6.3) +- You MUST use the placeholder format in the release notes: ```markdown ### Feature Name - [PR#123](link) Description of the feature and its impact. \`\`\` - # TODO: Could not verify a code sample successfully because [specific reason, e.g., "feature requires live AWS credentials", "complex integration with external service that cannot be mocked"] + # TODO: Could not verify a code sample successfully because [specific reason] + # Attempted: [list what you tried, e.g., "Bedrock (not applicable - OpenAI-specific), SDK installation (succeeded), mocking (failed because X)"] # Manual code sample creation required \`\`\` ``` +- You MUST NOT use the placeholder simply because validation is difficult or time-consuming - You MUST NOT invent alternative "validation" methods like source verification or API review - You MUST NOT include unvalidated code examples - use the placeholder instead #### 4.3 Handle Validation Failures -Address any validation failures before including snippets in release notes. +Address any validation failures before including snippets in release notes. 
**Do not give up easily - try multiple approaches.** **Constraints:** - You MUST NOT include unvalidated code snippets in release notes @@ -400,6 +426,12 @@ Address any validation failures before including snippets in release notes. - You MUST revise the code snippet if validation fails - You MUST re-run validation after making changes - You MUST ensure revised tests include behavioral assertions +- You MUST try multiple approaches before giving up: + 1. Try using Bedrock instead of other model providers + 2. Try installing missing dependencies + 3. Try mocking external services + 4. Try using project test fixtures + 5. Try simplifying the example - You SHOULD examine the actual implementation in the PR if generated code fails - You SHOULD examine existing tests in the PR for patterns that verify behavior - You SHOULD simplify the example if complexity is causing validation issues, but maintain behavioral assertions From f40cc51dfc38225cf713f13b9aff485a15c8087a Mon Sep 17 00:00:00 2001 From: Mackenzie Zastrow Date: Fri, 2 Jan 2026 15:17:26 -0500 Subject: [PATCH 09/17] Try again --- .github/agent-sops/task-release-notes.sop.md | 211 +++++++++++-------- 1 file changed, 126 insertions(+), 85 deletions(-) diff --git a/.github/agent-sops/task-release-notes.sop.md b/.github/agent-sops/task-release-notes.sop.md index b741e65ec..7e280ee66 100644 --- a/.github/agent-sops/task-release-notes.sop.md +++ b/.github/agent-sops/task-release-notes.sop.md @@ -236,7 +236,9 @@ When existing examples are insufficient, generate new code snippets. **Note**: This phase is REQUIRED for all code snippets (extracted or generated) that will appear in Major Features sections. Validation must occur AFTER snippets have been extracted or generated in Step 3. -**PRIMARY GOAL: Every Major Feature MUST have a validated code example.** The placeholder comment is an absolute last resort, not a convenient escape hatch. You must try extremely hard to create working, validated examples for every feature. Users rely on these examples to understand how to use new features. +**CRITICAL RULE: Every Major Feature MUST have a code example in the release notes.** You cannot remove a feature from the release notes because validation failed. If validation fails, you MUST: +1. Use a code sample extracted from the PR (description, tests, or implementation) +2. Mark it clearly as requiring engineer validation **Critical**: Validation tests MUST verify the actual behavior of the feature, not just syntax correctness. A test that only checks whether code parses or imports succeed is NOT valid validation. The purpose of validation is to prove the code example actually works and demonstrates the feature as intended. @@ -352,19 +354,19 @@ Execute tests to ensure code snippets demonstrate working feature behavior. **Handling External Dependencies:** When a feature requires external SDKs or services (e.g., OpenAI SDK, Google Gemini SDK, AWS services): -**TRY EXTREMELY HARD to create working examples. Exhaust all options before using a placeholder.** +**Try hard to validate, but NEVER skip including the feature or its code example.** 1. **First, attempt to install the dependency** - Many SDKs can be installed and used for validation 2. **If installation succeeds**, write tests that use the real SDK with mocked API responses 3. **For model provider features, USE BEDROCK** - You have access to Amazon Bedrock. If a feature works with any model provider, test it with Bedrock instead of skipping validation. 4. 
**If the feature is provider-specific** (e.g., OpenAI-only feature), install that provider's SDK and mock the API responses -5. **Only as an absolute last resort**, if you have exhausted all options and still cannot validate, use the placeholder +5. **If validation fails after all attempts**, extract a code sample from the PR and mark it for engineer review - You MUST use Bedrock for testing when a feature works with multiple model providers - You MUST install and use provider SDKs when testing provider-specific features - You MUST mock API responses when you cannot make real API calls - You MUST NOT skip validation because "external dependencies were not installed" without first attempting installation -- You MUST NOT use the placeholder if Bedrock or mocking could work +- You MUST NOT remove a feature from release notes because validation failed - You SHOULD use the project's existing test patterns for mocking external services - You SHOULD examine how the project's existing tests handle similar dependencies - You SHOULD check `tests_integ/models/` for examples of testing with real model providers @@ -387,46 +389,56 @@ def test_custom_http_client(): assert call_kwargs.get('http_client') == custom_client ``` -**Fallback: Placeholder for Truly Impossible Validation (LAST RESORT ONLY)** -The placeholder is for situations where validation is genuinely impossible, NOT inconvenient. Before using a placeholder, you MUST have attempted ALL of the following: +**When Validation Fails: Use PR Code Sample with Engineer Review Callout** -1. βœ… Tried using Bedrock as the model provider (if feature works with multiple providers) -2. βœ… Tried installing the required SDK/dependency -3. βœ… Tried mocking the external service -4. βœ… Tried using the project's test fixtures (`tests/fixtures/mocked_model_provider.py`) -5. βœ… Tried adapting patterns from `tests_integ/` integration tests -6. βœ… Tried simplifying the example to remove external dependencies +If you cannot successfully validate a code example after attempting all approaches (Bedrock, mocking, dependency installation), you MUST still include a code example. Follow this process: -Only if ALL applicable approaches above have been attempted and failed should you use a placeholder: -- You MUST document which approaches you tried and why they failed in the Exclusions Comment (Step 6.3) -- You MUST use the placeholder format in the release notes: - ```markdown - ### Feature Name - [PR#123](link) +1. **Extract a code sample from the PR** - Use code from: + - The PR description's code examples + - Test files added in the PR + - The actual implementation (simplified for readability) + - Documentation updates in the PR +2. **Include the sample in the release notes** with a clear callout that it needs engineer validation +3. **Document the validation failure** in the Exclusions Comment (Step 6.3) - Description of the feature and its impact. 
+**Format for unvalidated code examples:** +```markdown +### Feature Name - [PR#123](link) - \`\`\` - # TODO: Could not verify a code sample successfully because [specific reason] - # Attempted: [list what you tried, e.g., "Bedrock (not applicable - OpenAI-specific), SDK installation (succeeded), mocking (failed because X)"] - # Manual code sample creation required - \`\`\` - ``` -- You MUST NOT use the placeholder simply because validation is difficult or time-consuming -- You MUST NOT invent alternative "validation" methods like source verification or API review -- You MUST NOT include unvalidated code examples - use the placeholder instead +Description of the feature and its impact. + +\`\`\`python +# ⚠️ NEEDS ENGINEER VALIDATION - Could not automatically verify this example +# Reason: [specific reason, e.g., "requires OpenAI API credentials", "complex integration setup"] + +# Code sample extracted from PR description/tests +from strands import Agent +from strands.models.openai import OpenAIModel + +model = OpenAIModel(http_client=custom_client) +agent = Agent(model=model) +\`\`\` +``` + +**Constraints:** +- You MUST NOT remove a feature from release notes because validation failed +- You MUST extract actual code from the PR to use as the example +- You MUST include the warning comment at the top of unvalidated examples +- You MUST include the reason validation failed in the warning +- You MUST document all unvalidated examples in the Exclusions Comment #### 4.3 Handle Validation Failures -Address any validation failures before including snippets in release notes. **Do not give up easily - try multiple approaches.** +Address any validation failures before including snippets in release notes. **Do not give up easily - try multiple approaches. But NEVER remove a feature because validation failed.** **Constraints:** -- You MUST NOT include unvalidated code snippets in release notes +- You MUST NOT remove a feature from release notes because validation failed - You MUST NOT consider syntax-only tests as valid validation - You MUST NOT invent alternative validation methods (source verification, API review, conceptual correctness, etc.) -- You MUST revise the code snippet if validation fails +- You MUST revise the code snippet if validation fails and try again - You MUST re-run validation after making changes - You MUST ensure revised tests include behavioral assertions -- You MUST try multiple approaches before giving up: +- You MUST try multiple approaches before falling back to unvalidated PR code: 1. Try using Bedrock instead of other model providers 2. Try installing missing dependencies 3. Try mocking external services @@ -436,20 +448,16 @@ Address any validation failures before including snippets in release notes. 
**Do - You SHOULD examine existing tests in the PR for patterns that verify behavior - You SHOULD simplify the example if complexity is causing validation issues, but maintain behavioral assertions - You MAY extract a different example from the PR if the current one cannot be behaviorally validated -- You MAY seek clarification if you cannot create a valid example with behavioral verification - You MUST preserve the test file content to include in the GitHub issue comment (Step 6.1) - You MUST note in the validation comment what specific behavior each test verifies - You MAY delete temporary test files after capturing their content, as the environment is ephemeral -**If validation is not possible after reasonable attempts:** -- You MUST use a placeholder code block instead of an unvalidated example: - ``` - # TODO: Could not verify a code sample successfully because [reason] - # Manual code sample creation required - ``` -- You MUST document the exclusion in the Exclusions Comment (Step 6.3) -- You MUST NOT include unvalidated code with a note that it was "validated through source review" or similar -- The feature still appears in release notes with the placeholder, signaling that manual review is needed +**If validation is not possible after all attempts:** +- You MUST extract a code sample from the PR (description, tests, or implementation) +- You MUST include the code sample with the engineer validation warning (see format above) +- You MUST document the failure in the Exclusions Comment (Step 6.3) +- You MUST NOT leave the feature without a code example +- You MUST NOT use empty placeholders ### 5. Release Notes Formatting @@ -556,7 +564,7 @@ This ordering allows reviewers to see the validation evidence, review the releas Batch all validation code into a single GitHub issue comment. **Constraints:** -- You MUST post ONE comment containing ALL validation code for ALL features that have code examples +- You MUST post ONE comment containing ALL validation code for features that were successfully validated - You MUST NOT post separate comments for each feature's validation - You MUST post this comment BEFORE the release notes comment - You MUST include all test files created during validation (Step 4) in this single comment @@ -565,8 +573,7 @@ Batch all validation code into a single GitHub issue comment. - You MUST clearly label this comment as "Code Validation Tests" - You MUST include a note explaining that this code was used to validate the snippets in the release notes - You MUST NOT include "batch validation notes" claiming features were validated through source review, API structure validation, or conceptual correctness -- Every feature with a code example in the release notes MUST have a corresponding executed test in this comment -- Features without executed tests MUST NOT have code examples in the release notes +- You MUST list which features have unvalidated code samples (marked for engineer review) at the end of this comment - You SHOULD use collapsible `
` sections to organize validation code by feature - You SHOULD include a brief description of what behavior is being verified for each test: ```markdown @@ -575,7 +582,7 @@ Batch all validation code into a single GitHub issue comment. The following test code was used to validate the code examples in the release notes.
- Validation: Feature Name 1 + βœ… Validated: Feature Name 1 **Behavior verified:** This test confirms that the new `output_schema` parameter causes the agent to return a validated Pydantic model instance with the correct field types. @@ -586,7 +593,7 @@ Batch all validation code into a single GitHub issue comment.
- Validation: Feature Name 2 + βœ… Validated: Feature Name 2 **Behavior verified:** This test confirms that async streaming yields events in real-time and that the final result contains all streamed content. @@ -595,8 +602,17 @@ Batch all validation code into a single GitHub issue comment. \`\`\`
+ + --- + + ### ⚠️ Features Requiring Engineer Validation + + The following features have code samples extracted from PRs but could not be automatically validated: + + - **Feature Name 3** (PR#456): Could not validate because [reason]. Code sample extracted from PR description. + - **Feature Name 4** (PR#789): Could not validate because [reason]. Code sample extracted from PR test files. ``` -- This allows reviewers to understand what was validated and copy/run the validation code themselves +- This allows reviewers to understand what was validated and which examples need manual review #### 6.2 Post Release Notes Comment @@ -614,38 +630,43 @@ Post the formatted release notes as a single GitHub issue comment. #### 6.3 Post Exclusions Comment -Document any features or bug fixes that were considered but excluded from the release notes. +Document any features with unvalidated code samples and any other notable decisions. -**Critical**: This comment is REQUIRED whenever you decide that a categorized Major Feature or Major Bug Fix does not warrant inclusion in the final release notes, does not need a code example, or was downgraded to Minor Changes during the process. +**Critical**: This comment is REQUIRED to document features that have unvalidated code samples requiring engineer review. **Constraints:** - You MUST post this comment as the FINAL comment on the GitHub issue - You MUST include this comment if ANY of the following occurred: - - A PR initially categorized as Major Feature was excluded from release notes - - A PR initially categorized as Major Bug Fix was excluded from release notes - - A Major Feature was included without a code example + - A Major Feature has an unvalidated code sample (marked for engineer review) - A feature's scope or description was significantly different from the PR description - You relied on review comments rather than the PR description to understand a feature -- You MUST clearly explain the reasoning for each exclusion or modification +- You MUST clearly explain the reasoning for each unvalidated sample - You MUST format the comment with clear sections: ```markdown - ## Release Notes Exclusions & Notes + ## Release Notes Review Notes - The following decisions were made during release notes generation: + The following items require attention during review: - ### Excluded from Major Features - - **PR#123 - Feature Title**: Excluded because [specific reason - e.g., "internal refactoring with no user-facing API changes", "feature was reverted in a later PR", "scope reduced during review to minor enhancement"] + ### ⚠️ Features with Unvalidated Code Samples - ### Excluded from Major Bug Fixes - - **PR#456 - Fix Title**: Excluded because [specific reason] + These features have code samples extracted from PRs but could not be automatically validated. 
An engineer must verify these examples before publishing: - ### Features Without Code Examples - - **PR#789 - Feature Title**: No code example provided because [specific reason - e.g., "feature is configuration-only", "existing documentation covers usage adequately", "unable to create a validated example"] + - **PR#123 - Feature Title**: + - Code source: PR description / test files / implementation + - Validation attempted: [what you tried] + - Failure reason: [why it failed, e.g., "requires OpenAI API credentials", "complex multi-service integration"] + - Action needed: Engineer should verify the code sample works as shown + + - **PR#456 - Feature Title**: + - Code source: PR test files + - Validation attempted: Bedrock (not applicable - OpenAI-specific), mocking (failed due to complex auth flow) + - Failure reason: Feature requires live OpenAI API interaction that cannot be mocked + - Action needed: Engineer should test with OpenAI credentials and update if needed ### Description vs. Implementation Discrepancies - **PR#101 - Feature Title**: PR description stated [X] but review comments and final implementation show [Y]. Release notes reflect the actual merged behavior. ``` -- You SHOULD include this comment even if there are no exclusions, with a simple note: "No features or bug fixes were excluded from this release notes draft." +- You SHOULD include this comment even if all code samples were validated, with a simple note: "All code samples were successfully validated. No engineer review required." - You MUST NOT skip this commentβ€”it provides critical transparency for reviewers #### 6.4 Handle User Feedback on Release Notes @@ -674,7 +695,7 @@ When the user requests changes to the release notes after they have been posted, ## Examples -### Example 1: Major Features Section with Code +### Example 1: Major Features Section with Validated Code ```markdown ## Major Features @@ -683,26 +704,42 @@ When the user requests changes to the release notes after they have been posted, MCP Connections via ToolProviders allow the Agent to manage connection lifecycles automatically, eliminating the need for manual context managers. This experimental interface simplifies MCP tool integration significantly. -\`\`\`[language] +\`\`\`python # Code example in the project's programming language # Demonstrate the key feature usage # Keep it focused and concise \`\`\` See the [MCP docs](https://docs.example.com/mcp) for details. +``` -### Async Streaming for Multi-Agent Systems - [PR#961](https://github.com/org/repo/pull/961) +### Example 2: Major Feature with Unvalidated Code (Needs Engineer Review) -Multi-agent systems now support async streaming, enabling real-time event streaming from agent teams as they collaborate. +```markdown +### Custom HTTP Client Support - [PR#1366](https://github.com/org/repo/pull/1366) -\`\`\`[language] -# Another code example -# Show the feature in action -# Include only essential code +OpenAI model provider now accepts a custom HTTP client, enabling proxy configuration, custom timeouts, and request logging. 
+ +\`\`\`python +# ⚠️ NEEDS ENGINEER VALIDATION - Could not automatically verify this example +# Reason: Requires OpenAI API credentials for full validation + +from strands.models.openai import OpenAIModel +import httpx + +# Create custom HTTP client with proxy and timeout settings +custom_client = httpx.Client( + proxy="http://proxy.example.com:8080", + timeout=30.0 +) + +model = OpenAIModel( + client_args={"http_client": custom_client} +) \`\`\` ``` -### Example 2: Major Bug Fixes Section +### Example 3: Major Bug Fixes Section ```markdown --- @@ -719,7 +756,7 @@ Multi-agent systems now support async streaming, enabling real-time event stream Fixed broken conversations caused by orphaned `toolUse` blocks, improving reliability when tools fail or are interrupted. ``` -### Example 3: Complete Release Notes Structure +### Example 4: Complete Release Notes Structure ```markdown ## Major Features @@ -789,11 +826,12 @@ If code validation fails for a snippet: 3. Examine the actual implementation in the PR to understand correct usage 4. Look at existing tests in the PR for patterns that verify behavior correctly 5. Try simplifying the example to focus on core functionality while maintaining behavioral assertions -6. Consider using a different example from the PR -7. If unable to validate behaviorally, note the issue in the release notes comment and skip the code example for that feature -8. Leave a comment on the issue noting which features couldn't include validated code examples +6. Try using Bedrock if the feature works with multiple model providers +7. Try mocking external services +8. **If all attempts fail**: Extract a code sample from the PR and include it with the engineer validation warning +9. Document the unvalidated sample in the Exclusions Comment -**Important**: If your tests only verify syntax (code runs without errors) but don't verify behavior (assertions about outputs, state changes, or interactions), the validation is incomplete. Revisit Step 4 and add meaningful behavioral assertions. +**Important**: You MUST NOT remove a feature from release notes because validation failed. Always include a code sample - either validated or marked for engineer review. ### Large PR Sets (>100 PRs) @@ -830,13 +868,14 @@ When GitHub tools or git operations are deferred (GITHUB_WRITE=false): ### Unable to Extract Suitable Code Examples -If no suitable code examples can be found or generated for a feature: -1. Examine the PR description more carefully for usage information -2. Look at related documentation changes -3. Consider whether the feature actually needs a code example (some features are self-explanatory) -4. Generate a minimal example based on the API changes, even if you can't fully validate it -5. Mark the example as "conceptual" if validation isn't possible -6. Consider omitting the code example if it would be misleading +If no suitable code examples can be found in a PR: +1. Check the PR description for any code snippets +2. Look at test files added or modified in the PR +3. Examine the actual implementation and create a simplified usage example +4. Look at related documentation changes +5. Check if there are example files in the `examples/` directory + +**Important**: You MUST always include a code example for Major Features. If you cannot find one in the PR, create one based on the implementation and mark it for engineer validation. Never leave a feature without a code example. 
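For illustration, the Bedrock fallback listed above (validation-failure step 6) might look like the following minimal behavioral test. This is a sketch only: the `BedrockModel` import path, its default model, and the result-to-text conversion are assumptions to adapt to the SDK's actual API.

```python
# Minimal sketch of a Bedrock-backed behavioral validation test.
# Assumptions: the SDK exports a Bedrock provider as `strands.models.BedrockModel`
# and AWS credentials with Bedrock access are available in the environment.
from strands import Agent
from strands.models import BedrockModel


def test_feature_against_bedrock():
    model = BedrockModel()  # assumed to use the provider's default model id
    agent = Agent(model=model)

    result = agent("Reply with exactly one word: pong")

    # Behavioral assertion: the response contains the expected token,
    # not merely "the call did not raise".
    assert "pong" in str(result).lower()
```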
### Stale or Inaccurate PR Descriptions @@ -852,10 +891,12 @@ If you discover that a PR description doesn't match the actual implementation: ## Desired Outcome * Focused release notes highlighting Major Features and Major Bug Fixes with concise descriptions (2-3 sentences, no bullet points) -* Working, behaviorally-validated code examples for all major features (tests must verify feature behavior, not just syntax) +* Code examples for ALL major features - either validated or marked for engineer review +* Validated code examples have passing behavioral tests +* Unvalidated code examples are clearly marked with the engineer validation warning and extracted from PR sources * Well-formatted markdown that renders properly on GitHub * Release notes posted as a comment on the GitHub issue for review -* Exclusions comment documenting any features excluded or modified, with clear reasoning for each decision +* Review notes comment documenting any features with unvalidated code samples that need engineer attention **Important**: Your generated release notes will be prepended to GitHub's auto-generated release notes. GitHub automatically generates: - "What's Changed" section listing all PRs with authors and links From 39458324e359ba2c559cdeee14727fb339b9c960 Mon Sep 17 00:00:00 2001 From: Mackenzie Zastrow Date: Fri, 2 Jan 2026 17:39:37 -0500 Subject: [PATCH 10/17] Attempt again --- .github/agent-sops/task-release-notes.sop.md | 118 ++++++++++++------- 1 file changed, 77 insertions(+), 41 deletions(-) diff --git a/.github/agent-sops/task-release-notes.sop.md b/.github/agent-sops/task-release-notes.sop.md index 7e280ee66..a412311c7 100644 --- a/.github/agent-sops/task-release-notes.sop.md +++ b/.github/agent-sops/task-release-notes.sop.md @@ -236,9 +236,19 @@ When existing examples are insufficient, generate new code snippets. **Note**: This phase is REQUIRED for all code snippets (extracted or generated) that will appear in Major Features sections. Validation must occur AFTER snippets have been extracted or generated in Step 3. -**CRITICAL RULE: Every Major Feature MUST have a code example in the release notes.** You cannot remove a feature from the release notes because validation failed. If validation fails, you MUST: -1. Use a code sample extracted from the PR (description, tests, or implementation) -2. Mark it clearly as requiring engineer validation +**CRITICAL: You MUST attempt to validate EVERY code example.** The fallback is only for cases where you have genuinely tried and failed. You cannot skip validation - you must demonstrate that you attempted it. + +**Validation is almost always possible** because: +- You have access to Amazon Bedrock for testing model features +- Most features can be tested with mocks +- The project has extensive test fixtures in `tests/fixtures/` +- Many "external" dependencies can be installed and mocked + +**For each Major Feature, you MUST:** +1. Write a test file +2. Run the test +3. If it fails, try alternative approaches (Bedrock, mocking, different example) +4. Only after multiple documented failures can you use the engineer validation fallback **Critical**: Validation tests MUST verify the actual behavior of the feature, not just syntax correctness. A test that only checks whether code parses or imports succeed is NOT valid validation. The purpose of validation is to prove the code example actually works and demonstrates the feature as intended. @@ -247,6 +257,14 @@ When existing examples are insufficient, generate new code snippets. 
- **Project test fixtures**: The project includes mocked model providers and test utilities in `tests/fixtures/` - **Integration test patterns**: Examine `tests_integ/` for patterns that test real model interactions + +**Features that genuinely cannot be validated (rare):** +- Features requiring paid third-party API credentials with no mock option AND no Bedrock alternative +- Features requiring specific hardware (GPU, TPU) +- Features requiring live network access to specific external services that cannot be mocked + +**If your feature doesn't fall into the "genuinely cannot validate" category, you MUST validate it.** + #### 4.1 Create Temporary Test Files Create temporary test files that verify the feature's behavior. @@ -389,17 +407,27 @@ def test_custom_http_client(): assert call_kwargs.get('http_client') == custom_client ``` -**When Validation Fails: Use PR Code Sample with Engineer Review Callout** +**When Validation Fails After Genuine Attempts: Use PR Code Sample with Engineer Review Callout** + +**You can ONLY use this fallback if you have actually attempted validation and can document what you tried.** Simply stating "complex external SDK setup required" or "provider SDK setup required" without evidence of attempts is NOT acceptable. -If you cannot successfully validate a code example after attempting all approaches (Bedrock, mocking, dependency installation), you MUST still include a code example. Follow this process: +**Required proof of validation attempts:** +Before marking any example as needing engineer validation, you MUST have: +1. Created an actual test file (show the code in the validation comment) +2. Run the test and received an actual error (show the error message) +3. Tried at least ONE alternative approach (Bedrock, mocking, simplified example) +4. Documented each attempt and its failure reason +**If you cannot show this proof, you have not tried hard enough.** + +**Process when validation genuinely fails after documented attempts:** 1. **Extract a code sample from the PR** - Use code from: - The PR description's code examples - Test files added in the PR - The actual implementation (simplified for readability) - Documentation updates in the PR 2. **Include the sample in the release notes** with a clear callout that it needs engineer validation -3. **Document the validation failure** in the Exclusions Comment (Step 6.3) +3. **Document the validation attempts and failures** in the Validation Comment (Step 6.1) **Format for unvalidated code examples:** ```markdown @@ -408,8 +436,9 @@ If you cannot successfully validate a code example after attempting all approach Description of the feature and its impact. 
\`\`\`python -# ⚠️ NEEDS ENGINEER VALIDATION - Could not automatically verify this example -# Reason: [specific reason, e.g., "requires OpenAI API credentials", "complex integration setup"] +# ⚠️ NEEDS ENGINEER VALIDATION +# Validation attempted: [describe test created and error received] +# Alternative attempts: [what else you tried and why it failed] # Code sample extracted from PR description/tests from strands import Agent @@ -421,33 +450,35 @@ agent = Agent(model=model) ``` **Constraints:** -- You MUST NOT remove a feature from release notes because validation failed -- You MUST extract actual code from the PR to use as the example -- You MUST include the warning comment at the top of unvalidated examples -- You MUST include the reason validation failed in the warning -- You MUST document all unvalidated examples in the Exclusions Comment +- You MUST NOT mark examples as needing validation without actually attempting validation first +- You MUST NOT use vague reasons like "complex setup required" - be specific about what you tried and what error you got +- You MUST show your test code and error messages in the Validation Comment +- You MUST try Bedrock for any feature that works with multiple model providers before giving up +- You MUST try mocking for provider-specific features before giving up +- You MUST document all validation attempts (successful AND failed) in the Validation Comment #### 4.3 Handle Validation Failures -Address any validation failures before including snippets in release notes. **Do not give up easily - try multiple approaches. But NEVER remove a feature because validation failed.** +Address any validation failures before including snippets in release notes. **You MUST actually attempt validation - do not skip to the fallback.** **Constraints:** -- You MUST NOT remove a feature from release notes because validation failed +- You MUST actually run tests before claiming validation failed +- You MUST NOT skip validation and go straight to "NEEDS ENGINEER VALIDATION" - You MUST NOT consider syntax-only tests as valid validation - You MUST NOT invent alternative validation methods (source verification, API review, conceptual correctness, etc.) - You MUST revise the code snippet if validation fails and try again - You MUST re-run validation after making changes - You MUST ensure revised tests include behavioral assertions -- You MUST try multiple approaches before falling back to unvalidated PR code: - 1. Try using Bedrock instead of other model providers - 2. Try installing missing dependencies - 3. Try mocking external services - 4. Try using project test fixtures - 5. Try simplifying the example +- You MUST try these approaches IN ORDER before using the fallback: + 1. Run the test as-is and capture the error + 2. Try using Bedrock instead of other model providers + 3. Try installing missing dependencies + 4. Try mocking external services + 5. Try using project test fixtures (`tests/fixtures/mocked_model_provider.py`) + 6. 
Try simplifying the example +- You MUST document each attempt and its result in the Validation Comment - You SHOULD examine the actual implementation in the PR if generated code fails - You SHOULD examine existing tests in the PR for patterns that verify behavior -- You SHOULD simplify the example if complexity is causing validation issues, but maintain behavioral assertions -- You MAY extract a different example from the PR if the current one cannot be behaviorally validated - You MUST preserve the test file content to include in the GitHub issue comment (Step 6.1) - You MUST note in the validation comment what specific behavior each test verifies - You MAY delete temporary test files after capturing their content, as the environment is ephemeral @@ -561,19 +592,17 @@ This ordering allows reviewers to see the validation evidence, review the releas #### 6.1 Post Validation Code Comment -Batch all validation code into a single GitHub issue comment. +Batch all validation code into a single GitHub issue comment. **This comment MUST show evidence that you attempted validation for EVERY feature.** **Constraints:** -- You MUST post ONE comment containing ALL validation code for features that were successfully validated +- You MUST post ONE comment containing validation attempts for ALL Major Features +- You MUST show test code for EVERY feature - both successful and failed attempts - You MUST NOT post separate comments for each feature's validation - You MUST post this comment BEFORE the release notes comment - You MUST include all test files created during validation (Step 4) in this single comment - You MUST document what specific behavior each test verifies (not just "validates the code works") - You MUST NOT reference local file pathsβ€”the ephemeral environment will be destroyed - You MUST clearly label this comment as "Code Validation Tests" -- You MUST include a note explaining that this code was used to validate the snippets in the release notes -- You MUST NOT include "batch validation notes" claiming features were validated through source review, API structure validation, or conceptual correctness -- You MUST list which features have unvalidated code samples (marked for engineer review) at the end of this comment - You SHOULD use collapsible `
` sections to organize validation code by feature - You SHOULD include a brief description of what behavior is being verified for each test: ```markdown @@ -590,29 +619,36 @@ Batch all validation code into a single GitHub issue comment. [Full test file for feature 1 with behavioral assertions] \`\`\` + **Test output:** PASSED +
- βœ… Validated: Feature Name 2 - - **Behavior verified:** This test confirms that async streaming yields events in real-time and that the final result contains all streamed content. + ⚠️ Could Not Validate: Feature Name 2 + **Attempt 1: Direct test with mocked model** \`\`\`python - [Full test file for feature 2 with behavioral assertions] + [Test code that was attempted] + \`\`\` + **Error received:** + \`\`\` + [Actual error message from running the test] \`\`\` -
- - --- - - ### ⚠️ Features Requiring Engineer Validation + **Attempt 2: Test with Bedrock** + \`\`\`python + [Alternative test code attempted] + \`\`\` + **Error received:** + \`\`\` + [Actual error message] + \`\`\` - The following features have code samples extracted from PRs but could not be automatically validated: + **Conclusion:** Could not validate because [specific reason based on actual errors]. Code sample in release notes extracted from PR description. - - **Feature Name 3** (PR#456): Could not validate because [reason]. Code sample extracted from PR description. - - **Feature Name 4** (PR#789): Could not validate because [reason]. Code sample extracted from PR test files. +
``` -- This allows reviewers to understand what was validated and which examples need manual review +- This allows reviewers to verify that validation was genuinely attempted #### 6.2 Post Release Notes Comment From 7a7a3c2ea9a3b8e490cbc041a889f2aa47f24731 Mon Sep 17 00:00:00 2001 From: Mackenzie Zastrow Date: Fri, 2 Jan 2026 17:41:39 -0500 Subject: [PATCH 11/17] Attempt to reduce verbosity --- .github/agent-sops/task-release-notes.sop.md | 442 ++++++------------- 1 file changed, 136 insertions(+), 306 deletions(-) diff --git a/.github/agent-sops/task-release-notes.sop.md b/.github/agent-sops/task-release-notes.sop.md index a412311c7..8434b9de5 100644 --- a/.github/agent-sops/task-release-notes.sop.md +++ b/.github/agent-sops/task-release-notes.sop.md @@ -8,6 +8,22 @@ You analyze merged pull requests between two git references (tags or branches), **Important**: You are executing in an ephemeral environment. Any files you create (test files, notes, etc.) will be discarded after execution. All deliverablesβ€”release notes, validation code, categorization listsβ€”MUST be posted as GitHub issue comments to be preserved and accessible to reviewers. +## Key Principles + +These principles apply throughout the entire workflow and are referenced by name in later sections. + +### Principle 1: Ephemeral Environment +You are executing in an ephemeral environment. All deliverables MUST be posted as GitHub issue comments to be preserved. + +### Principle 2: PR Descriptions May Be Stale +PR descriptions are written at PR creation and may become outdated after code review. Reviewers often request structural changes, API modifications, or feature adjustments that are implemented but NOT reflected in the original description. You MUST cross-reference descriptions with review comments and treat merged code as the source of truth. + +### Principle 3: Validation Is Mandatory +You MUST attempt to validate EVERY code example with behavioral tests. The engineer review fallback is only for cases where you have genuinely tried and failed with documented evidence. + +### Principle 4: Never Remove Features +You MUST NOT remove a feature from release notes because validation failed. Always include a code sampleβ€”either validated or marked for engineer review. + ## Steps ### 1. 
Setup and Input Processing @@ -62,13 +78,10 @@ For each PR identified (from release or API query), fetch additional metadata ne - You MUST retrieve additional metadata for PRs being considered for Major Features or Major Bug Fixes: - PR description/body (essential for understanding the change) - PR labels (if any) - - PR review comments and conversation threads (to identify post-description changes) + - PR review comments and conversation threads (per **Principle 2**) - You SHOULD retrieve for Major Feature candidates: - Files changed in the PR (to find code examples) -- You MUST retrieve PR review comments for Major Feature and Major Bug Fix candidates: - - Review comments often contain important context about changes made after the initial description - - Look for reviewer requests that resulted in structural changes to the implementation - - Check for author responses indicating significant modifications +- You MUST retrieve PR review comments for Major Feature and Major Bug Fix candidates to identify post-description changes - You SHOULD minimize API calls by only fetching detailed metadata for PRs that appear significant based on title/prefix - You MUST track this data for use in categorization and release notes generation @@ -96,12 +109,9 @@ Extract categorization signals from PR titles using conventional commit prefixes Use LLM analysis to understand the significance and user impact of each change. -**Critical Warning - Stale PR Descriptions:** -PR descriptions are written at the time of PR creation and may become outdated after code review. Reviewers often request structural changes, API modifications, or feature adjustments that are implemented but NOT reflected in the original description. You MUST cross-reference the description with review comments to get an accurate understanding of the final merged code. - **Constraints:** - You MUST read and analyze the PR description for each PR -- You MUST also review PR comments and review threads to identify changes made after the initial description: +- Per **Principle 2**, you MUST also review PR comments and review threads to identify changes made after the initial description: - Look for reviewer comments requesting changes to the implementation - Look for author responses confirming changes were made - Look for "LGTM" or approval comments that reference specific modifications @@ -164,13 +174,10 @@ Present the categorized PRs to the user for review and confirmation. - You MUST wait for user confirmation or recategorization before proceeding - You SHOULD update your categorization based on user feedback - You MAY iterate on categorization if the user requests changes - -**Critical - Re-validation After Recategorization:** -When the user promotes a PR to "Major Features" that was not previously in that category: -- You MUST perform Step 3 (Code Snippet Extraction) for the newly promoted PR -- You MUST perform Step 4 (Code Validation) for any code snippets extracted or generated -- You MUST NOT skip validation just because the user requested the change -- You MUST include the validation code for newly promoted features in the Validation Comment (Step 6.1) +- When the user promotes a PR to "Major Features" that was not previously in that category: + - You MUST perform Step 3 (Code Snippet Extraction) for the newly promoted PR + - You MUST perform Step 4 (Code Validation) for any code snippets extracted or generated + - You MUST include the validation code for newly promoted features in the Validation Comment (Step 6.1) ### 3. 
Code Snippet Extraction and Generation @@ -180,14 +187,11 @@ When the user promotes a PR to "Major Features" that was not previously in that Search merged PRs for existing code that demonstrates the new feature. -**Critical Warning - Verify Examples Against Final Implementation:** -Code examples in PR descriptions may be outdated if the implementation changed during review. Always verify that examples match the actual merged code by checking review comments for requested changes and examining the final implementation. - **Constraints:** - You MUST search each Major Feature PR for existing code examples in: - Test files (especially integration tests or example tests) - these are most reliable as they reflect the final implementation - Example applications or scripts in `examples/` directory - - Code snippets in the PR description (but verify against review comments and final code) + - Code snippets in the PR description (but verify per **Principle 2**) - Documentation updates that include code examples - README updates with usage examples - You MUST cross-reference any examples from PR descriptions with: @@ -234,41 +238,22 @@ When existing examples are insufficient, generate new code snippets. ### 4. Code Validation -**Note**: This phase is REQUIRED for all code snippets (extracted or generated) that will appear in Major Features sections. Validation must occur AFTER snippets have been extracted or generated in Step 3. - -**CRITICAL: You MUST attempt to validate EVERY code example.** The fallback is only for cases where you have genuinely tried and failed. You cannot skip validation - you must demonstrate that you attempted it. +**Note**: This phase is REQUIRED for all code snippets (extracted or generated) that will appear in Major Features sections. Per **Principle 3**, you MUST attempt validation for every example. -**Validation is almost always possible** because: -- You have access to Amazon Bedrock for testing model features -- Most features can be tested with mocks -- The project has extensive test fixtures in `tests/fixtures/` -- Many "external" dependencies can be installed and mocked +#### 4.1 Validation Requirements -**For each Major Feature, you MUST:** -1. Write a test file -2. Run the test -3. If it fails, try alternative approaches (Bedrock, mocking, different example) -4. Only after multiple documented failures can you use the engineer validation fallback - -**Critical**: Validation tests MUST verify the actual behavior of the feature, not just syntax correctness. A test that only checks whether code parses or imports succeed is NOT valid validation. The purpose of validation is to prove the code example actually works and demonstrates the feature as intended. +Validation tests MUST verify the actual behavior of the feature, not just syntax correctness. A test that only checks whether code parses or imports succeed is NOT valid validation. **Available Testing Resources:** - **Amazon Bedrock**: You have access to Bedrock models for testing. Use Bedrock when a feature requires a real model provider. 
- **Project test fixtures**: The project includes mocked model providers and test utilities in `tests/fixtures/` - **Integration test patterns**: Examine `tests_integ/` for patterns that test real model interactions - **Features that genuinely cannot be validated (rare):** - Features requiring paid third-party API credentials with no mock option AND no Bedrock alternative - Features requiring specific hardware (GPU, TPU) - Features requiring live network access to specific external services that cannot be mocked -**If your feature doesn't fall into the "genuinely cannot validate" category, you MUST validate it.** - -#### 4.1 Create Temporary Test Files - -Create temporary test files that verify the feature's behavior. - **Constraints:** - You MUST create a temporary test file for each code snippet - You MUST place test files in an appropriate test directory based on the project structure @@ -325,9 +310,20 @@ def test_structured_output_syntax(): agent("Get user info") ``` -#### 4.2 Run Validation Tests +#### 4.2 Validation Workflow -Execute tests to ensure code snippets demonstrate working feature behavior. +For each Major Feature, follow this workflow in order: + +1. **Write a test file** with behavioral assertions +2. **Run the test** using the project's test framework +3. **If it fails**, try these approaches in order: + - Try using Bedrock instead of other model providers + - Try installing missing dependencies + - Try mocking external services + - Try using project test fixtures (`tests/fixtures/mocked_model_provider.py`) + - Try simplifying the example +4. **Document each attempt** and its result in the Validation Comment +5. **Only after documented failures** can you use the engineer review fallback **Constraints:** - You MUST run the appropriate test command for the project (e.g., `npm test`, `pytest`, `go test`) @@ -336,58 +332,15 @@ Execute tests to ensure code snippets demonstrate working feature behavior. 
- You MUST check that the code compiles without errors in compiled languages - You MUST ensure tests include meaningful assertions about feature behavior - You SHOULD run type checking if applicable (e.g., `npm run type-check`, `mypy`) -- You SHOULD review test output to confirm behavioral assertions passed, not just that the test didn't error +- You SHOULD review test output to confirm behavioral assertions passed - You MAY need to adjust imports or setup code if tests fail -**Installing Dependencies for Validation:** +**Installing Dependencies:** - You MUST attempt to install missing dependencies when tests fail due to import errors - You SHOULD check the project's `pyproject.toml`, `package.json`, or equivalent for optional dependency groups - You SHOULD use the project's package manager to install dependencies (e.g., `pip install`, `npm install`, `hatch`) - For Python projects with optional extras, try: `pip install -e ".[extra_name]"` or `pip install package_name` -- You MUST NOT skip validation simply because a dependency is missing - attempt installation first - You SHOULD only fall back to mocking if the dependency cannot be installed (e.g., requires paid API keys, proprietary software) -- Common optional dependencies to check for: - - Model provider SDKs: `openai`, `anthropic`, `google-generativeai`, `boto3` - - Testing utilities: `pytest`, `pytest-asyncio`, `moto` - - Type checking: `mypy`, `types-*` packages - -**What constitutes valid behavioral verification:** -- Testing that a new API returns expected data structures -- Testing that a new option/parameter changes behavior as documented -- Testing that callbacks are invoked with correct arguments -- Testing that error handling works as described -- Testing that integrations connect and exchange data correctly (with mocks if needed) - -**What does NOT constitute valid verification:** -- Code executes without raising exceptions -- Objects can be constructed -- Functions can be called -- Imports resolve successfully -- Type hints are valid -- "Source verification" or reviewing PR descriptions -- "API structure validation" or checking import paths exist -- "Conceptual correctness" or any form of manual review -- Claiming external dependencies prevent testing (use mocks instead) - -**Handling External Dependencies:** -When a feature requires external SDKs or services (e.g., OpenAI SDK, Google Gemini SDK, AWS services): - -**Try hard to validate, but NEVER skip including the feature or its code example.** - -1. **First, attempt to install the dependency** - Many SDKs can be installed and used for validation -2. **If installation succeeds**, write tests that use the real SDK with mocked API responses -3. **For model provider features, USE BEDROCK** - You have access to Amazon Bedrock. If a feature works with any model provider, test it with Bedrock instead of skipping validation. -4. **If the feature is provider-specific** (e.g., OpenAI-only feature), install that provider's SDK and mock the API responses -5. 
**If validation fails after all attempts**, extract a code sample from the PR and mark it for engineer review - -- You MUST use Bedrock for testing when a feature works with multiple model providers -- You MUST install and use provider SDKs when testing provider-specific features -- You MUST mock API responses when you cannot make real API calls -- You MUST NOT skip validation because "external dependencies were not installed" without first attempting installation -- You MUST NOT remove a feature from release notes because validation failed -- You SHOULD use the project's existing test patterns for mocking external services -- You SHOULD examine how the project's existing tests handle similar dependencies -- You SHOULD check `tests_integ/models/` for examples of testing with real model providers **Example of mocking external dependencies:** ```python @@ -407,20 +360,28 @@ def test_custom_http_client(): assert call_kwargs.get('http_client') == custom_client ``` -**When Validation Fails After Genuine Attempts: Use PR Code Sample with Engineer Review Callout** +#### 4.3 Engineer Review Fallback -**You can ONLY use this fallback if you have actually attempted validation and can document what you tried.** Simply stating "complex external SDK setup required" or "provider SDK setup required" without evidence of attempts is NOT acceptable. +When validation genuinely fails after documented attempts, use this fallback. Per **Principle 4**, you MUST still include the feature with a code sample. -**Required proof of validation attempts:** -Before marking any example as needing engineer validation, you MUST have: +**Required proof before using this fallback:** 1. Created an actual test file (show the code in the validation comment) -2. Run the test and received an actual error (show the error message) +2. Ran the test and received an actual error (show the error message) 3. Tried at least ONE alternative approach (Bedrock, mocking, simplified example) 4. Documented each attempt and its failure reason -**If you cannot show this proof, you have not tried hard enough.** +**Constraints:** +- You MUST NOT mark examples as needing validation without actually attempting validation first +- You MUST NOT use vague reasons like "complex setup required" - be specific about what you tried and what error you got +- You MUST show your test code and error messages in the Validation Comment +- You MUST try Bedrock for any feature that works with multiple model providers before giving up +- You MUST try mocking for provider-specific features before giving up +- You MUST document all validation attempts (successful AND failed) in the Validation Comment +- You MUST preserve the test file content to include in the GitHub issue comment (Step 6.1) +- You MUST note in the validation comment what specific behavior each test verifies +- You MAY delete temporary test files after capturing their content, as the environment is ephemeral -**Process when validation genuinely fails after documented attempts:** +**Process when validation genuinely fails:** 1. 
**Extract a code sample from the PR** - Use code from: - The PR description's code examples - Test files added in the PR @@ -449,47 +410,6 @@ agent = Agent(model=model) \`\`\` ``` -**Constraints:** -- You MUST NOT mark examples as needing validation without actually attempting validation first -- You MUST NOT use vague reasons like "complex setup required" - be specific about what you tried and what error you got -- You MUST show your test code and error messages in the Validation Comment -- You MUST try Bedrock for any feature that works with multiple model providers before giving up -- You MUST try mocking for provider-specific features before giving up -- You MUST document all validation attempts (successful AND failed) in the Validation Comment - -#### 4.3 Handle Validation Failures - -Address any validation failures before including snippets in release notes. **You MUST actually attempt validation - do not skip to the fallback.** - -**Constraints:** -- You MUST actually run tests before claiming validation failed -- You MUST NOT skip validation and go straight to "NEEDS ENGINEER VALIDATION" -- You MUST NOT consider syntax-only tests as valid validation -- You MUST NOT invent alternative validation methods (source verification, API review, conceptual correctness, etc.) -- You MUST revise the code snippet if validation fails and try again -- You MUST re-run validation after making changes -- You MUST ensure revised tests include behavioral assertions -- You MUST try these approaches IN ORDER before using the fallback: - 1. Run the test as-is and capture the error - 2. Try using Bedrock instead of other model providers - 3. Try installing missing dependencies - 4. Try mocking external services - 5. Try using project test fixtures (`tests/fixtures/mocked_model_provider.py`) - 6. Try simplifying the example -- You MUST document each attempt and its result in the Validation Comment -- You SHOULD examine the actual implementation in the PR if generated code fails -- You SHOULD examine existing tests in the PR for patterns that verify behavior -- You MUST preserve the test file content to include in the GitHub issue comment (Step 6.1) -- You MUST note in the validation comment what specific behavior each test verifies -- You MAY delete temporary test files after capturing their content, as the environment is ephemeral - -**If validation is not possible after all attempts:** -- You MUST extract a code sample from the PR (description, tests, or implementation) -- You MUST include the code sample with the engineer validation warning (see format above) -- You MUST document the failure in the Exclusions Comment (Step 6.3) -- You MUST NOT leave the feature without a code example -- You MUST NOT use empty placeholders - ### 5. Release Notes Formatting #### 5.1 Format Major Features Section @@ -517,9 +437,16 @@ Create the Major Features section with concise descriptions and code examples. Agents can now validate responses against predefined schemas with configurable retry behavior for non-conforming outputs. -\`\`\`[language] -# Code example in the project's programming language -# Show the feature in action with clear, focused code +\`\`\`python +from strands import Agent +from pydantic import BaseModel + +class Response(BaseModel): + answer: str + +agent = Agent(output_schema=Response) +result = agent("What is 2+2?") +print(result.output.answer) \`\`\` See the [Structured Output docs](https://docs.example.com/structured-output) for configuration options. 
@@ -564,19 +491,9 @@ Add a horizontal rule to separate your content from GitHub's auto-generated sect - This visually separates your curated content from GitHub's auto-generated "What's Changed" and "New Contributors" sections - You MUST NOT include a "Full Changelog" linkβ€”GitHub adds this automatically -**Example format**: -```markdown -## Major Bug Fixes - -- **Critical Fix** - [PR#124](https://github.com/owner/repo/pull/124) - Description of what was fixed. - ---- -``` - ### 6. Output Delivery -**Critical**: You are running in an ephemeral environment. All files created during execution (test files, temporary notes, etc.) will be deleted when the workflow completes. You MUST post all deliverables as GitHub issue commentsβ€”this is the only way to preserve your work and make it accessible to reviewers. +Per **Principle 1**, all deliverables must be posted as GitHub issue comments. **Comment Structure**: Post exactly three comments on the GitHub issue: 1. **Validation Comment** (first): Contains all validation code for all features in one batched comment @@ -592,7 +509,7 @@ This ordering allows reviewers to see the validation evidence, review the releas #### 6.1 Post Validation Code Comment -Batch all validation code into a single GitHub issue comment. **This comment MUST show evidence that you attempted validation for EVERY feature.** +Batch all validation code into a single GitHub issue comment. **Constraints:** - You MUST post ONE comment containing validation attempts for ALL Major Features @@ -604,51 +521,52 @@ Batch all validation code into a single GitHub issue comment. **This comment MUS - You MUST NOT reference local file pathsβ€”the ephemeral environment will be destroyed - You MUST clearly label this comment as "Code Validation Tests" - You SHOULD use collapsible `
` sections to organize validation code by feature -- You SHOULD include a brief description of what behavior is being verified for each test: - ```markdown - ## Code Validation Tests +- You SHOULD include a brief description of what behavior is being verified for each test - The following test code was used to validate the code examples in the release notes. +**Format:** +```markdown +## Code Validation Tests -
- βœ… Validated: Feature Name 1 +The following test code was used to validate the code examples in the release notes. - **Behavior verified:** This test confirms that the new `output_schema` parameter causes the agent to return a validated Pydantic model instance with the correct field types. +
+βœ… Validated: Feature Name 1 - \`\`\`python - [Full test file for feature 1 with behavioral assertions] - \`\`\` +**Behavior verified:** This test confirms that the new `output_schema` parameter causes the agent to return a validated Pydantic model instance with the correct field types. + +\`\`\`python +[Full test file for feature 1 with behavioral assertions] +\`\`\` - **Test output:** PASSED +**Test output:** PASSED -
+
-
- ⚠️ Could Not Validate: Feature Name 2 +
+⚠️ Could Not Validate: Feature Name 2 - **Attempt 1: Direct test with mocked model** - \`\`\`python - [Test code that was attempted] - \`\`\` - **Error received:** - \`\`\` - [Actual error message from running the test] - \`\`\` +**Attempt 1: Direct test with mocked model** +\`\`\`python +[Test code that was attempted] +\`\`\` +**Error received:** +\`\`\` +[Actual error message from running the test] +\`\`\` - **Attempt 2: Test with Bedrock** - \`\`\`python - [Alternative test code attempted] - \`\`\` - **Error received:** - \`\`\` - [Actual error message] - \`\`\` +**Attempt 2: Test with Bedrock** +\`\`\`python +[Alternative test code attempted] +\`\`\` +**Error received:** +\`\`\` +[Actual error message] +\`\`\` - **Conclusion:** Could not validate because [specific reason based on actual errors]. Code sample in release notes extracted from PR description. +**Conclusion:** Could not validate because [specific reason based on actual errors]. Code sample in release notes extracted from PR description. -
- ``` -- This allows reviewers to verify that validation was genuinely attempted +
+``` #### 6.2 Post Release Notes Comment @@ -668,8 +586,6 @@ Post the formatted release notes as a single GitHub issue comment. Document any features with unvalidated code samples and any other notable decisions. -**Critical**: This comment is REQUIRED to document features that have unvalidated code samples requiring engineer review. - **Constraints:** - You MUST post this comment as the FINAL comment on the GitHub issue - You MUST include this comment if ANY of the following occurred: @@ -677,40 +593,33 @@ Document any features with unvalidated code samples and any other notable decisi - A feature's scope or description was significantly different from the PR description - You relied on review comments rather than the PR description to understand a feature - You MUST clearly explain the reasoning for each unvalidated sample -- You MUST format the comment with clear sections: - ```markdown - ## Release Notes Review Notes +- You SHOULD include this comment even if all code samples were validated, with a simple note: "All code samples were successfully validated. No engineer review required." +- You MUST NOT skip this commentβ€”it provides critical transparency for reviewers - The following items require attention during review: +**Format:** +```markdown +## Release Notes Review Notes - ### ⚠️ Features with Unvalidated Code Samples +The following items require attention during review: - These features have code samples extracted from PRs but could not be automatically validated. An engineer must verify these examples before publishing: +### ⚠️ Features with Unvalidated Code Samples - - **PR#123 - Feature Title**: - - Code source: PR description / test files / implementation - - Validation attempted: [what you tried] - - Failure reason: [why it failed, e.g., "requires OpenAI API credentials", "complex multi-service integration"] - - Action needed: Engineer should verify the code sample works as shown +These features have code samples extracted from PRs but could not be automatically validated. An engineer must verify these examples before publishing: - - **PR#456 - Feature Title**: - - Code source: PR test files - - Validation attempted: Bedrock (not applicable - OpenAI-specific), mocking (failed due to complex auth flow) - - Failure reason: Feature requires live OpenAI API interaction that cannot be mocked - - Action needed: Engineer should test with OpenAI credentials and update if needed +- **PR#123 - Feature Title**: + - Code source: PR description / test files / implementation + - Validation attempted: [what you tried] + - Failure reason: [why it failed, e.g., "requires OpenAI API credentials", "complex multi-service integration"] + - Action needed: Engineer should verify the code sample works as shown - ### Description vs. Implementation Discrepancies - - **PR#101 - Feature Title**: PR description stated [X] but review comments and final implementation show [Y]. Release notes reflect the actual merged behavior. - ``` -- You SHOULD include this comment even if all code samples were validated, with a simple note: "All code samples were successfully validated. No engineer review required." -- You MUST NOT skip this commentβ€”it provides critical transparency for reviewers +### Description vs. Implementation Discrepancies +- **PR#101 - Feature Title**: PR description stated [X] but review comments and final implementation show [Y]. Release notes reflect the actual merged behavior. 
+``` #### 6.4 Handle User Feedback on Release Notes When the user requests changes to the release notes after they have been posted, re-validate as needed. -**Critical**: User feedback does NOT exempt you from validation requirements. Any changes to code examples or newly added features must be validated. - **Constraints:** - You MUST re-run validation (Step 4) when the user requests changes that affect code examples: - Modified code snippets @@ -731,7 +640,7 @@ When the user requests changes to the release notes after they have been posted, ## Examples -### Example 1: Major Features Section with Validated Code +### Example 1: Complete Release Notes ```markdown ## Major Features @@ -741,43 +650,32 @@ When the user requests changes to the release notes after they have been posted, MCP Connections via ToolProviders allow the Agent to manage connection lifecycles automatically, eliminating the need for manual context managers. This experimental interface simplifies MCP tool integration significantly. \`\`\`python -# Code example in the project's programming language -# Demonstrate the key feature usage -# Keep it focused and concise +from strands import Agent +from strands.tools import MCPToolProvider + +provider = MCPToolProvider(server_config) +agent = Agent(tools=[provider]) +result = agent("Use the MCP tools") \`\`\` See the [MCP docs](https://docs.example.com/mcp) for details. -``` - -### Example 2: Major Feature with Unvalidated Code (Needs Engineer Review) -```markdown ### Custom HTTP Client Support - [PR#1366](https://github.com/org/repo/pull/1366) OpenAI model provider now accepts a custom HTTP client, enabling proxy configuration, custom timeouts, and request logging. \`\`\`python -# ⚠️ NEEDS ENGINEER VALIDATION - Could not automatically verify this example -# Reason: Requires OpenAI API credentials for full validation +# ⚠️ NEEDS ENGINEER VALIDATION +# Validation attempted: mocked OpenAI client, received import error +# Alternative attempts: Bedrock (not applicable - OpenAI-specific) from strands.models.openai import OpenAIModel import httpx -# Create custom HTTP client with proxy and timeout settings -custom_client = httpx.Client( - proxy="http://proxy.example.com:8080", - timeout=30.0 -) - -model = OpenAIModel( - client_args={"http_client": custom_client} -) +custom_client = httpx.Client(proxy="http://proxy.example.com:8080") +model = OpenAIModel(client_args={"http_client": custom_client}) \`\`\` -``` - -### Example 3: Major Bug Fixes Section -```markdown --- ## Major Bug Fixes @@ -790,51 +688,12 @@ model = OpenAIModel( - **Orphaned Tool Use Fix** - [PR#1123](https://github.com/strands-agents/sdk-python/pull/1123) Fixed broken conversations caused by orphaned `toolUse` blocks, improving reliability when tools fail or are interrupted. -``` - -### Example 4: Complete Release Notes Structure - -```markdown -## Major Features - -### Feature Name - [PR#123](https://github.com/owner/repo/pull/123) - -Description of the feature and its impact. - -\`\`\`[language] -# Code example demonstrating the feature -\`\`\` - ---- - -## Major Bug Fixes - -- **Critical Fix** - [PR#124](https://github.com/owner/repo/pull/124) - Description of what was fixed and why it matters. --- ``` Note: The trailing `---` separates your content from GitHub's auto-generated "What's Changed" and "New Contributors" sections that follow. 
-### Example 4: Issue Comment with Release Notes - -```markdown -Release notes for v1.15.0: - -## Major Features - -### Managed MCP Connections - [PR#895](https://github.com/strands-agents/sdk-typescript/pull/895) - -We've introduced MCP Connections via ToolProviders... - -[... rest of release notes ...] - ---- -``` - -When this content is added to the GitHub release, GitHub will automatically append the "What's Changed" and "New Contributors" sections below the separator. - ## Troubleshooting ### Missing or Invalid Git References @@ -856,18 +715,7 @@ If you encounter GitHub API rate limit errors: ### Code Validation Failures -If code validation fails for a snippet: -1. Review the test output to understand the failure reason -2. Check if the feature requires additional dependencies or setup -3. Examine the actual implementation in the PR to understand correct usage -4. Look at existing tests in the PR for patterns that verify behavior correctly -5. Try simplifying the example to focus on core functionality while maintaining behavioral assertions -6. Try using Bedrock if the feature works with multiple model providers -7. Try mocking external services -8. **If all attempts fail**: Extract a code sample from the PR and include it with the engineer validation warning -9. Document the unvalidated sample in the Exclusions Comment - -**Important**: You MUST NOT remove a feature from release notes because validation failed. Always include a code sample - either validated or marked for engineer review. +Follow the validation workflow in Section 4.2. If all attempts fail, use the engineer review fallback per Section 4.3. Per **Principle 4**, always include a code sample. ### Large PR Sets (>100 PRs) @@ -902,27 +750,9 @@ When GitHub tools or git operations are deferred (GITHUB_WRITE=false): - The operations will be executed after agent completion - Do not retry or attempt alternative approaches for deferred operations -### Unable to Extract Suitable Code Examples - -If no suitable code examples can be found in a PR: -1. Check the PR description for any code snippets -2. Look at test files added or modified in the PR -3. Examine the actual implementation and create a simplified usage example -4. Look at related documentation changes -5. Check if there are example files in the `examples/` directory - -**Important**: You MUST always include a code example for Major Features. If you cannot find one in the PR, create one based on the implementation and mark it for engineer validation. Never leave a feature without a code example. - -### Stale or Inaccurate PR Descriptions +### Stale PR Descriptions -If you discover that a PR description doesn't match the actual implementation: -1. Review the PR comment thread and review comments for context on what changed -2. Look for reviewer requests that led to structural changes -3. Check the author's responses to understand what modifications were made -4. Examine the actual merged code (especially test files) to understand the true implementation -5. Use test files as the authoritative source for code examples, not the PR description -6. If the feature's scope changed significantly during review, update your categorization accordingly -7. Note in your analysis when you relied on review comments rather than the description +Per **Principle 2**: Review PR comments for context on what changed, examine merged code (especially test files), and use test files as the authoritative source for code examples. 
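To make the pagination and rate-limit guidance above concrete, the following sketch pages through closed PRs with a basic backoff loop using the public GitHub REST API. It is illustrative only: the repository values are placeholders, filtering the results down to the actual release range still has to happen separately, and the agent's own GitHub tools may already cover this.

```python
# Minimal sketch: page through closed PRs with simple rate-limit backoff.
# Assumes `requests` is installed and GITHUB_TOKEN is set in the environment.
import os
import time

import requests


def list_merged_pulls(owner: str, repo: str) -> list[dict]:
    headers = {"Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}"}
    merged: list[dict] = []
    page = 1
    while True:
        resp = requests.get(
            f"https://api.github.com/repos/{owner}/{repo}/pulls",
            headers=headers,
            params={"state": "closed", "per_page": 100, "page": page},
            timeout=30,
        )
        if resp.status_code == 403 and resp.headers.get("X-RateLimit-Remaining") == "0":
            # Back off until the rate-limit window resets, then retry the same page
            reset_at = int(resp.headers.get("X-RateLimit-Reset", "0"))
            time.sleep(max(reset_at - time.time(), 1))
            continue
        resp.raise_for_status()
        batch = resp.json()
        if not batch:
            return merged
        # Closed PRs include unmerged ones; keep only those actually merged
        merged.extend(pr for pr in batch if pr.get("merged_at"))
        page += 1
```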
## Desired Outcome @@ -939,4 +769,4 @@ If you discover that a PR description doesn't match the actual implementation: - "New Contributors" section acknowledging first-time contributors - "Full Changelog" comparison link -You should NOT include these sectionsβ€”focus exclusively on Major Features and Major Bug Fixes that benefit from detailed descriptions and code examples. Minor changes (refactors, docs, tests, chores, etc.) will be covered by GitHub's automatic changelog. \ No newline at end of file +You should NOT include these sectionsβ€”focus exclusively on Major Features and Major Bug Fixes that benefit from detailed descriptions and code examples. Minor changes (refactors, docs, tests, chores, etc.) will be covered by GitHub's automatic changelog. From f9970f77fee9a9f389037e18d261cf5f3a539f45 Mon Sep 17 00:00:00 2001 From: Mackenzie Zastrow Date: Fri, 2 Jan 2026 17:47:46 -0500 Subject: [PATCH 12/17] Ensure that blocking handoffs are documented --- .github/scripts/python/handoff_to_user.py | 32 +++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/.github/scripts/python/handoff_to_user.py b/.github/scripts/python/handoff_to_user.py index 07ad331f1..aa41f2948 100644 --- a/.github/scripts/python/handoff_to_user.py +++ b/.github/scripts/python/handoff_to_user.py @@ -4,13 +4,28 @@ from strands.types.tools import ToolContext from strands_tools.utils import console_util +from github_tools import add_issue_comment + + @tool(context=True) -def handoff_to_user(message: str, tool_context: ToolContext) -> str: +def handoff_to_user( + message: str, + tool_context: ToolContext, + post_comment: bool = False, + issue_number: int | None = None, +) -> str: """ - Hand off control to the user with a message. + Hand off control to the user with a message. This stops the agent execution + and waits for the user to respond before continuing. Args: message: The message to give to the user + post_comment: If true, post the message as a comment on the GitHub issue/PR. + Only set this to true when user intervention or feedback is required + before the agent can continue (e.g., clarification needed, approval required, + or a decision must be made). Do not post a comment for simple status updates + or completion messages. 
+ issue_number: The issue or PR number to comment on (required if post_comment is true) Returns: The users response after handing back control @@ -25,6 +40,19 @@ def handoff_to_user(message: str, tool_context: ToolContext) -> str: ) ) + # Post comment to GitHub if requested + if post_comment: + if issue_number is None: + console.print( + Panel( + "Cannot post comment: issue_number is required when post_comment is true", + title="[bold red]Error", + border_style="red", + ) + ) + else: + add_issue_comment(issue_number, message) + request_state = { "stop_event_loop": True } From 252f896b492886e0fac1dd6c3e9d6d81334ca1c1 Mon Sep 17 00:00:00 2001 From: Mackenzie Zastrow Date: Mon, 5 Jan 2026 10:21:37 -0500 Subject: [PATCH 13/17] Reinforce --- .github/scripts/python/handoff_to_user.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/scripts/python/handoff_to_user.py b/.github/scripts/python/handoff_to_user.py index aa41f2948..e3c8d1edf 100644 --- a/.github/scripts/python/handoff_to_user.py +++ b/.github/scripts/python/handoff_to_user.py @@ -11,7 +11,7 @@ def handoff_to_user( message: str, tool_context: ToolContext, - post_comment: bool = False, + post_comment: bool, issue_number: int | None = None, ) -> str: """ @@ -22,9 +22,10 @@ def handoff_to_user( message: The message to give to the user post_comment: If true, post the message as a comment on the GitHub issue/PR. Only set this to true when user intervention or feedback is required - before the agent can continue (e.g., clarification needed, approval required, + before continuing (e.g., clarification needed, approval required, or a decision must be made). Do not post a comment for simple status updates - or completion messages. + or completion messages. If you are asking a question to the user this MUST + be true. issue_number: The issue or PR number to comment on (required if post_comment is true) Returns: From 4af52ae03609eca5d0a837952e98dfad38f74ec1 Mon Sep 17 00:00:00 2001 From: Strands Agent <217235299+strands-agent@users.noreply.github.com> Date: Wed, 21 Jan 2026 21:18:12 +0000 Subject: [PATCH 14/17] feat(hooks): add BeforeToolsEvent and AfterToolsEvent for batch-level tool execution hooks Add batch-level hooks to complement existing per-tool hooks, enabling hook providers to observe and control tool execution at the batch level. This provides parity with the TypeScript SDK. 
- Add BeforeToolsEvent: triggered before a batch of tools execute, supports interrupts for approval workflows - Add AfterToolsEvent: triggered after all tools in a batch complete, uses reverse callback ordering - Update tool executors (sequential and concurrent) to trigger batch events - Add comprehensive test coverage for batch events and interrupts Resolves #25 --- src/strands/event_loop/event_loop.py | 2 +- src/strands/hooks/__init__.py | 4 + src/strands/hooks/events.py | 63 ++++++ src/strands/tools/executors/_executor.py | 5 + src/strands/tools/executors/concurrent.py | 23 +- src/strands/tools/executors/sequential.py | 22 ++ tests/fixtures/mock_hook_provider.py | 4 + tests/strands/agent/test_agent_hooks.py | 264 +++++++++++++++++++++- 8 files changed, 384 insertions(+), 3 deletions(-) diff --git a/src/strands/event_loop/event_loop.py b/src/strands/event_loop/event_loop.py index fcb530a0d..f50c93ba1 100644 --- a/src/strands/event_loop/event_loop.py +++ b/src/strands/event_loop/event_loop.py @@ -486,7 +486,7 @@ async def _handle_tool_execution( interrupts = [] tool_events = agent.tool_executor._execute( - agent, tool_uses, tool_results, cycle_trace, cycle_span, invocation_state, structured_output_context + agent, message, tool_uses, tool_results, cycle_trace, cycle_span, invocation_state, structured_output_context ) async for tool_event in tool_events: if isinstance(tool_event, ToolInterruptEvent): diff --git a/src/strands/hooks/__init__.py b/src/strands/hooks/__init__.py index 30163f207..8f15cc02b 100644 --- a/src/strands/hooks/__init__.py +++ b/src/strands/hooks/__init__.py @@ -33,10 +33,12 @@ def log_end(self, event: AfterInvocationEvent) -> None: AfterInvocationEvent, AfterModelCallEvent, AfterToolCallEvent, + AfterToolsEvent, AgentInitializedEvent, BeforeInvocationEvent, BeforeModelCallEvent, BeforeToolCallEvent, + BeforeToolsEvent, MessageAddedEvent, ) from .registry import BaseHookEvent, HookCallback, HookEvent, HookProvider, HookRegistry @@ -45,7 +47,9 @@ def log_end(self, event: AfterInvocationEvent) -> None: "AgentInitializedEvent", "BeforeInvocationEvent", "BeforeToolCallEvent", + "BeforeToolsEvent", "AfterToolCallEvent", + "AfterToolsEvent", "BeforeModelCallEvent", "AfterModelCallEvent", "AfterInvocationEvent", diff --git a/src/strands/hooks/events.py b/src/strands/hooks/events.py index 5e11524d1..17e608440 100644 --- a/src/strands/hooks/events.py +++ b/src/strands/hooks/events.py @@ -173,6 +173,69 @@ def should_reverse_callbacks(self) -> bool: return True +@dataclass +class BeforeToolsEvent(HookEvent, _Interruptible): + """Event triggered before a batch of tools are executed. + + This event is fired after the model returns tool use blocks but before + the tools are executed. Hook providers can use this event to inspect, + log, or implement approval workflows for tool batches. + + The event is interruptible, allowing hook callbacks to pause execution + and request user approval before proceeding with tool execution. + + Attributes: + message: The message from the model containing tool use blocks. + tool_uses: List of tools that will be executed in this batch. + """ + + message: Message + tool_uses: list[ToolUse] + + @override + def _interrupt_id(self, name: str) -> str: + """Unique id for the interrupt. + + Args: + name: User defined name for the interrupt. + + Returns: + Interrupt id. 
+ """ + # Use a stable ID based on the tool use IDs in the batch + tool_ids = "|".join(str(tool_use.get("toolUseId", "")) for tool_use in self.tool_uses) + return f"v1:before_tools:{tool_ids}:{uuid.uuid5(uuid.NAMESPACE_OID, name)}" + + +@dataclass +class AfterToolsEvent(HookEvent): + """Event triggered after a batch of tools complete execution. + + This event is fired after all tools in a batch have been executed, + before the tool results are added to the conversation. Hook providers + can use this event for cleanup, logging, or batch-level post-processing. + + Note: This event uses reverse callback ordering, meaning callbacks registered + later will be invoked first during cleanup. + + Note: Tool results are available in the tool result message created after + this event. This event receives the original assistant message with tool uses, + not the result message. + + Attributes: + message: The original message from the model containing tool use blocks. + tool_uses: List of tools that were executed in this batch. + """ + + message: Message + tool_uses: list[ToolUse] + + @property + def should_reverse_callbacks(self) -> bool: + """True to invoke callbacks in reverse order.""" + return True + + @dataclass class BeforeModelCallEvent(HookEvent): """Event triggered before the model is invoked. diff --git a/src/strands/tools/executors/_executor.py b/src/strands/tools/executors/_executor.py index 5d01c5d48..d51c4947b 100644 --- a/src/strands/tools/executors/_executor.py +++ b/src/strands/tools/executors/_executor.py @@ -315,6 +315,7 @@ async def _stream_with_trace( def _execute( self, agent: "Agent", + message: Message, tool_uses: list[ToolUse], tool_results: list[ToolResult], cycle_trace: Trace, @@ -324,8 +325,12 @@ def _execute( ) -> AsyncGenerator[TypedEvent, None]: """Execute the given tools according to this executor's strategy. + This method is responsible for executing the tools in a batch and triggering + the BeforeToolsEvent and AfterToolsEvent hooks at the appropriate times. + Args: agent: The agent for which tools are being executed. + message: The message from the model containing tool use blocks. tool_uses: Metadata and inputs for the tools to be executed. tool_results: List of tool results from each tool execution. cycle_trace: Trace object for the current event loop cycle. diff --git a/src/strands/tools/executors/concurrent.py b/src/strands/tools/executors/concurrent.py index 216eee379..91232c72d 100644 --- a/src/strands/tools/executors/concurrent.py +++ b/src/strands/tools/executors/concurrent.py @@ -5,8 +5,10 @@ from typing_extensions import override +from ...hooks import AfterToolsEvent, BeforeToolsEvent from ...telemetry.metrics import Trace -from ...types._events import TypedEvent +from ...types._events import ToolInterruptEvent, TypedEvent +from ...types.content import Message from ...types.tools import ToolResult, ToolUse from ._executor import ToolExecutor @@ -22,6 +24,7 @@ class ConcurrentToolExecutor(ToolExecutor): async def _execute( self, agent: "Agent", + message: Message, tool_uses: list[ToolUse], tool_results: list[ToolResult], cycle_trace: Trace, @@ -33,6 +36,7 @@ async def _execute( Args: agent: The agent for which tools are being executed. + message: The message from the model containing tool use blocks. tool_uses: Metadata and inputs for the tools to be executed. tool_results: List of tool results from each tool execution. cycle_trace: Trace object for the current event loop cycle. 
@@ -43,6 +47,19 @@ async def _execute( Yields: Events from the tool execution stream. """ + # Skip batch events if no tools + if not tool_uses: + return + + # Trigger BeforeToolsEvent + before_event = BeforeToolsEvent(agent=agent, message=message, tool_uses=tool_uses) + _, interrupts = await agent.hooks.invoke_callbacks_async(before_event) + + if interrupts: + # Use the first tool_use for the interrupt event (tools not executed yet) + yield ToolInterruptEvent(tool_uses[0], interrupts) + return + task_queue: asyncio.Queue[tuple[int, Any]] = asyncio.Queue() task_events = [asyncio.Event() for _ in tool_uses] stop_event = object() @@ -76,6 +93,10 @@ async def _execute( yield event task_events[task_id].set() + # Trigger AfterToolsEvent + after_event = AfterToolsEvent(agent=agent, message=message, tool_uses=tool_uses) + await agent.hooks.invoke_callbacks_async(after_event) + async def _task( self, agent: "Agent", diff --git a/src/strands/tools/executors/sequential.py b/src/strands/tools/executors/sequential.py index f78e60872..36ca3580b 100644 --- a/src/strands/tools/executors/sequential.py +++ b/src/strands/tools/executors/sequential.py @@ -4,8 +4,10 @@ from typing_extensions import override +from ...hooks import AfterToolsEvent, BeforeToolsEvent from ...telemetry.metrics import Trace from ...types._events import ToolInterruptEvent, TypedEvent +from ...types.content import Message from ...types.tools import ToolResult, ToolUse from ._executor import ToolExecutor @@ -21,6 +23,7 @@ class SequentialToolExecutor(ToolExecutor): async def _execute( self, agent: "Agent", + message: Message, tool_uses: list[ToolUse], tool_results: list[ToolResult], cycle_trace: Trace, @@ -34,6 +37,7 @@ async def _execute( Args: agent: The agent for which tools are being executed. + message: The message from the model containing tool use blocks. tool_uses: Metadata and inputs for the tools to be executed. tool_results: List of tool results from each tool execution. cycle_trace: Trace object for the current event loop cycle. @@ -44,6 +48,19 @@ async def _execute( Yields: Events from the tool execution stream. 
""" + # Skip batch events if no tools + if not tool_uses: + return + + # Trigger BeforeToolsEvent + before_event = BeforeToolsEvent(agent=agent, message=message, tool_uses=tool_uses) + _, interrupts = await agent.hooks.invoke_callbacks_async(before_event) + + if interrupts: + # Use the first tool_use for the interrupt event (tools not executed yet) + yield ToolInterruptEvent(tool_uses[0], interrupts) + return + interrupted = False for tool_use in tool_uses: @@ -58,3 +75,8 @@ async def _execute( if interrupted: break + + # Only trigger AfterToolsEvent if no interrupts occurred + if not interrupted: + after_event = AfterToolsEvent(agent=agent, message=message, tool_uses=tool_uses) + await agent.hooks.invoke_callbacks_async(after_event) diff --git a/tests/fixtures/mock_hook_provider.py b/tests/fixtures/mock_hook_provider.py index 091f44d06..7e5aead27 100644 --- a/tests/fixtures/mock_hook_provider.py +++ b/tests/fixtures/mock_hook_provider.py @@ -5,10 +5,12 @@ AfterInvocationEvent, AfterModelCallEvent, AfterToolCallEvent, + AfterToolsEvent, AgentInitializedEvent, BeforeInvocationEvent, BeforeModelCallEvent, BeforeToolCallEvent, + BeforeToolsEvent, HookEvent, HookProvider, HookRegistry, @@ -25,6 +27,8 @@ def __init__(self, event_types: list[Type] | Literal["all"]): AfterInvocationEvent, BeforeToolCallEvent, AfterToolCallEvent, + BeforeToolsEvent, + AfterToolsEvent, BeforeModelCallEvent, AfterModelCallEvent, MessageAddedEvent, diff --git a/tests/strands/agent/test_agent_hooks.py b/tests/strands/agent/test_agent_hooks.py index 00b9d368a..17b0eb9c1 100644 --- a/tests/strands/agent/test_agent_hooks.py +++ b/tests/strands/agent/test_agent_hooks.py @@ -9,10 +9,12 @@ AfterInvocationEvent, AfterModelCallEvent, AfterToolCallEvent, + AfterToolsEvent, AgentInitializedEvent, BeforeInvocationEvent, BeforeModelCallEvent, BeforeToolCallEvent, + BeforeToolsEvent, MessageAddedEvent, ) from strands.types.content import Messages @@ -476,7 +478,267 @@ async def handle_after_model_call(self, event): # Should be called 3 times: initial + 2 retries assert retry_hook.call_count == 3 - assert retry_hook.retry_count == 2 + + +# Tests for BeforeToolsEvent and AfterToolsEvent + + +def test_before_tools_event_triggered(agent, hook_provider, agent_tool, tool_use): + """Verify that BeforeToolsEvent is triggered before tool batch execution.""" + # Add batch event tracking + batch_hook_provider = MockHookProvider([BeforeToolsEvent, AfterToolsEvent]) + agent.hooks.add_hook(batch_hook_provider) + + result = agent("test message") + + # Check that BeforeToolsEvent was triggered + batch_length, batch_events = batch_hook_provider.get_events() + assert batch_length == 2 # BeforeToolsEvent and AfterToolsEvent + + before_event = next(batch_events) + assert isinstance(before_event, BeforeToolsEvent) + assert before_event.agent == agent + assert len(before_event.tool_uses) == 1 + assert before_event.tool_uses[0]["name"] == "tool_decorated" + assert "toolUse" in before_event.message["content"][0] + + +def test_after_tools_event_triggered(agent, hook_provider, agent_tool, tool_use): + """Verify that AfterToolsEvent is triggered after all tools complete.""" + # Add batch event tracking + batch_hook_provider = MockHookProvider([BeforeToolsEvent, AfterToolsEvent]) + agent.hooks.add_hook(batch_hook_provider) + + result = agent("test message") + + # Check that AfterToolsEvent was triggered + batch_length, batch_events = batch_hook_provider.get_events() + assert batch_length == 2 + + before_event = next(batch_events) + after_event = 
next(batch_events) + + assert isinstance(after_event, AfterToolsEvent) + assert after_event.agent == agent + assert len(after_event.tool_uses) == 1 + assert after_event.tool_uses[0]["name"] == "tool_decorated" + assert "toolUse" in after_event.message["content"][0] + + +def test_after_tools_event_reverse_ordering(): + """Verify that AfterToolsEvent uses reverse callback ordering.""" + execution_order = [] + + class OrderTrackingHook1: + def register_hooks(self, registry): + registry.add_callback(AfterToolsEvent, lambda event: execution_order.append("hook1")) + + class OrderTrackingHook2: + def register_hooks(self, registry): + registry.add_callback(AfterToolsEvent, lambda event: execution_order.append("hook2")) + + @strands.tools.tool + def sample_tool(x: int) -> int: + return x * 2 + + tool_use = {"name": "sample_tool", "toolUseId": "123", "input": {"x": 5}} + agent_messages: Messages = [ + {"role": "assistant", "content": [{"toolUse": tool_use}]}, + {"role": "assistant", "content": [{"text": "Done"}]}, + ] + model = MockedModelProvider(agent_messages) + + agent = Agent( + model=model, + tools=[sample_tool], + hooks=[OrderTrackingHook1(), OrderTrackingHook2()], + ) + + agent("test") + + # AfterToolsEvent should execute in reverse order: hook2 before hook1 + assert execution_order == ["hook2", "hook1"] + + +def test_before_tools_event_with_multiple_tools(): + """Verify that BeforeToolsEvent contains all tools in batch.""" + batch_hook_provider = MockHookProvider([BeforeToolsEvent, AfterToolsEvent]) + + @strands.tools.tool + def tool1(x: int) -> int: + return x + 1 + + @strands.tools.tool + def tool2(y: int) -> int: + return y * 2 + + tool_use_1 = {"name": "tool1", "toolUseId": "123", "input": {"x": 5}} + tool_use_2 = {"name": "tool2", "toolUseId": "456", "input": {"y": 10}} + + agent_messages: Messages = [ + {"role": "assistant", "content": [{"toolUse": tool_use_1}, {"toolUse": tool_use_2}]}, + {"role": "assistant", "content": [{"text": "Done"}]}, + ] + model = MockedModelProvider(agent_messages) + + agent = Agent( + model=model, + tools=[tool1, tool2], + hooks=[batch_hook_provider], + ) + + agent("test") + + batch_length, batch_events = batch_hook_provider.get_events() + before_event = next(batch_events) + + assert isinstance(before_event, BeforeToolsEvent) + assert len(before_event.tool_uses) == 2 + assert before_event.tool_uses[0]["name"] == "tool1" + assert before_event.tool_uses[1]["name"] == "tool2" + + +def test_batch_events_not_triggered_without_tools(): + """Verify that batch events are not triggered when no tools are present.""" + batch_hook_provider = MockHookProvider([BeforeToolsEvent, AfterToolsEvent]) + + # Response with no tool uses + agent_messages: Messages = [ + {"role": "assistant", "content": [{"text": "No tools used"}]}, + ] + model = MockedModelProvider(agent_messages) + + agent = Agent( + model=model, + hooks=[batch_hook_provider], + ) + + agent("test") + + # No batch events should be triggered + batch_length, _ = batch_hook_provider.get_events() + assert batch_length == 0 + + +def test_before_tools_event_interrupt(): + """Verify that BeforeToolsEvent interrupt stops batch execution.""" + batch_hook_provider = MockHookProvider([BeforeToolsEvent, AfterToolsEvent]) + tool_hook_provider = MockHookProvider([BeforeToolCallEvent, AfterToolCallEvent]) + + class InterruptHook: + def register_hooks(self, registry): + registry.add_callback(BeforeToolsEvent, self.interrupt_batch) + + def interrupt_batch(self, event: BeforeToolsEvent): + # Interrupt without providing response + 
event.interrupt("batch-approval", reason="Need approval") + + @strands.tools.tool + def sample_tool(x: int) -> int: + return x * 2 + + tool_use = {"name": "sample_tool", "toolUseId": "123", "input": {"x": 5}} + agent_messages: Messages = [ + {"role": "assistant", "content": [{"toolUse": tool_use}]}, + ] + model = MockedModelProvider(agent_messages) + + agent = Agent( + model=model, + tools=[sample_tool], + hooks=[InterruptHook(), batch_hook_provider, tool_hook_provider], + ) + + result = agent("test") + + # Agent should stop with interrupt + assert result.stop_reason == "interrupt" + assert len(result.interrupts) == 1 + assert result.interrupts[0].name == "batch-approval" + + # BeforeToolsEvent should be triggered but AfterToolsEvent should not + batch_length, batch_events = batch_hook_provider.get_events() + assert batch_length == 1 # Only BeforeToolsEvent + assert isinstance(next(batch_events), BeforeToolsEvent) + + # No individual tool events should be triggered + tool_length, _ = tool_hook_provider.get_events() + assert tool_length == 0 + + +@pytest.mark.asyncio +async def test_before_tools_event_interrupt_async(): + """Verify that BeforeToolsEvent interrupt works in async context.""" + batch_hook_provider = MockHookProvider([BeforeToolsEvent, AfterToolsEvent]) + + class AsyncInterruptHook: + def register_hooks(self, registry): + registry.add_callback(BeforeToolsEvent, self.interrupt_batch) + + async def interrupt_batch(self, event: BeforeToolsEvent): + event.interrupt("async-batch-approval", reason="Async approval needed") + + @strands.tools.tool + def sample_tool(x: int) -> int: + return x * 2 + + tool_use = {"name": "sample_tool", "toolUseId": "123", "input": {"x": 5}} + agent_messages: Messages = [ + {"role": "assistant", "content": [{"toolUse": tool_use}]}, + ] + model = MockedModelProvider(agent_messages) + + agent = Agent( + model=model, + tools=[sample_tool], + hooks=[AsyncInterruptHook(), batch_hook_provider], + ) + + # Call agent synchronously but the hook is async + result = agent("test") + + assert result.stop_reason == "interrupt" + assert len(result.interrupts) == 1 + assert result.interrupts[0].name == "async-batch-approval" + + +def test_batch_events_with_tool_events(): + """Verify that batch events and per-tool events are triggered in correct order.""" + all_hook_provider = MockHookProvider([ + BeforeToolsEvent, + AfterToolsEvent, + BeforeToolCallEvent, + AfterToolCallEvent, + ]) + + @strands.tools.tool + def sample_tool(x: int) -> int: + return x * 2 + + tool_use = {"name": "sample_tool", "toolUseId": "123", "input": {"x": 5}} + agent_messages: Messages = [ + {"role": "assistant", "content": [{"toolUse": tool_use}]}, + {"role": "assistant", "content": [{"text": "Done"}]}, + ] + model = MockedModelProvider(agent_messages) + + agent = Agent( + model=model, + tools=[sample_tool], + hooks=[all_hook_provider], + ) + + agent("test") + + event_length, events = all_hook_provider.get_events() + assert event_length == 4 + + # Expected order: BeforeToolsEvent, BeforeToolCallEvent, AfterToolCallEvent, AfterToolsEvent + event_list = list(events) + assert isinstance(event_list[0], BeforeToolsEvent) + assert isinstance(event_list[1], BeforeToolCallEvent) + assert isinstance(event_list[2], AfterToolCallEvent) + assert isinstance(event_list[3], AfterToolsEvent) @pytest.mark.asyncio From cb0bfa9e455051102c3e18e1abd4146c22bd8b04 Mon Sep 17 00:00:00 2001 From: Strands Agent <217235299+strands-agent@users.noreply.github.com> Date: Wed, 21 Jan 2026 21:19:16 +0000 Subject: [PATCH 15/17] 
Additional changes from write operations --- install_output.log | 256 +++++++++++++++++++++++++++++++++++++++++++++ test_output.log | 1 + 2 files changed, 257 insertions(+) create mode 100644 install_output.log create mode 100644 test_output.log diff --git a/install_output.log b/install_output.log new file mode 100644 index 000000000..4b0217208 --- /dev/null +++ b/install_output.log @@ -0,0 +1,256 @@ +Obtaining file:///home/runner/work/sdk-python/sdk-python + Installing build dependencies: started + Installing build dependencies: finished with status 'done' + Checking if build backend supports build_editable: started + Checking if build backend supports build_editable: finished with status 'done' + Getting requirements to build editable: started + Getting requirements to build editable: finished with status 'done' + Installing backend dependencies: started + Installing backend dependencies: finished with status 'done' + Preparing editable metadata (pyproject.toml): started + Preparing editable metadata (pyproject.toml): finished with status 'done' +Requirement already satisfied: boto3<2.0.0,>=1.26.0 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from strands-agents==0.1.dev1+g252f896b4) (1.42.32) +Requirement already satisfied: botocore<2.0.0,>=1.29.0 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from strands-agents==0.1.dev1+g252f896b4) (1.42.32) +Requirement already satisfied: docstring-parser<1.0,>=0.15 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from strands-agents==0.1.dev1+g252f896b4) (0.17.0) +Requirement already satisfied: jsonschema<5.0.0,>=4.0.0 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from strands-agents==0.1.dev1+g252f896b4) (4.26.0) +Requirement already satisfied: mcp<2.0.0,>=1.11.0 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from strands-agents==0.1.dev1+g252f896b4) (1.25.0) +Requirement already satisfied: opentelemetry-api<2.0.0,>=1.30.0 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from strands-agents==0.1.dev1+g252f896b4) (1.39.1) +Requirement already satisfied: opentelemetry-instrumentation-threading<1.00b0,>=0.51b0 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from strands-agents==0.1.dev1+g252f896b4) (0.60b1) +Requirement already satisfied: opentelemetry-sdk<2.0.0,>=1.30.0 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from strands-agents==0.1.dev1+g252f896b4) (1.39.1) +Requirement already satisfied: pydantic<3.0.0,>=2.4.0 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from strands-agents==0.1.dev1+g252f896b4) (2.12.5) +Requirement already satisfied: typing-extensions<5.0.0,>=4.13.2 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from strands-agents==0.1.dev1+g252f896b4) (4.15.0) +Requirement already satisfied: watchdog<7.0.0,>=6.0.0 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from strands-agents==0.1.dev1+g252f896b4) (6.0.0) +Collecting commitizen<5.0.0,>=4.4.0 (from strands-agents==0.1.dev1+g252f896b4) + Downloading commitizen-4.12.0-py3-none-any.whl.metadata (13 kB) +Collecting hatch<2.0.0,>=1.0.0 (from strands-agents==0.1.dev1+g252f896b4) + Downloading hatch-1.16.3-py3-none-any.whl.metadata (5.6 kB) +Collecting moto<6.0.0,>=5.1.0 (from strands-agents==0.1.dev1+g252f896b4) + Downloading moto-5.1.20-py3-none-any.whl.metadata (12 kB) +Collecting mypy<2.0.0,>=1.15.0 (from 
strands-agents==0.1.dev1+g252f896b4) + Downloading mypy-1.19.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (2.2 kB) +Collecting pre-commit<4.6.0,>=3.2.0 (from strands-agents==0.1.dev1+g252f896b4) + Downloading pre_commit-4.5.1-py2.py3-none-any.whl.metadata (1.2 kB) +Collecting pytest-asyncio<1.4.0,>=1.0.0 (from strands-agents==0.1.dev1+g252f896b4) + Downloading pytest_asyncio-1.3.0-py3-none-any.whl.metadata (4.1 kB) +Collecting pytest-cov<8.0.0,>=7.0.0 (from strands-agents==0.1.dev1+g252f896b4) + Downloading pytest_cov-7.0.0-py3-none-any.whl.metadata (31 kB) +Collecting pytest-xdist<4.0.0,>=3.0.0 (from strands-agents==0.1.dev1+g252f896b4) + Downloading pytest_xdist-3.8.0-py3-none-any.whl.metadata (3.0 kB) +Collecting pytest<9.0.0,>=8.0.0 (from strands-agents==0.1.dev1+g252f896b4) + Downloading pytest-8.4.2-py3-none-any.whl.metadata (7.7 kB) +Collecting ruff<0.15.0,>=0.13.0 (from strands-agents==0.1.dev1+g252f896b4) + Downloading ruff-0.14.13-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (26 kB) +Requirement already satisfied: jmespath<2.0.0,>=0.7.1 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from boto3<2.0.0,>=1.26.0->strands-agents==0.1.dev1+g252f896b4) (1.0.1) +Requirement already satisfied: s3transfer<0.17.0,>=0.16.0 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from boto3<2.0.0,>=1.26.0->strands-agents==0.1.dev1+g252f896b4) (0.16.0) +Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from botocore<2.0.0,>=1.29.0->strands-agents==0.1.dev1+g252f896b4) (2.9.0.post0) +Requirement already satisfied: urllib3!=2.2.0,<3,>=1.25.4 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from botocore<2.0.0,>=1.29.0->strands-agents==0.1.dev1+g252f896b4) (2.6.3) +Collecting questionary<3.0,>=2.0 (from commitizen<5.0.0,>=4.4.0->strands-agents==0.1.dev1+g252f896b4) + Downloading questionary-2.1.1-py3-none-any.whl.metadata (5.4 kB) +Collecting prompt-toolkit!=3.0.52 (from commitizen<5.0.0,>=4.4.0->strands-agents==0.1.dev1+g252f896b4) + Downloading prompt_toolkit-3.0.51-py3-none-any.whl.metadata (6.4 kB) +Collecting decli<1.0,>=0.6.0 (from commitizen<5.0.0,>=4.4.0->strands-agents==0.1.dev1+g252f896b4) + Downloading decli-0.6.3-py3-none-any.whl.metadata (17 kB) +Requirement already satisfied: colorama<1.0,>=0.4.1 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from commitizen<5.0.0,>=4.4.0->strands-agents==0.1.dev1+g252f896b4) (0.4.6) +Collecting termcolor<4.0.0,>=1.1.0 (from commitizen<5.0.0,>=4.4.0->strands-agents==0.1.dev1+g252f896b4) + Downloading termcolor-3.3.0-py3-none-any.whl.metadata (6.5 kB) +Requirement already satisfied: packaging>=19 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from commitizen<5.0.0,>=4.4.0->strands-agents==0.1.dev1+g252f896b4) (26.0) +Collecting tomlkit<1.0.0,>=0.8.0 (from commitizen<5.0.0,>=4.4.0->strands-agents==0.1.dev1+g252f896b4) + Downloading tomlkit-0.14.0-py3-none-any.whl.metadata (2.8 kB) +Collecting jinja2>=2.10.3 (from commitizen<5.0.0,>=4.4.0->strands-agents==0.1.dev1+g252f896b4) + Downloading jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB) +Collecting pyyaml>=3.8 (from commitizen<5.0.0,>=4.4.0->strands-agents==0.1.dev1+g252f896b4) + Downloading pyyaml-6.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (2.4 kB) +Collecting argcomplete<3.7,>=1.12.1 
(from commitizen<5.0.0,>=4.4.0->strands-agents==0.1.dev1+g252f896b4) + Downloading argcomplete-3.6.3-py3-none-any.whl.metadata (16 kB) +Requirement already satisfied: charset-normalizer<4,>=2.1.0 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from commitizen<5.0.0,>=4.4.0->strands-agents==0.1.dev1+g252f896b4) (3.4.4) +Collecting deprecated<2,>=1.2.13 (from commitizen<5.0.0,>=4.4.0->strands-agents==0.1.dev1+g252f896b4) + Downloading deprecated-1.3.1-py2.py3-none-any.whl.metadata (5.9 kB) +Requirement already satisfied: wrapt<3,>=1.10 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from deprecated<2,>=1.2.13->commitizen<5.0.0,>=4.4.0->strands-agents==0.1.dev1+g252f896b4) (1.17.3) +Collecting backports-zstd>=1.0.0 (from hatch<2.0.0,>=1.0.0->strands-agents==0.1.dev1+g252f896b4) + Downloading backports_zstd-1.3.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (6.9 kB) +Requirement already satisfied: click>=8.0.6 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from hatch<2.0.0,>=1.0.0->strands-agents==0.1.dev1+g252f896b4) (8.3.1) +Collecting hatchling>=1.27.0 (from hatch<2.0.0,>=1.0.0->strands-agents==0.1.dev1+g252f896b4) + Using cached hatchling-1.28.0-py3-none-any.whl.metadata (3.8 kB) +Requirement already satisfied: httpx>=0.22.0 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from hatch<2.0.0,>=1.0.0->strands-agents==0.1.dev1+g252f896b4) (0.28.1) +Collecting hyperlink>=21.0.0 (from hatch<2.0.0,>=1.0.0->strands-agents==0.1.dev1+g252f896b4) + Downloading hyperlink-21.0.0-py2.py3-none-any.whl.metadata (1.5 kB) +Collecting keyring>=23.5.0 (from hatch<2.0.0,>=1.0.0->strands-agents==0.1.dev1+g252f896b4) + Downloading keyring-25.7.0-py3-none-any.whl.metadata (21 kB) +Collecting pexpect~=4.8 (from hatch<2.0.0,>=1.0.0->strands-agents==0.1.dev1+g252f896b4) + Downloading pexpect-4.9.0-py2.py3-none-any.whl.metadata (2.5 kB) +Collecting platformdirs>=2.5.0 (from hatch<2.0.0,>=1.0.0->strands-agents==0.1.dev1+g252f896b4) + Downloading platformdirs-4.5.1-py3-none-any.whl.metadata (12 kB) +Collecting pyproject-hooks (from hatch<2.0.0,>=1.0.0->strands-agents==0.1.dev1+g252f896b4) + Downloading pyproject_hooks-1.2.0-py3-none-any.whl.metadata (1.3 kB) +Requirement already satisfied: rich>=11.2.0 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from hatch<2.0.0,>=1.0.0->strands-agents==0.1.dev1+g252f896b4) (14.2.0) +Collecting shellingham>=1.4.0 (from hatch<2.0.0,>=1.0.0->strands-agents==0.1.dev1+g252f896b4) + Downloading shellingham-1.5.4-py2.py3-none-any.whl.metadata (3.5 kB) +Collecting tomli-w>=1.0 (from hatch<2.0.0,>=1.0.0->strands-agents==0.1.dev1+g252f896b4) + Downloading tomli_w-1.2.0-py3-none-any.whl.metadata (5.7 kB) +Collecting userpath~=1.7 (from hatch<2.0.0,>=1.0.0->strands-agents==0.1.dev1+g252f896b4) + Downloading userpath-1.9.2-py3-none-any.whl.metadata (3.0 kB) +Collecting uv>=0.5.23 (from hatch<2.0.0,>=1.0.0->strands-agents==0.1.dev1+g252f896b4) + Downloading uv-0.9.26-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB) +Collecting virtualenv>=20.26.6 (from hatch<2.0.0,>=1.0.0->strands-agents==0.1.dev1+g252f896b4) + Downloading virtualenv-20.36.1-py3-none-any.whl.metadata (4.7 kB) +Requirement already satisfied: attrs>=22.2.0 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from jsonschema<5.0.0,>=4.0.0->strands-agents==0.1.dev1+g252f896b4) (25.4.0) +Requirement already satisfied: 
jsonschema-specifications>=2023.03.6 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from jsonschema<5.0.0,>=4.0.0->strands-agents==0.1.dev1+g252f896b4) (2025.9.1) +Requirement already satisfied: referencing>=0.28.4 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from jsonschema<5.0.0,>=4.0.0->strands-agents==0.1.dev1+g252f896b4) (0.37.0) +Requirement already satisfied: rpds-py>=0.25.0 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from jsonschema<5.0.0,>=4.0.0->strands-agents==0.1.dev1+g252f896b4) (0.30.0) +Requirement already satisfied: anyio>=4.5 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from mcp<2.0.0,>=1.11.0->strands-agents==0.1.dev1+g252f896b4) (4.12.1) +Requirement already satisfied: httpx-sse>=0.4 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from mcp<2.0.0,>=1.11.0->strands-agents==0.1.dev1+g252f896b4) (0.4.3) +Requirement already satisfied: pydantic-settings>=2.5.2 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from mcp<2.0.0,>=1.11.0->strands-agents==0.1.dev1+g252f896b4) (2.12.0) +Requirement already satisfied: pyjwt>=2.10.1 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from pyjwt[crypto]>=2.10.1->mcp<2.0.0,>=1.11.0->strands-agents==0.1.dev1+g252f896b4) (2.10.1) +Requirement already satisfied: python-multipart>=0.0.9 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from mcp<2.0.0,>=1.11.0->strands-agents==0.1.dev1+g252f896b4) (0.0.21) +Requirement already satisfied: sse-starlette>=1.6.1 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from mcp<2.0.0,>=1.11.0->strands-agents==0.1.dev1+g252f896b4) (3.2.0) +Requirement already satisfied: starlette>=0.27 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from mcp<2.0.0,>=1.11.0->strands-agents==0.1.dev1+g252f896b4) (0.52.1) +Requirement already satisfied: typing-inspection>=0.4.1 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from mcp<2.0.0,>=1.11.0->strands-agents==0.1.dev1+g252f896b4) (0.4.2) +Requirement already satisfied: uvicorn>=0.31.1 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from mcp<2.0.0,>=1.11.0->strands-agents==0.1.dev1+g252f896b4) (0.40.0) +Requirement already satisfied: cryptography>=35.0.0 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from moto<6.0.0,>=5.1.0->strands-agents==0.1.dev1+g252f896b4) (46.0.3) +Requirement already satisfied: requests>=2.5 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from moto<6.0.0,>=5.1.0->strands-agents==0.1.dev1+g252f896b4) (2.32.5) +Collecting xmltodict (from moto<6.0.0,>=5.1.0->strands-agents==0.1.dev1+g252f896b4) + Downloading xmltodict-1.0.2-py3-none-any.whl.metadata (15 kB) +Collecting werkzeug!=2.2.0,!=2.2.1,>=0.5 (from moto<6.0.0,>=5.1.0->strands-agents==0.1.dev1+g252f896b4) + Downloading werkzeug-3.1.5-py3-none-any.whl.metadata (4.0 kB) +Collecting responses!=0.25.5,>=0.15.0 (from moto<6.0.0,>=5.1.0->strands-agents==0.1.dev1+g252f896b4) + Downloading responses-0.25.8-py3-none-any.whl.metadata (47 kB) +Collecting mypy_extensions>=1.0.0 (from mypy<2.0.0,>=1.15.0->strands-agents==0.1.dev1+g252f896b4) + Downloading mypy_extensions-1.1.0-py3-none-any.whl.metadata (1.1 kB) +Collecting pathspec>=0.9.0 (from mypy<2.0.0,>=1.15.0->strands-agents==0.1.dev1+g252f896b4) + Using cached pathspec-1.0.3-py3-none-any.whl.metadata (13 
kB) +Collecting librt>=0.6.2 (from mypy<2.0.0,>=1.15.0->strands-agents==0.1.dev1+g252f896b4) + Downloading librt-0.7.8-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (1.3 kB) +Requirement already satisfied: importlib-metadata<8.8.0,>=6.0 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from opentelemetry-api<2.0.0,>=1.30.0->strands-agents==0.1.dev1+g252f896b4) (8.7.1) +Requirement already satisfied: zipp>=3.20 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from importlib-metadata<8.8.0,>=6.0->opentelemetry-api<2.0.0,>=1.30.0->strands-agents==0.1.dev1+g252f896b4) (3.23.0) +Requirement already satisfied: opentelemetry-instrumentation==0.60b1 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from opentelemetry-instrumentation-threading<1.00b0,>=0.51b0->strands-agents==0.1.dev1+g252f896b4) (0.60b1) +Requirement already satisfied: opentelemetry-semantic-conventions==0.60b1 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from opentelemetry-instrumentation==0.60b1->opentelemetry-instrumentation-threading<1.00b0,>=0.51b0->strands-agents==0.1.dev1+g252f896b4) (0.60b1) +Collecting ptyprocess>=0.5 (from pexpect~=4.8->hatch<2.0.0,>=1.0.0->strands-agents==0.1.dev1+g252f896b4) + Downloading ptyprocess-0.7.0-py2.py3-none-any.whl.metadata (1.3 kB) +Collecting cfgv>=2.0.0 (from pre-commit<4.6.0,>=3.2.0->strands-agents==0.1.dev1+g252f896b4) + Downloading cfgv-3.5.0-py2.py3-none-any.whl.metadata (8.9 kB) +Collecting identify>=1.0.0 (from pre-commit<4.6.0,>=3.2.0->strands-agents==0.1.dev1+g252f896b4) + Downloading identify-2.6.16-py2.py3-none-any.whl.metadata (4.4 kB) +Collecting nodeenv>=0.11.1 (from pre-commit<4.6.0,>=3.2.0->strands-agents==0.1.dev1+g252f896b4) + Downloading nodeenv-1.10.0-py2.py3-none-any.whl.metadata (24 kB) +Requirement already satisfied: annotated-types>=0.6.0 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from pydantic<3.0.0,>=2.4.0->strands-agents==0.1.dev1+g252f896b4) (0.7.0) +Requirement already satisfied: pydantic-core==2.41.5 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from pydantic<3.0.0,>=2.4.0->strands-agents==0.1.dev1+g252f896b4) (2.41.5) +Collecting iniconfig>=1 (from pytest<9.0.0,>=8.0.0->strands-agents==0.1.dev1+g252f896b4) + Downloading iniconfig-2.3.0-py3-none-any.whl.metadata (2.5 kB) +Collecting pluggy<2,>=1.5 (from pytest<9.0.0,>=8.0.0->strands-agents==0.1.dev1+g252f896b4) + Using cached pluggy-1.6.0-py3-none-any.whl.metadata (4.8 kB) +Requirement already satisfied: pygments>=2.7.2 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from pytest<9.0.0,>=8.0.0->strands-agents==0.1.dev1+g252f896b4) (2.19.2) +Collecting coverage>=7.10.6 (from coverage[toml]>=7.10.6->pytest-cov<8.0.0,>=7.0.0->strands-agents==0.1.dev1+g252f896b4) + Downloading coverage-7.13.1-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl.metadata (8.5 kB) +Collecting execnet>=2.1 (from pytest-xdist<4.0.0,>=3.0.0->strands-agents==0.1.dev1+g252f896b4) + Downloading execnet-2.1.2-py3-none-any.whl.metadata (2.9 kB) +Requirement already satisfied: six>=1.5 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<2.0.0,>=1.29.0->strands-agents==0.1.dev1+g252f896b4) (1.17.0) +Requirement already satisfied: wcwidth in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from 
prompt-toolkit!=3.0.52->commitizen<5.0.0,>=4.4.0->strands-agents==0.1.dev1+g252f896b4) (0.3.0) +Requirement already satisfied: idna>=2.8 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from anyio>=4.5->mcp<2.0.0,>=1.11.0->strands-agents==0.1.dev1+g252f896b4) (3.11) +Requirement already satisfied: cffi>=2.0.0 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from cryptography>=35.0.0->moto<6.0.0,>=5.1.0->strands-agents==0.1.dev1+g252f896b4) (2.0.0) +Requirement already satisfied: pycparser in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from cffi>=2.0.0->cryptography>=35.0.0->moto<6.0.0,>=5.1.0->strands-agents==0.1.dev1+g252f896b4) (3.0) +Collecting trove-classifiers (from hatchling>=1.27.0->hatch<2.0.0,>=1.0.0->strands-agents==0.1.dev1+g252f896b4) + Using cached trove_classifiers-2026.1.14.14-py3-none-any.whl.metadata (2.4 kB) +Requirement already satisfied: certifi in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from httpx>=0.22.0->hatch<2.0.0,>=1.0.0->strands-agents==0.1.dev1+g252f896b4) (2026.1.4) +Requirement already satisfied: httpcore==1.* in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from httpx>=0.22.0->hatch<2.0.0,>=1.0.0->strands-agents==0.1.dev1+g252f896b4) (1.0.9) +Requirement already satisfied: h11>=0.16 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from httpcore==1.*->httpx>=0.22.0->hatch<2.0.0,>=1.0.0->strands-agents==0.1.dev1+g252f896b4) (0.16.0) +Collecting MarkupSafe>=2.0 (from jinja2>=2.10.3->commitizen<5.0.0,>=4.4.0->strands-agents==0.1.dev1+g252f896b4) + Downloading markupsafe-3.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (2.7 kB) +Collecting SecretStorage>=3.2 (from keyring>=23.5.0->hatch<2.0.0,>=1.0.0->strands-agents==0.1.dev1+g252f896b4) + Downloading secretstorage-3.5.0-py3-none-any.whl.metadata (4.0 kB) +Collecting jeepney>=0.4.2 (from keyring>=23.5.0->hatch<2.0.0,>=1.0.0->strands-agents==0.1.dev1+g252f896b4) + Downloading jeepney-0.9.0-py3-none-any.whl.metadata (1.2 kB) +Collecting jaraco.classes (from keyring>=23.5.0->hatch<2.0.0,>=1.0.0->strands-agents==0.1.dev1+g252f896b4) + Downloading jaraco.classes-3.4.0-py3-none-any.whl.metadata (2.6 kB) +Collecting jaraco.functools (from keyring>=23.5.0->hatch<2.0.0,>=1.0.0->strands-agents==0.1.dev1+g252f896b4) + Downloading jaraco_functools-4.4.0-py3-none-any.whl.metadata (3.0 kB) +Collecting jaraco.context (from keyring>=23.5.0->hatch<2.0.0,>=1.0.0->strands-agents==0.1.dev1+g252f896b4) + Downloading jaraco_context-6.1.0-py3-none-any.whl.metadata (4.3 kB) +Requirement already satisfied: python-dotenv>=0.21.0 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from pydantic-settings>=2.5.2->mcp<2.0.0,>=1.11.0->strands-agents==0.1.dev1+g252f896b4) (1.2.1) +Requirement already satisfied: markdown-it-py>=2.2.0 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from rich>=11.2.0->hatch<2.0.0,>=1.0.0->strands-agents==0.1.dev1+g252f896b4) (4.0.0) +Requirement already satisfied: mdurl~=0.1 in /opt/hostedtoolcache/Python/3.13.11/x64/lib/python3.13/site-packages (from markdown-it-py>=2.2.0->rich>=11.2.0->hatch<2.0.0,>=1.0.0->strands-agents==0.1.dev1+g252f896b4) (0.1.2) +Collecting distlib<1,>=0.3.7 (from virtualenv>=20.26.6->hatch<2.0.0,>=1.0.0->strands-agents==0.1.dev1+g252f896b4) + Downloading distlib-0.4.0-py2.py3-none-any.whl.metadata (5.2 kB) +Collecting filelock<4,>=3.20.1 (from 
virtualenv>=20.26.6->hatch<2.0.0,>=1.0.0->strands-agents==0.1.dev1+g252f896b4) + Downloading filelock-3.20.3-py3-none-any.whl.metadata (2.1 kB) +Collecting more-itertools (from jaraco.classes->keyring>=23.5.0->hatch<2.0.0,>=1.0.0->strands-agents==0.1.dev1+g252f896b4) + Downloading more_itertools-10.8.0-py3-none-any.whl.metadata (39 kB) +Downloading commitizen-4.12.0-py3-none-any.whl (84 kB) +Downloading argcomplete-3.6.3-py3-none-any.whl (43 kB) +Downloading decli-0.6.3-py3-none-any.whl (8.0 kB) +Downloading deprecated-1.3.1-py2.py3-none-any.whl (11 kB) +Downloading hatch-1.16.3-py3-none-any.whl (141 kB) +Downloading moto-5.1.20-py3-none-any.whl (6.4 MB) + ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.4/6.4 MB 212.4 MB/s 0:00:00 +Downloading mypy-1.19.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (13.6 MB) + ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 13.6/13.6 MB 280.9 MB/s 0:00:00 +Downloading pexpect-4.9.0-py2.py3-none-any.whl (63 kB) +Downloading pre_commit-4.5.1-py2.py3-none-any.whl (226 kB) +Downloading pytest-8.4.2-py3-none-any.whl (365 kB) +Using cached pluggy-1.6.0-py3-none-any.whl (20 kB) +Downloading pytest_asyncio-1.3.0-py3-none-any.whl (15 kB) +Downloading pytest_cov-7.0.0-py3-none-any.whl (22 kB) +Downloading pytest_xdist-3.8.0-py3-none-any.whl (46 kB) +Downloading questionary-2.1.1-py3-none-any.whl (36 kB) +Downloading prompt_toolkit-3.0.51-py3-none-any.whl (387 kB) +Downloading ruff-0.14.13-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.9 MB) + ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 13.9/13.9 MB 253.3 MB/s 0:00:00 +Downloading termcolor-3.3.0-py3-none-any.whl (7.7 kB) +Downloading tomlkit-0.14.0-py3-none-any.whl (39 kB) +Downloading userpath-1.9.2-py3-none-any.whl (9.1 kB) +Downloading backports_zstd-1.3.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (490 kB) +Downloading cfgv-3.5.0-py2.py3-none-any.whl (7.4 kB) +Downloading coverage-7.13.1-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl (252 kB) +Downloading execnet-2.1.2-py3-none-any.whl (40 kB) +Using cached hatchling-1.28.0-py3-none-any.whl (76 kB) +Downloading hyperlink-21.0.0-py2.py3-none-any.whl (74 kB) +Downloading identify-2.6.16-py2.py3-none-any.whl (99 kB) +Downloading iniconfig-2.3.0-py3-none-any.whl (7.5 kB) +Downloading jinja2-3.1.6-py3-none-any.whl (134 kB) +Downloading keyring-25.7.0-py3-none-any.whl (39 kB) +Downloading jeepney-0.9.0-py3-none-any.whl (49 kB) +Downloading librt-0.7.8-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (193 kB) +Downloading markupsafe-3.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (22 kB) +Downloading mypy_extensions-1.1.0-py3-none-any.whl (5.0 kB) +Downloading nodeenv-1.10.0-py2.py3-none-any.whl (23 kB) +Using cached pathspec-1.0.3-py3-none-any.whl (55 kB) +Downloading platformdirs-4.5.1-py3-none-any.whl (18 kB) +Downloading ptyprocess-0.7.0-py2.py3-none-any.whl (13 kB) +Downloading pyyaml-6.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (801 kB) + ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 801.6/801.6 kB 120.7 MB/s 0:00:00 +Downloading responses-0.25.8-py3-none-any.whl (34 kB) +Downloading secretstorage-3.5.0-py3-none-any.whl (15 kB) +Downloading shellingham-1.5.4-py2.py3-none-any.whl (9.8 kB) +Downloading tomli_w-1.2.0-py3-none-any.whl (6.7 kB) +Downloading uv-0.9.26-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (23.3 MB) + 
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 23.3/23.3 MB 256.8 MB/s 0:00:00 +Downloading virtualenv-20.36.1-py3-none-any.whl (6.0 MB) + ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.0/6.0 MB 246.9 MB/s 0:00:00 +Downloading distlib-0.4.0-py2.py3-none-any.whl (469 kB) +Downloading filelock-3.20.3-py3-none-any.whl (16 kB) +Downloading werkzeug-3.1.5-py3-none-any.whl (225 kB) +Downloading jaraco.classes-3.4.0-py3-none-any.whl (6.8 kB) +Downloading jaraco_context-6.1.0-py3-none-any.whl (7.1 kB) +Downloading jaraco_functools-4.4.0-py3-none-any.whl (10 kB) +Downloading more_itertools-10.8.0-py3-none-any.whl (69 kB) +Downloading pyproject_hooks-1.2.0-py3-none-any.whl (10 kB) +Using cached trove_classifiers-2026.1.14.14-py3-none-any.whl (14 kB) +Downloading xmltodict-1.0.2-py3-none-any.whl (13 kB) +Building wheels for collected packages: strands-agents + Building editable for strands-agents (pyproject.toml): started + Building editable for strands-agents (pyproject.toml): finished with status 'done' + Created wheel for strands-agents: filename=strands_agents-0.1.dev1+g252f896b4-py3-none-any.whl size=10025 sha256=4b43428675806af2535aeae6fab9ed55519696f758f1b8f1c3a180704457d448 + Stored in directory: /tmp/pip-ephem-wheel-cache-xw66ray_/wheels/94/60/63/fc2d04fbd73b5e7d5ee8ee2c7af924f44becb4b085a98d9503 +Successfully built strands-agents +Installing collected packages: trove-classifiers, ptyprocess, distlib, xmltodict, uv, userpath, tomlkit, tomli-w, termcolor, shellingham, ruff, pyyaml, pyproject-hooks, prompt-toolkit, pluggy, platformdirs, pexpect, pathspec, nodeenv, mypy_extensions, more-itertools, MarkupSafe, librt, jeepney, jaraco.context, iniconfig, identify, hyperlink, filelock, execnet, deprecated, decli, coverage, cfgv, backports-zstd, argcomplete, werkzeug, virtualenv, responses, questionary, pytest, mypy, jinja2, jaraco.functools, jaraco.classes, hatchling, SecretStorage, pytest-xdist, pytest-cov, pytest-asyncio, pre-commit, commitizen, keyring, moto, hatch, strands-agents + Attempting uninstall: prompt-toolkit + Found existing installation: prompt_toolkit 3.0.52 + Uninstalling prompt_toolkit-3.0.52: + Successfully uninstalled prompt_toolkit-3.0.52 + Attempting uninstall: strands-agents + Found existing installation: strands-agents 1.23.0 + Uninstalling strands-agents-1.23.0: + Successfully uninstalled strands-agents-1.23.0 + +ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts. +strands-agents-tools 0.2.19 requires strands-agents>=1.0.0, but you have strands-agents 0.1.dev1+g252f896b4 which is incompatible. 
+Successfully installed MarkupSafe-3.0.3 SecretStorage-3.5.0 argcomplete-3.6.3 backports-zstd-1.3.0 cfgv-3.5.0 commitizen-4.12.0 coverage-7.13.1 decli-0.6.3 deprecated-1.3.1 distlib-0.4.0 execnet-2.1.2 filelock-3.20.3 hatch-1.16.3 hatchling-1.28.0 hyperlink-21.0.0 identify-2.6.16 iniconfig-2.3.0 jaraco.classes-3.4.0 jaraco.context-6.1.0 jaraco.functools-4.4.0 jeepney-0.9.0 jinja2-3.1.6 keyring-25.7.0 librt-0.7.8 more-itertools-10.8.0 moto-5.1.20 mypy-1.19.1 mypy_extensions-1.1.0 nodeenv-1.10.0 pathspec-1.0.3 pexpect-4.9.0 platformdirs-4.5.1 pluggy-1.6.0 pre-commit-4.5.1 prompt-toolkit-3.0.51 ptyprocess-0.7.0 pyproject-hooks-1.2.0 pytest-8.4.2 pytest-asyncio-1.3.0 pytest-cov-7.0.0 pytest-xdist-3.8.0 pyyaml-6.0.3 questionary-2.1.1 responses-0.25.8 ruff-0.14.13 shellingham-1.5.4 strands-agents-0.1.dev1+g252f896b4 termcolor-3.3.0 tomli-w-1.2.0 tomlkit-0.14.0 trove-classifiers-2026.1.14.14 userpath-1.9.2 uv-0.9.26 virtualenv-20.36.1 werkzeug-3.1.5 xmltodict-1.0.2 diff --git a/test_output.log b/test_output.log new file mode 100644 index 000000000..726a4d159 --- /dev/null +++ b/test_output.log @@ -0,0 +1 @@ +/bin/sh: 1: hatch: not found From ef95680bcc01cdf38d7ea48e56e29207f35c5f92 Mon Sep 17 00:00:00 2001 From: Strands Agent <217235299+strands-agent@users.noreply.github.com> Date: Wed, 21 Jan 2026 21:38:20 +0000 Subject: [PATCH 16/17] fix(hooks): address PR review feedback for batch event handling Address review feedback from PR #26: 1. Create ToolsInterruptEvent for batch-level interrupts (don't reuse ToolInterruptEvent) 2. Always fire AfterToolsEvent even when interrupted (maintain same contract as per-tool events) 3. Keep empty tool check in executors to avoid duplicate logic Changes: - Add ToolsInterruptEvent class for batch-level interrupts in types/_events.py - Update event loop to handle ToolsInterruptEvent - Update both executors to use ToolsInterruptEvent for batch interrupts - Ensure AfterToolsEvent always fires if BeforeToolsEvent fires - Update tests to reflect new contract (AfterToolsEvent fires even on interrupt) - All 233 tests passing --- src/strands/event_loop/event_loop.py | 4 ++++ src/strands/tools/executors/concurrent.py | 15 +++++++++------ src/strands/tools/executors/sequential.py | 20 +++++++++++--------- src/strands/types/_events.py | 22 ++++++++++++++++++++++ tests/strands/agent/test_agent_hooks.py | 15 +++++++++++---- 5 files changed, 57 insertions(+), 19 deletions(-) diff --git a/src/strands/event_loop/event_loop.py b/src/strands/event_loop/event_loop.py index f50c93ba1..9f7d6b7d9 100644 --- a/src/strands/event_loop/event_loop.py +++ b/src/strands/event_loop/event_loop.py @@ -31,6 +31,7 @@ StructuredOutputEvent, ToolInterruptEvent, ToolResultMessageEvent, + ToolsInterruptEvent, TypedEvent, ) from ..types.content import Message, Messages @@ -485,12 +486,15 @@ async def _handle_tool_execution( tool_uses = [tool_use for tool_use in tool_uses if tool_use["toolUseId"] not in tool_use_ids] interrupts = [] + tool_events = agent.tool_executor._execute( agent, message, tool_uses, tool_results, cycle_trace, cycle_span, invocation_state, structured_output_context ) async for tool_event in tool_events: if isinstance(tool_event, ToolInterruptEvent): interrupts.extend(tool_event["tool_interrupt_event"]["interrupts"]) + elif isinstance(tool_event, ToolsInterruptEvent): + interrupts.extend(tool_event["tools_interrupt_event"]["interrupts"]) yield tool_event diff --git a/src/strands/tools/executors/concurrent.py b/src/strands/tools/executors/concurrent.py index 91232c72d..c8fedd00d 
100644 --- a/src/strands/tools/executors/concurrent.py +++ b/src/strands/tools/executors/concurrent.py @@ -7,7 +7,7 @@ from ...hooks import AfterToolsEvent, BeforeToolsEvent from ...telemetry.metrics import Trace -from ...types._events import ToolInterruptEvent, TypedEvent +from ...types._events import ToolInterruptEvent, ToolsInterruptEvent, TypedEvent from ...types.content import Message from ...types.tools import ToolResult, ToolUse from ._executor import ToolExecutor @@ -47,17 +47,20 @@ async def _execute( Yields: Events from the tool execution stream. """ - # Skip batch events if no tools + # Skip batch events if no tools to execute if not tool_uses: return - + # Trigger BeforeToolsEvent before_event = BeforeToolsEvent(agent=agent, message=message, tool_uses=tool_uses) _, interrupts = await agent.hooks.invoke_callbacks_async(before_event) if interrupts: - # Use the first tool_use for the interrupt event (tools not executed yet) - yield ToolInterruptEvent(tool_uses[0], interrupts) + # Use ToolsInterruptEvent for batch-level interrupts + yield ToolsInterruptEvent(tool_uses, interrupts) + # Always fire AfterToolsEvent even if interrupted + after_event = AfterToolsEvent(agent=agent, message=message, tool_uses=tool_uses) + await agent.hooks.invoke_callbacks_async(after_event) return task_queue: asyncio.Queue[tuple[int, Any]] = asyncio.Queue() @@ -93,7 +96,7 @@ async def _execute( yield event task_events[task_id].set() - # Trigger AfterToolsEvent + # Always trigger AfterToolsEvent after_event = AfterToolsEvent(agent=agent, message=message, tool_uses=tool_uses) await agent.hooks.invoke_callbacks_async(after_event) diff --git a/src/strands/tools/executors/sequential.py b/src/strands/tools/executors/sequential.py index 36ca3580b..ab76ca4cb 100644 --- a/src/strands/tools/executors/sequential.py +++ b/src/strands/tools/executors/sequential.py @@ -6,7 +6,7 @@ from ...hooks import AfterToolsEvent, BeforeToolsEvent from ...telemetry.metrics import Trace -from ...types._events import ToolInterruptEvent, TypedEvent +from ...types._events import ToolInterruptEvent, ToolsInterruptEvent, TypedEvent from ...types.content import Message from ...types.tools import ToolResult, ToolUse from ._executor import ToolExecutor @@ -48,17 +48,20 @@ async def _execute( Yields: Events from the tool execution stream. 
""" - # Skip batch events if no tools + # Skip batch events if no tools to execute if not tool_uses: return - + # Trigger BeforeToolsEvent before_event = BeforeToolsEvent(agent=agent, message=message, tool_uses=tool_uses) _, interrupts = await agent.hooks.invoke_callbacks_async(before_event) if interrupts: - # Use the first tool_use for the interrupt event (tools not executed yet) - yield ToolInterruptEvent(tool_uses[0], interrupts) + # Use ToolsInterruptEvent for batch-level interrupts + yield ToolsInterruptEvent(tool_uses, interrupts) + # Always fire AfterToolsEvent even if interrupted + after_event = AfterToolsEvent(agent=agent, message=message, tool_uses=tool_uses) + await agent.hooks.invoke_callbacks_async(after_event) return interrupted = False @@ -76,7 +79,6 @@ async def _execute( if interrupted: break - # Only trigger AfterToolsEvent if no interrupts occurred - if not interrupted: - after_event = AfterToolsEvent(agent=agent, message=message, tool_uses=tool_uses) - await agent.hooks.invoke_callbacks_async(after_event) + # Always trigger AfterToolsEvent + after_event = AfterToolsEvent(agent=agent, message=message, tool_uses=tool_uses) + await agent.hooks.invoke_callbacks_async(after_event) diff --git a/src/strands/types/_events.py b/src/strands/types/_events.py index d64357cf8..6985df418 100644 --- a/src/strands/types/_events.py +++ b/src/strands/types/_events.py @@ -358,6 +358,28 @@ def interrupts(self) -> list[Interrupt]: return cast(list[Interrupt], self["tool_interrupt_event"]["interrupts"]) +class ToolsInterruptEvent(TypedEvent): + """Event emitted when a batch of tools is interrupted before execution. + + This event is fired when BeforeToolsEvent callbacks raise interrupts, + preventing the batch of tools from executing. + """ + + def __init__(self, tool_uses: list[ToolUse], interrupts: list[Interrupt]) -> None: + """Set batch interrupt in the event payload. + + Args: + tool_uses: The list of tools that would have been executed + interrupts: The interrupts raised during BeforeToolsEvent + """ + super().__init__({"tools_interrupt_event": {"tool_uses": tool_uses, "interrupts": interrupts}}) + + @property + def interrupts(self) -> list[Interrupt]: + """The interrupt instances.""" + return cast(list[Interrupt], self["tools_interrupt_event"]["interrupts"]) + + class ModelMessageEvent(TypedEvent): """Event emitted when the model invocation has completed. 
diff --git a/tests/strands/agent/test_agent_hooks.py b/tests/strands/agent/test_agent_hooks.py
index 17b0eb9c1..3f2baccaa 100644
--- a/tests/strands/agent/test_agent_hooks.py
+++ b/tests/strands/agent/test_agent_hooks.py
@@ -656,12 +656,15 @@ def sample_tool(x: int) -> int:

     assert len(result.interrupts) == 1
     assert result.interrupts[0].name == "batch-approval"

-    # BeforeToolsEvent should be triggered but AfterToolsEvent should not
+    # Both BeforeToolsEvent and AfterToolsEvent should be triggered
     batch_length, batch_events = batch_hook_provider.get_events()
-    assert batch_length == 1  # Only BeforeToolsEvent
-    assert isinstance(next(batch_events), BeforeToolsEvent)
+    assert batch_length == 2  # BeforeToolsEvent and AfterToolsEvent
+    event1 = next(batch_events)
+    event2 = next(batch_events)
+    assert isinstance(event1, BeforeToolsEvent)
+    assert isinstance(event2, AfterToolsEvent)

-    # No individual tool events should be triggered
+    # No individual tool events should be triggered (tools didn't execute)
     tool_length, _ = tool_hook_provider.get_events()
     assert tool_length == 0
@@ -700,6 +703,10 @@ def sample_tool(x: int) -> int:
     assert result.stop_reason == "interrupt"
     assert len(result.interrupts) == 1
     assert result.interrupts[0].name == "async-batch-approval"
+
+    # Both BeforeToolsEvent and AfterToolsEvent should be triggered
+    batch_length, _ = batch_hook_provider.get_events()
+    assert batch_length == 2


 def test_batch_events_with_tool_events():

From 6e21e03e2f91539ce8486834c3b3ced2a14bb05a Mon Sep 17 00:00:00 2001
From: Strands Agent <217235299+strands-agent@users.noreply.github.com>
Date: Wed, 21 Jan 2026 21:54:21 +0000
Subject: [PATCH 17/17] refactor(hooks): move batch event firing to event loop

Address additional PR review feedback:

1. Move batch event firing out of executors to event loop
   - Eliminates duplication between sequential and concurrent executors
   - BeforeToolsEvent and AfterToolsEvent now fired in _handle_tool_execution
2. Remove message parameter from _execute() signature
   - No longer needed since batch events fire in event loop
   - Simplifies executor interface
3. AfterToolsEvent does NOT fire on batch-level interrupt
   - Only fires if BeforeToolsEvent completes without interrupt
   - Per-tool interrupts still allow AfterToolsEvent to fire

All 233 tests passing with clean executor implementations.
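
For reviewers, a minimal sketch of the intended contract (illustrative only; fire_before_tools,
execute_tools, and fire_after_tools are hypothetical stand-ins for the hook and executor calls in
the diff below, not strands APIs):

async def handle_tool_execution(tool_uses, fire_before_tools, execute_tools, fire_after_tools):
    # No tools: no batch events at all.
    if not tool_uses:
        return []

    # BeforeToolsEvent fires once per batch, from the event loop.
    interrupts = await fire_before_tools(tool_uses)
    if interrupts:
        # Batch-level interrupt: tools never run and AfterToolsEvent is NOT fired.
        return interrupts

    # Per-tool events still fire inside the executor.
    await execute_tools(tool_uses)
    # AfterToolsEvent fires only on this non-interrupted path.
    await fire_after_tools(tool_uses)
    return []

The real implementation operates on agent.hooks and TypedEvents as shown in the diff; the sketch
only captures the ordering and the interrupt short-circuit.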
---
 src/strands/event_loop/event_loop.py      | 37 +++++++++++++++++------
 src/strands/tools/executors/_executor.py  |  6 ++--
 src/strands/tools/executors/concurrent.py | 26 +---------------
 src/strands/tools/executors/sequential.py | 26 +---------------
 tests/strands/agent/test_agent_hooks.py   | 10 +++---
 5 files changed, 35 insertions(+), 70 deletions(-)

diff --git a/src/strands/event_loop/event_loop.py b/src/strands/event_loop/event_loop.py
index 9f7d6b7d9..a63dbae30 100644
--- a/src/strands/event_loop/event_loop.py
+++ b/src/strands/event_loop/event_loop.py
@@ -15,7 +15,7 @@

 from opentelemetry import trace as trace_api

-from ..hooks import AfterModelCallEvent, BeforeModelCallEvent, MessageAddedEvent
+from ..hooks import AfterModelCallEvent, AfterToolsEvent, BeforeModelCallEvent, BeforeToolsEvent, MessageAddedEvent
 from ..telemetry.metrics import Trace
 from ..telemetry.tracer import Tracer, get_tracer
 from ..tools._validator import validate_and_prepare_tools
@@ -486,17 +486,34 @@ async def _handle_tool_execution(
     tool_uses = [tool_use for tool_use in tool_uses if tool_use["toolUseId"] not in tool_use_ids]

     interrupts = []
+    batch_interrupted = False

-    tool_events = agent.tool_executor._execute(
-        agent, message, tool_uses, tool_results, cycle_trace, cycle_span, invocation_state, structured_output_context
-    )
-    async for tool_event in tool_events:
-        if isinstance(tool_event, ToolInterruptEvent):
-            interrupts.extend(tool_event["tool_interrupt_event"]["interrupts"])
-        elif isinstance(tool_event, ToolsInterruptEvent):
-            interrupts.extend(tool_event["tools_interrupt_event"]["interrupts"])
+    # Fire BeforeToolsEvent if there are tools to execute
+    if tool_uses:
+        before_event = BeforeToolsEvent(agent=agent, message=message, tool_uses=tool_uses)
+        _, batch_interrupts = await agent.hooks.invoke_callbacks_async(before_event)
+
+        if batch_interrupts:
+            batch_interrupted = True
+            interrupts.extend(batch_interrupts)
+            # Yield ToolsInterruptEvent for batch-level interrupts
+            yield ToolsInterruptEvent(tool_uses, batch_interrupts)
+
+    # Only execute tools if not interrupted at batch level
+    if not batch_interrupted:
+        tool_events = agent.tool_executor._execute(
+            agent, tool_uses, tool_results, cycle_trace, cycle_span, invocation_state, structured_output_context
+        )
+        async for tool_event in tool_events:
+            if isinstance(tool_event, ToolInterruptEvent):
+                interrupts.extend(tool_event["tool_interrupt_event"]["interrupts"])

-        yield tool_event
+            yield tool_event
+
+    # Fire AfterToolsEvent if there are tools and no batch-level interrupt
+    if tool_uses and not batch_interrupted:
+        after_event = AfterToolsEvent(agent=agent, message=message, tool_uses=tool_uses)
+        await agent.hooks.invoke_callbacks_async(after_event)

     structured_output_result = None
     if structured_output_context.is_enabled:
diff --git a/src/strands/tools/executors/_executor.py b/src/strands/tools/executors/_executor.py
index d51c4947b..9e20294d0 100644
--- a/src/strands/tools/executors/_executor.py
+++ b/src/strands/tools/executors/_executor.py
@@ -315,7 +315,6 @@ async def _stream_with_trace(
     def _execute(
         self,
         agent: "Agent",
-        message: Message,
         tool_uses: list[ToolUse],
         tool_results: list[ToolResult],
         cycle_trace: Trace,
@@ -325,12 +324,11 @@ def _execute(
     ) -> AsyncGenerator[TypedEvent, None]:
         """Execute the given tools according to this executor's strategy.

-        This method is responsible for executing the tools in a batch and triggering
-        the BeforeToolsEvent and AfterToolsEvent hooks at the appropriate times.
+        BeforeToolsEvent and AfterToolsEvent hooks are triggered by the event loop,
+        not by the executor implementations.

         Args:
             agent: The agent for which tools are being executed.
-            message: The message from the model containing tool use blocks.
             tool_uses: Metadata and inputs for the tools to be executed.
             tool_results: List of tool results from each tool execution.
             cycle_trace: Trace object for the current event loop cycle.
diff --git a/src/strands/tools/executors/concurrent.py b/src/strands/tools/executors/concurrent.py
index c8fedd00d..216eee379 100644
--- a/src/strands/tools/executors/concurrent.py
+++ b/src/strands/tools/executors/concurrent.py
@@ -5,10 +5,8 @@

 from typing_extensions import override

-from ...hooks import AfterToolsEvent, BeforeToolsEvent
 from ...telemetry.metrics import Trace
-from ...types._events import ToolInterruptEvent, ToolsInterruptEvent, TypedEvent
-from ...types.content import Message
+from ...types._events import TypedEvent
 from ...types.tools import ToolResult, ToolUse
 from ._executor import ToolExecutor
@@ -24,7 +22,6 @@ class ConcurrentToolExecutor(ToolExecutor):
     async def _execute(
         self,
         agent: "Agent",
-        message: Message,
         tool_uses: list[ToolUse],
         tool_results: list[ToolResult],
         cycle_trace: Trace,
@@ -36,7 +33,6 @@ async def _execute(

         Args:
             agent: The agent for which tools are being executed.
-            message: The message from the model containing tool use blocks.
             tool_uses: Metadata and inputs for the tools to be executed.
             tool_results: List of tool results from each tool execution.
             cycle_trace: Trace object for the current event loop cycle.
@@ -47,22 +43,6 @@ async def _execute(
         Yields:
             Events from the tool execution stream.
         """
-        # Skip batch events if no tools to execute
-        if not tool_uses:
-            return
-
-        # Trigger BeforeToolsEvent
-        before_event = BeforeToolsEvent(agent=agent, message=message, tool_uses=tool_uses)
-        _, interrupts = await agent.hooks.invoke_callbacks_async(before_event)
-
-        if interrupts:
-            # Use ToolsInterruptEvent for batch-level interrupts
-            yield ToolsInterruptEvent(tool_uses, interrupts)
-            # Always fire AfterToolsEvent even if interrupted
-            after_event = AfterToolsEvent(agent=agent, message=message, tool_uses=tool_uses)
-            await agent.hooks.invoke_callbacks_async(after_event)
-            return
-
         task_queue: asyncio.Queue[tuple[int, Any]] = asyncio.Queue()
         task_events = [asyncio.Event() for _ in tool_uses]
         stop_event = object()
@@ -93,10 +76,6 @@ async def _execute(
                 yield event
                 task_events[task_id].set()

-        # Always trigger AfterToolsEvent
-        after_event = AfterToolsEvent(agent=agent, message=message, tool_uses=tool_uses)
-        await agent.hooks.invoke_callbacks_async(after_event)
-
     async def _task(
         self,
         agent: "Agent",
diff --git a/src/strands/tools/executors/sequential.py b/src/strands/tools/executors/sequential.py
index ab76ca4cb..f78e60872 100644
--- a/src/strands/tools/executors/sequential.py
+++ b/src/strands/tools/executors/sequential.py
@@ -4,10 +4,8 @@

 from typing_extensions import override

-from ...hooks import AfterToolsEvent, BeforeToolsEvent
 from ...telemetry.metrics import Trace
-from ...types._events import ToolInterruptEvent, ToolsInterruptEvent, TypedEvent
-from ...types.content import Message
+from ...types._events import ToolInterruptEvent, TypedEvent
 from ...types.tools import ToolResult, ToolUse
 from ._executor import ToolExecutor
@@ -23,7 +21,6 @@ class SequentialToolExecutor(ToolExecutor):
     async def _execute(
         self,
         agent: "Agent",
-        message: Message,
         tool_uses: list[ToolUse],
         tool_results: list[ToolResult],
         cycle_trace: Trace,
@@ -37,7 +34,6 @@ async def _execute(

         Args:
             agent: The agent for which tools are being executed.
-            message: The message from the model containing tool use blocks.
             tool_uses: Metadata and inputs for the tools to be executed.
             tool_results: List of tool results from each tool execution.
             cycle_trace: Trace object for the current event loop cycle.
@@ -48,22 +44,6 @@ async def _execute(
         Yields:
             Events from the tool execution stream.
         """
-        # Skip batch events if no tools to execute
-        if not tool_uses:
-            return
-
-        # Trigger BeforeToolsEvent
-        before_event = BeforeToolsEvent(agent=agent, message=message, tool_uses=tool_uses)
-        _, interrupts = await agent.hooks.invoke_callbacks_async(before_event)
-
-        if interrupts:
-            # Use ToolsInterruptEvent for batch-level interrupts
-            yield ToolsInterruptEvent(tool_uses, interrupts)
-            # Always fire AfterToolsEvent even if interrupted
-            after_event = AfterToolsEvent(agent=agent, message=message, tool_uses=tool_uses)
-            await agent.hooks.invoke_callbacks_async(after_event)
-            return
-
         interrupted = False

         for tool_use in tool_uses:
@@ -78,7 +58,3 @@ async def _execute(

             if interrupted:
                 break
-
-        # Always trigger AfterToolsEvent
-        after_event = AfterToolsEvent(agent=agent, message=message, tool_uses=tool_uses)
-        await agent.hooks.invoke_callbacks_async(after_event)
diff --git a/tests/strands/agent/test_agent_hooks.py b/tests/strands/agent/test_agent_hooks.py
index 3f2baccaa..841dc2fb5 100644
--- a/tests/strands/agent/test_agent_hooks.py
+++ b/tests/strands/agent/test_agent_hooks.py
@@ -656,13 +656,11 @@ def sample_tool(x: int) -> int:
     assert len(result.interrupts) == 1
     assert result.interrupts[0].name == "batch-approval"

-    # Both BeforeToolsEvent and AfterToolsEvent should be triggered
+    # Only BeforeToolsEvent should be triggered (AfterToolsEvent NOT fired on batch interrupt)
     batch_length, batch_events = batch_hook_provider.get_events()
-    assert batch_length == 2  # BeforeToolsEvent and AfterToolsEvent
+    assert batch_length == 1  # Only BeforeToolsEvent
     event1 = next(batch_events)
-    event2 = next(batch_events)
     assert isinstance(event1, BeforeToolsEvent)
-    assert isinstance(event2, AfterToolsEvent)

     # No individual tool events should be triggered (tools didn't execute)
     tool_length, _ = tool_hook_provider.get_events()
@@ -704,9 +702,9 @@ def sample_tool(x: int) -> int:
     assert len(result.interrupts) == 1
     assert result.interrupts[0].name == "async-batch-approval"

-    # Both BeforeToolsEvent and AfterToolsEvent should be triggered
+    # Only BeforeToolsEvent should be triggered (AfterToolsEvent NOT fired on batch interrupt)
     batch_length, _ = batch_hook_provider.get_events()
-    assert batch_length == 2
+    assert batch_length == 1


 def test_batch_events_with_tool_events():