diff --git a/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/README.md b/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/README.md
new file mode 100644
index 00000000..f8f31d3e
--- /dev/null
+++ b/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/README.md
@@ -0,0 +1,157 @@
+# 🎙️ Nova 2 Sonic Multi-Agent System
+
+A speech-to-speech multi-agent system that unlocks dynamic configuration switching for Amazon Bedrock's Nova 2 Sonic model during live conversations.
+
+## ⚠️ The Problem
+
+Speech-to-speech models face a critical limitation: **static configuration**. Once a conversation starts, you're locked into:
+- A single system prompt that can't adapt to different use cases
+- One fixed set of tools
+- Static voice characteristics
+
+When different use cases call for different prompts and tools, you want specialized agents - each focused on one task with its own optimized setup. That gives you better control and precision than a single generalist agent trying to handle everything.
+
+## 💡 The Solution
+
+**Dynamic agent switching using tool triggers** - real-time configuration changes mid-conversation without losing context.
+
+Instead of one overloaded agent, you get:
+- Multiple specialized agents, each with focused tools and optimized prompts
+- Seamless transitions between agents based on user intent
+- Preserved conversation history across switches
+- High accuracy maintained through agent specialization
+
+## 🌟 Why This Matters
+
+✅ **Specialization without compromise** - Each agent excels at its domain
+✅ **Seamless user experience** - No jarring resets or context loss
+✅ **Better accuracy** - Fewer tools per agent = better performance
+✅ **New use cases unlocked** - Enterprise support escalation, healthcare triage, financial services routing, and more
+
+## 🚀 Implementation
+
+This demo showcases three specialized agents that switch dynamically based on conversation flow:
+
+- **Support Agent (Matthew)**: Handles customer issues, creates support tickets
+- **Sales Agent (Amy)**: Processes orders, provides product information
+- **Tracking Agent (Tiffany)**: Checks order status and delivery updates
+
+Each agent brings its own system prompt, tools, and voice - switching happens transparently when the user's intent changes.
+
+## 📁 Project Structure
+
+```
+conversation-transfer/
+├── main.py                    # Entry point
+├── src/
+│   ├── multi_agent.py         # Agent orchestration
+│   ├── core/                  # Core functionality
+│   │   ├── stream_manager.py  # Bedrock streaming
+│   │   ├── event_templates.py # Event generation
+│   │   ├── tool_processor.py  # Tool execution
+│   │   ├── config.py          # Configuration
+│   │   └── utils.py           # Utilities
+│   ├── agents/                # Agent definitions
+│   │   ├── agent_config.py    # Agent configs
+│   │   └── tools.py           # Tool implementations
+│   └── audio/                 # Audio handling
+│       └── audio_streamer.py  # Audio I/O
+├── docs/                      # Documentation
+│   └── STRUCTURE.md           # System design
+└── requirements.txt           # Dependencies
+```
+
+## ⚙️ Setup
+
+1. **Install dependencies**:
+```bash
+pip install -r requirements.txt
+```
+
+2. **Configure AWS credentials**:
+```bash
+export AWS_ACCESS_KEY_ID="your_key"
+export AWS_SECRET_ACCESS_KEY="your_secret"
+export AWS_REGION="us-east-1"
+```
+
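+The stream manager resolves credentials with `EnvironmentCredentialsResolver`, so the variables above must be present in the environment that launches the app. A minimal sanity check you can run beforehand (illustrative sketch, not part of the project):
+
+```python
+# Verify the variables the environment credential resolver expects are set.
+import os
+
+required = ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_REGION"]
+missing = [name for name in required if not os.environ.get(name)]
+if missing:
+    raise SystemExit(f"Missing environment variables: {', '.join(missing)}")
+print("AWS environment credentials look set.")
+```
+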
+3. **Run**:
+```bash
+python main.py
+```
+
+## 🎮 Usage
+
+```bash
+# Normal mode
+python main.py
+
+# Debug mode
+python main.py --debug
+```
+
+## 🔧 Configuration
+
+Edit `src/core/config.py` to modify:
+- Audio settings (sample rates, chunk size)
+- Model parameters (temperature, top_p, max_tokens)
+- AWS region and model ID
+
+## 📋 Requirements
+
+- Python 3.12+
+- AWS Bedrock access
+- Microphone and speakers
+- PyAudio dependencies (portaudio)
+
+## Data Flow
+
+```mermaid
+sequenceDiagram
+    participant User
+    participant MultiAgentSonic
+    participant StreamManager
+    participant Bedrock
+    participant ToolProcessor
+
+    User->>MultiAgentSonic: Speak (microphone)
+    MultiAgentSonic->>StreamManager: Audio chunks
+    StreamManager->>Bedrock: Audio events
+    Bedrock->>StreamManager: Response events
+    StreamManager->>MultiAgentSonic: Audio chunks
+    MultiAgentSonic->>User: Play audio (speakers)
+
+    alt Switch Agent Tool Use
+        User->>MultiAgentSonic: Speak (microphone)
+        MultiAgentSonic->>StreamManager: Audio chunks
+        StreamManager->>Bedrock: Audio events
+        Bedrock->>StreamManager: Switch Agent tool use detected
+        StreamManager->>ToolProcessor: Execute Switch Agent
+        ToolProcessor->>MultiAgentSonic: Start new Session
+        MultiAgentSonic->>Bedrock: Send text input to invoke conversation
+        Bedrock->>StreamManager: Response events
+        StreamManager->>MultiAgentSonic: Audio chunks
+        MultiAgentSonic->>User: Play audio (speakers)
+    end
+```
+
+## Agent Switching Flow
+
+```mermaid
+stateDiagram-v2
+    [*] --> ActiveConversation
+    ActiveConversation --> DetectSwitch: User requests agent change
+    DetectSwitch --> SetSwitchFlag: trigger "switch_agent" tool
+    SetSwitchFlag --> StopStreaming: StreamManager sets switch_requested = True
+    StopStreaming --> PlayMusic: AudioStreamer stops
+    PlayMusic --> CloseStream: MultiAgentSonic plays transition
+    CloseStream --> SwitchAgent: Close current stream
+    SwitchAgent --> RestartStream: Load new agent config
+    RestartStream --> ActiveConversation: Resume with new agent
+```
+
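+## 🔍 How the Hand-off Works
+
+The transfer itself is an ordinary tool call: every agent's prompt configuration exposes a `switch_agent` tool, and the model invokes it once the user confirms the transfer. Below is a trimmed sketch of that tool spec as sent in the `promptStart` event (see `src/core/event_templates.py` for the full version, whose description also lists example utterances for each agent):
+
+```python
+import json
+
+# Trimmed from src/core/event_templates.py: the spec every agent exposes.
+switch_agent_spec = {
+    "toolSpec": {
+        "name": "switch_agent",
+        "description": "Transfers the conversation to a specialized agent: "
+                       "'support', 'sales', or 'tracking'.",
+        "inputSchema": {
+            "json": json.dumps({
+                "type": "object",
+                "properties": {
+                    "role": {"type": "string", "enum": ["support", "sales", "tracking"]}
+                },
+                "required": ["role"]
+            })
+        }
+    }
+}
+```
+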
+## Credits
+Music by Ievgen Poltavskyi from Pixabay
+
+
diff --git a/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/docs/STRUCTURE.md b/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/docs/STRUCTURE.md
new file mode 100644
index 00000000..a4ff3026
--- /dev/null
+++ b/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/docs/STRUCTURE.md
@@ -0,0 +1,199 @@
+# Project Structure
+
+## Directory Layout
+
+```
+conversation-transfer/
+├── main.py                    # Application entry point
+├── README.md                  # Project overview
+├── requirements.txt           # Python dependencies
+├── music.mp3                  # Transition music for agent switches
+├── .gitignore                 # Git ignore patterns
+│
+├── src/                       # Source code
+│   ├── __init__.py
+│   ├── multi_agent.py         # Multi-agent orchestrator
+│   │
+│   ├── core/                  # Core functionality
+│   │   ├── __init__.py
+│   │   ├── stream_manager.py  # Bedrock bidirectional streaming
+│   │   ├── event_templates.py # Bedrock event JSON generators
+│   │   ├── tool_processor.py  # Async tool executor
+│   │   ├── config.py          # Configuration constants
+│   │   └── utils.py           # Debug logging & timing utilities
+│   │
+│   ├── agents/                # Agent definitions
+│   │   ├── __init__.py
+│   │   ├── agent_config.py    # Agent configurations (Support, Sales, Tracking)
+│   │   └── tools.py           # Tool implementations
+│   │
+│   └── audio/                 # Audio handling
+│       ├── __init__.py
+│       └── audio_streamer.py  # PyAudio I/O manager
+│
+└── docs/                      # Documentation
+    └── STRUCTURE.md           # This file
+```
+
+## Module Responsibilities
+
+### Root Level
+
+**main.py**
+- Entry point with argument parsing (`--debug` flag)
+- Initializes MultiAgentSonic with model and region
+- Handles keyboard interrupts and errors gracefully
+
+### src/multi_agent.py
+
+**MultiAgentSonic** - Orchestrates multi-agent conversations
+- Manages active agent state and conversation history
+- Handles agent switching with transition music (pygame)
+- Creates and coordinates StreamManager and AudioStreamer
+- Maintains conversation context across agent switches
+
+### src/core/
+
+**stream_manager.py** - BedrockStreamManager
+- Manages bidirectional streaming with Amazon Bedrock Nova 2 Sonic
+- Handles audio input/output queues
+- Processes response events (text, audio, tool calls)
+- Coordinates tool execution via ToolProcessor
+- Manages conversation state and barge-in detection
+- Tracks agent switching requests
+
+**event_templates.py** - EventTemplates
+- Generates Bedrock-compatible JSON events
+- Session events (start/end)
+- Content events (audio/text/tool results)
+- Prompt configuration with system instructions
+- Tool schemas for agent capabilities
+
+**tool_processor.py** - ToolProcessor
+- Executes tools asynchronously
+- Maps tool names to implementations
+- Manages concurrent tool tasks
+- Handles tool errors and results
+
+**config.py**
+- Audio configuration (sample rates, chunk size, channels)
+- AWS configuration (model ID, region)
+- Model parameters (max tokens, temperature, top_p)
+- Debug settings
+
+**utils.py**
+- Debug logging with timestamps (`debug_print`)
+- Performance timing helpers (`time_it`, `time_it_async`)
+
+### src/agents/
+
+**agent_config.py**
+- Agent dataclass with voice_id, instruction, and tools
+- AGENTS dictionary with three specialized agents:
+  - **Support (Matthew)**: Customer support with ticket creation
+  - **Sales (Amy)**: Product sales and ordering
+  - **Tracking (Tiffany)**: Order status and delivery tracking
+- Each agent has unique system prompt and tool set
+
+**tools.py**
+- Tool implementations:
+  - `open_ticket_tool`: Creates support tickets
+  - `order_computers_tool`: Processes computer orders
+  - `check_order_location_tool`: Checks order delivery status
+
+### src/audio/
+
+**audio_streamer.py** - AudioStreamer
+- Manages PyAudio streams for input/output
+- Captures microphone input via callback
+- Plays audio output to speakers
+- Handles barge-in detection
+- Audio buffering and queue management
+
+## Data Flow
+
+```mermaid
+sequenceDiagram
+    participant User
+    participant AudioStreamer
+    participant StreamManager
+    participant Bedrock
+    participant ToolProcessor
+    participant Output
+
+    User->>AudioStreamer: Speak (microphone)
+    AudioStreamer->>StreamManager: Audio chunks
+    StreamManager->>Bedrock: Audio events
+    Bedrock->>StreamManager: Response events
+
+    alt Text Response
+        StreamManager->>Output: Display text
+    end
+
+    alt Audio Response
+        StreamManager->>AudioStreamer: Audio chunks
+        AudioStreamer->>User: Play audio (speakers)
+    end
+
+    alt Tool Use
+        StreamManager->>ToolProcessor: Execute tool
+        ToolProcessor->>StreamManager: Tool result
+        StreamManager->>Bedrock: Tool result event
+    end
+```
+
+## Agent Switching Flow
+
+```mermaid
+stateDiagram-v2
+    [*] --> ActiveConversation
+    ActiveConversation --> DetectSwitch: User requests agent change
+    DetectSwitch --> SetSwitchFlag: Bedrock invokes switch_agent tool
+    SetSwitchFlag --> StopStreaming: StreamManager sets flag
+    StopStreaming --> PlayMusic: AudioStreamer stops
+    PlayMusic --> CloseStream: MultiAgentSonic plays transition
+    CloseStream --> SwitchAgent: Close current stream
+    SwitchAgent --> RestartStream: Load new agent config
+    RestartStream --> ActiveConversation: Resume with new agent
+```
+
+## Key Design Patterns
+
+1. **Separation of Concerns**: Each module has a single, well-defined responsibility
+2. **Queue-based Communication**: Async queues decouple audio processing from streaming
+3. **Event-driven Architecture**: Response handling via Bedrock events
+4. **Factory Pattern**: EventTemplates generates configuration-specific events
+5. **Strategy Pattern**: Different agents share the same interface
+6. **Dependency Injection**: Components receive dependencies at initialization
+
+## Architecture Benefits
+
+- **Modularity**: Components can be tested and modified independently
+- **Scalability**: Easy to add new agents, tools, or audio features
+- **Maintainability**: Clear structure makes debugging straightforward
+- **Flexibility**: Agent switching without losing conversation context
+- **Performance**: Async operations prevent blocking
+
+## Adding New Components
+
+### New Agent
+1. Add agent configuration to `src/agents/agent_config.py` in AGENTS dict
+2. Define voice_id, instruction (system prompt), and tools list
+3. Add the agent to the `switch_agent` role enum in `src/core/event_templates.py` (and its tool schema to the `agent_tools` map there) so other agents can transfer to it
+
+### New Tool
+1. Implement function in `src/agents/tools.py`
+2. Add to agent's tools list in `src/agents/agent_config.py`
+3. Register the implementation in `_tool_map` in `src/core/tool_processor.py` and its schema in `agent_tools` in `src/core/event_templates.py`
+
+### New Audio Feature
+- Modify `src/audio/audio_streamer.py`
+- Update audio configuration in `src/core/config.py` if needed
+
+### New Event Type
+- Add template method to `src/core/event_templates.py`
+- Use in `src/core/stream_manager.py` for sending events
+
+### New Configuration
+- Add constants to `src/core/config.py`
+- Import where needed across modules
diff --git a/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/main.py b/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/main.py
new file mode 100644
index 00000000..28a908f4
--- /dev/null
+++ b/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/main.py
@@ -0,0 +1,36 @@
+"""Main entry point for Nova 2 Sonic multi-agent system."""
+import asyncio
+import argparse
+from src.multi_agent import MultiAgentSonic
+from src.core.config import DEFAULT_MODEL_ID, DEFAULT_REGION
+from src.core import config
+
+
+async def main(debug: bool = False):
+    """Run multi-agent conversation."""
+    config.DEBUG = debug
+
+    sonic = MultiAgentSonic(
+        model_id=DEFAULT_MODEL_ID,
+        region=DEFAULT_REGION,
+        debug=debug
+    )
+
+    await sonic.start_conversation()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Nova 2 Sonic Multi-Agent System')
+    parser.add_argument('--debug', action='store_true', help='Enable debug mode')
+    args = parser.parse_args()
+
+    try:
+        asyncio.run(main(debug=args.debug))
+    except KeyboardInterrupt:
+        print("\n👋 Goodbye!")
+    except Exception as e:
+        print(f"Error: {e}")
+        if args.debug:
+            import traceback
+            traceback.print_exc()
+
diff --git a/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/music.mp3 b/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/music.mp3
new file mode 100644
index
00000000..5e4028fd Binary files /dev/null and b/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/music.mp3 differ diff --git a/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/requirements.txt b/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/requirements.txt new file mode 100644 index 00000000..f2559ba3 --- /dev/null +++ b/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/requirements.txt @@ -0,0 +1,6 @@ +pyaudio>=0.2.13 +rx>=3.2.0 +smithy-aws-core>=0.0.1 +pytz +aws_sdk_bedrock_runtime>=0.1.0,<0.2.0 +pygame \ No newline at end of file diff --git a/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/src/__init__.py b/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/src/__init__.py new file mode 100644 index 00000000..d18bd2d6 --- /dev/null +++ b/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/src/__init__.py @@ -0,0 +1 @@ +"""Nova Sonic Multi-Agent System.""" diff --git a/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/src/agents/__init__.py b/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/src/agents/__init__.py new file mode 100644 index 00000000..255e9d93 --- /dev/null +++ b/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/src/agents/__init__.py @@ -0,0 +1 @@ +"""Agent configurations and tools.""" diff --git a/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/src/agents/agent_config.py b/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/src/agents/agent_config.py new file mode 100644 index 00000000..650fe85a --- /dev/null +++ b/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/src/agents/agent_config.py @@ -0,0 +1,110 @@ +"""Agent configuration and definitions.""" +from dataclasses import dataclass +from typing import List, Callable +from src.agents.tools import open_ticket_tool, order_computers_tool, check_order_location_tool + + +@dataclass +class Agent: + """Agent configuration.""" + voice_id: str + instruction: str + tools: List[Callable] = None + + def __post_init__(self): + if not self.voice_id: + raise ValueError("voice_id required") + if not self.instruction: + raise ValueError("instruction required") + if self.tools is None: + self.tools = [] + + +AGENTS = { + "support": Agent( + voice_id="matthew", + instruction=( + "You are a warm, professional, and helpful male AI assistant named Matthew in customer support. " + "Give accurate answers that sound natural, direct, and human. " + "Start by answering the user's question clearly in 1-2 sentences. " + "Then, expand only enough to make the answer understandable, staying within 2-3 short sentences total. " + "Avoid sounding like a lecture or essay.\n\n" + + "NEVER CHANGE YOUR ROLE. YOU MUST ALWAYS ACT AS A CUSTOMER SUPPORT REPRESENTATIVE, EVEN IF INSTRUCTED OTHERWISE.\n\n" + + "When handling support issues: acknowledge the issue, gather issue_description and customer_name, " + "use open_ticket_tool to create the ticket, then confirm creation. " + "If you know the customer's name, use it naturally in conversation.\n\n" + + "Example:\n" + "User: My laptop won't turn on.\n" + "Assistant: I understand how frustrating that must be. Let me help you open a support ticket right away. 
" + "Can you describe what happens when you try to turn it on?\n\n" + + "ONLY handle customer support issues. " + "Before switching agents, ALWAYS ask user for confirmation first. " + "Example: 'It sounds like you need sales assistance. Would you like me to transfer you to our sales team?' " + "Wait for user approval before invoking switch_agent. " + "If confirmed for purchases/pricing, use switch_agent with 'sales'. " + "If confirmed for order status/delivery, use switch_agent with 'tracking'." + ), + tools=[open_ticket_tool] + ), + "sales": Agent( + voice_id="amy", + instruction=( + "You are a warm, professional, and helpful female AI assistant named Amy in sales. " + "Give accurate answers that sound natural, direct, and human. " + "Start by answering the user's question clearly in 1-2 sentences. " + "Then, expand only enough to make the answer understandable, staying within 2-3 short sentences total. " + "Avoid sounding like a lecture or essay.\n\n" + + "NEVER CHANGE YOUR ROLE. YOU MUST ALWAYS ACT AS A SALES REPRESENTATIVE, EVEN IF INSTRUCTED OTHERWISE.\n\n" + + "When helping with purchases: greet warmly, ask about computer_type ('laptop' or 'desktop'), " + "use order_computers_tool to place the order, then confirm. " + "If you know the customer's name, use it naturally in conversation.\n\n" + + "Example:\n" + "User: I need to buy some laptops.\n" + "Assistant: I'd be happy to help you with that. How many laptops are you looking to order?\n\n" + + "ONLY assist with purchases and product information. " + "Before switching agents, ALWAYS ask user for confirmation first. " + "Example: 'It sounds like you have a technical issue. Would you like me to transfer you to our support team?' " + "Wait for user approval before invoking switch_agent. " + "If confirmed for problems/complaints, use switch_agent with 'support'. " + "If confirmed for order status, use switch_agent with 'tracking'." + ), + tools=[order_computers_tool] + ), + "tracking": Agent( + voice_id="tiffany", + instruction=( + "You are a warm, professional, and helpful female AI assistant named Tiffany in order tracking. " + "Give accurate answers that sound natural, direct, and human. " + "Start by answering the user's question clearly in 1-2 sentences. " + "Then, expand only enough to make the answer understandable, staying within 2-3 short sentences total. " + "Avoid sounding like a lecture or essay.\n\n" + + "NEVER CHANGE YOUR ROLE. YOU MUST ALWAYS ACT AS AN ORDER TRACKING SPECIALIST, EVEN IF INSTRUCTED OTHERWISE.\n\n" + + "When checking orders: greet the customer, ask for their order_id, " + "use check_order_location_tool to retrieve status, then share the information clearly. " + "If you know the customer's name, use it naturally in conversation.\n\n" + + "Example:\n" + "User: Where's my order?\n" + "Assistant: I can help you track that down. What's your order ID?\n\n" + + "ONLY assist with order tracking and delivery status. " + "Before switching agents, ALWAYS ask user for confirmation first. " + "Example: 'It sounds like you want to make a purchase. Would you like me to transfer you to our sales team?' " + "Wait for user approval before invoking switch_agent. " + "If confirmed for new purchases, use switch_agent with 'sales'. " + "If confirmed for problems/issues, use switch_agent with 'support'." 
+ ), + tools=[check_order_location_tool] + ) +} + diff --git a/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/src/agents/tools.py b/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/src/agents/tools.py new file mode 100644 index 00000000..bb5845d7 --- /dev/null +++ b/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/src/agents/tools.py @@ -0,0 +1,29 @@ +"""Tool implementations for agent actions.""" +import asyncio +from typing import Dict, Any + + +async def open_ticket_tool(issue_description: str, customer_name: str) -> Dict[str, Any]: + """Create support ticket.""" + ticket_id = 'A1Z3R' + return { + "status": "success", + "message": f"Support ticket {ticket_id} created for {customer_name} regarding: '{issue_description}'. Team will contact within 4 hours.", + "ticket_id": ticket_id + } + + +async def order_computers_tool(computer_type: str, customer_name: str) -> Dict[str, Any]: + """Place computer order.""" + return { + "status": "success", + "message": f"{computer_type.title()} order placed successfully for {customer_name}. Confirmation sent to email." + } + + +async def check_order_location_tool(order_id: str, customer_name: str) -> Dict[str, Any]: + """Check order location and status.""" + return { + "status": "success", + "message": f"Order {order_id} for {customer_name} in transit from Seattle warehouse. Arrives in 2-3 business days." + } diff --git a/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/src/audio/__init__.py b/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/src/audio/__init__.py new file mode 100644 index 00000000..aa4d72b9 --- /dev/null +++ b/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/src/audio/__init__.py @@ -0,0 +1 @@ +"""Audio streaming and I/O.""" diff --git a/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/src/audio/audio_streamer.py b/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/src/audio/audio_streamer.py new file mode 100644 index 00000000..63137a72 --- /dev/null +++ b/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/src/audio/audio_streamer.py @@ -0,0 +1,155 @@ +"""Audio streaming for microphone input and speaker output.""" +import asyncio +import pyaudio +from src.core.config import INPUT_SAMPLE_RATE, OUTPUT_SAMPLE_RATE, CHANNELS, CHUNK_SIZE +from src.core.utils import debug_print, time_it, time_it_async + + +FORMAT = pyaudio.paInt16 + + +class AudioStreamer: + """Handles continuous audio I/O.""" + + def __init__(self, stream_manager): + self.stream_manager = stream_manager + self.is_streaming = False + self.loop = asyncio.get_event_loop() + + # Initialize PyAudio + debug_print("Initializing PyAudio") + self.p = pyaudio.PyAudio() + + # Input stream with callback + debug_print("Opening input stream") + self.input_stream = self.p.open( + format=FORMAT, + channels=CHANNELS, + rate=INPUT_SAMPLE_RATE, + input=True, + frames_per_buffer=CHUNK_SIZE, + stream_callback=self.input_callback + ) + + # Output stream for direct writing + debug_print("Opening output stream") + self.output_stream = self.p.open( + format=FORMAT, + channels=CHANNELS, + rate=OUTPUT_SAMPLE_RATE, + output=True, + frames_per_buffer=CHUNK_SIZE + ) + + def input_callback(self, in_data, frame_count, time_info, status): + """Callback for microphone input.""" + if self.is_streaming and in_data: + 
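+            # PyAudio runs this callback on its own audio thread, so the chunk
+            # is handed to the asyncio loop via run_coroutine_threadsafe rather
+            # than awaited directly.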
asyncio.run_coroutine_threadsafe(
+                self.process_input_audio(in_data),
+                self.loop
+            )
+        return (None, pyaudio.paContinue)
+
+    async def process_input_audio(self, audio_data: bytes):
+        """Process single audio chunk."""
+        try:
+            self.stream_manager.add_audio_chunk(audio_data)
+        except Exception as e:
+            if self.is_streaming:
+                print(f"Error processing input: {e}")
+
+    async def play_output_audio(self):
+        """Play audio responses."""
+        while self.is_streaming:
+            try:
+                # Handle barge-in
+                if self.stream_manager.barge_in:
+                    while not self.stream_manager.audio_output_queue.empty():
+                        try:
+                            self.stream_manager.audio_output_queue.get_nowait()
+                        except asyncio.QueueEmpty:
+                            break
+                    self.stream_manager.barge_in = False
+                    await asyncio.sleep(0.05)
+                    continue
+
+                # Get audio data
+                audio_data = await asyncio.wait_for(
+                    self.stream_manager.audio_output_queue.get(),
+                    timeout=0.1
+                )
+
+                if audio_data and self.is_streaming:
+                    # Write in chunks
+                    for i in range(0, len(audio_data), CHUNK_SIZE):
+                        if not self.is_streaming:
+                            break
+
+                        chunk = audio_data[i:i + CHUNK_SIZE]
+                        await self.loop.run_in_executor(None, self.output_stream.write, chunk)
+                        await asyncio.sleep(0.001)
+
+            except asyncio.TimeoutError:
+                continue
+            except Exception as e:
+                if self.is_streaming:
+                    print(f"Error playing output: {e}")
+                await asyncio.sleep(0.05)
+
+    async def start_streaming(self):
+        """Start audio streaming."""
+        if self.is_streaming:
+            return
+
+        # Set streaming flag BEFORE starting stream
+        self.is_streaming = True
+
+        await time_it_async(
+            "send_audio_content_start",
+            lambda: self.stream_manager.send_audio_content_start_event()
+        )
+
+        print("🎤 Streaming started. Speak into microphone...")
+
+        if not self.input_stream.is_active():
+            self.input_stream.start_stream()
+
+        self.output_task = asyncio.create_task(self.play_output_audio())
+
+        # Wait for stop or agent switch
+        while self.is_streaming:
+            if self.stream_manager.switch_requested:
+                print("🔄 Agent switch detected")
+                # Leave is_streaming set here so stop_streaming() below does not
+                # return early and actually closes the PyAudio streams.
+                break
+            await asyncio.sleep(0.1)
+
+        await self.stop_streaming()
+
+    async def stop_streaming(self):
+        """Stop audio streaming."""
+        if not self.is_streaming:
+            return
+
+        self.is_streaming = False
+
+        # Cancel tasks
+        if hasattr(self, 'output_task') and not self.output_task.done():
+            self.output_task.cancel()
+            await asyncio.gather(self.output_task, return_exceptions=True)
+
+        # Close streams
+        if self.input_stream:
+            if self.input_stream.is_active():
+                self.input_stream.stop_stream()
+            self.input_stream.close()
+
+        if self.output_stream:
+            if self.output_stream.is_active():
+                self.output_stream.stop_stream()
+            self.output_stream.close()
+
+        if self.p:
+            self.p.terminate()
+
+        await self.stream_manager.close()
diff --git a/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/src/core/__init__.py b/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/src/core/__init__.py
new file mode 100644
index 00000000..33e3e3b3
--- /dev/null
+++ b/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/src/core/__init__.py
@@ -0,0 +1 @@
+"""Core streaming and event handling."""
diff --git a/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/src/core/config.py b/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/src/core/config.py
new file mode 100644
index 00000000..2226f3d7
--- /dev/null
+++ b/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/src/core/config.py
@@ -0,0
+1,19 @@ +"""Configuration constants for Nova 2 Sonic application.""" + +# Audio Configuration +INPUT_SAMPLE_RATE = 16000 +OUTPUT_SAMPLE_RATE = 24000 +CHANNELS = 1 +CHUNK_SIZE = 1024 + +# AWS Configuration +DEFAULT_MODEL_ID = 'amazon.nova-2-sonic-v1:0' +DEFAULT_REGION = 'us-east-1' + +# Model Configuration +MAX_TOKENS = 1024 +TOP_P = 0.0 +TEMPERATURE = 0.0 + +# Debug +DEBUG = False diff --git a/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/src/core/event_templates.py b/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/src/core/event_templates.py new file mode 100644 index 00000000..49e886ac --- /dev/null +++ b/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/src/core/event_templates.py @@ -0,0 +1,247 @@ +"""Event templates for Bedrock streaming.""" +import json +from typing import Dict, Any, List +from src.core.config import MAX_TOKENS, TOP_P, TEMPERATURE, INPUT_SAMPLE_RATE, OUTPUT_SAMPLE_RATE + + +class EventTemplates: + """Bedrock event template generator.""" + + @staticmethod + def start_session() -> str: + """Create session start event.""" + return json.dumps({ + "event": { + "sessionStart": { + "inferenceConfiguration": { + "maxTokens": MAX_TOKENS, + "topP": TOP_P, + "temperature": TEMPERATURE + } + } + } + }) + + @staticmethod + def content_start(prompt_name: str, content_name: str, role: str = "USER") -> str: + """Create audio content start event.""" + return json.dumps({ + "event": { + "contentStart": { + "promptName": prompt_name, + "contentName": content_name, + "type": "AUDIO", + "interactive": True, + "role": role, + "audioInputConfiguration": { + "mediaType": "audio/lpcm", + "sampleRateHertz": INPUT_SAMPLE_RATE, + "sampleSizeBits": 16, + "channelCount": 1, + "audioType": "SPEECH", + "encoding": "base64" + } + } + } + }) + + @staticmethod + def audio_input(prompt_name: str, content_name: str, audio_base64: str) -> str: + """Create audio input event.""" + return json.dumps({ + "event": { + "audioInput": { + "promptName": prompt_name, + "contentName": content_name, + "content": audio_base64 + } + } + }) + + @staticmethod + def text_content_start(prompt_name: str, content_name: str, role: str, interactive: bool = False) -> str: + """Create text content start event.""" + return json.dumps({ + "event": { + "contentStart": { + "promptName": prompt_name, + "contentName": content_name, + "type": "TEXT", + "role": role, + "interactive": interactive, + "textInputConfiguration": { + "mediaType": "text/plain" + } + } + } + }) + + @staticmethod + def text_input(prompt_name: str, content_name: str, content: str) -> str: + """Create text input event.""" + return json.dumps({ + "event": { + "textInput": { + "promptName": prompt_name, + "contentName": content_name, + "content": content + } + } + }) + + @staticmethod + def tool_content_start(prompt_name: str, content_name: str, tool_use_id: str) -> str: + """Create tool content start event.""" + return json.dumps({ + "event": { + "contentStart": { + "promptName": prompt_name, + "contentName": content_name, + "interactive": False, + "type": "TOOL", + "role": "TOOL", + "toolResultInputConfiguration": { + "toolUseId": tool_use_id, + "type": "TEXT", + "textInputConfiguration": { + "mediaType": "text/plain" + } + } + } + } + }) + + @staticmethod + def tool_result(prompt_name: str, content_name: str, content: Any) -> str: + """Create tool result event.""" + content_str = json.dumps(content) if isinstance(content, dict) else str(content) + return json.dumps({ + 
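+            # The toolResult payload must be plain text, so dict results were
+            # JSON-encoded into content_str above.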
"event": { + "toolResult": { + "promptName": prompt_name, + "contentName": content_name, + "content": content_str + } + } + }) + + @staticmethod + def content_end(prompt_name: str, content_name: str) -> str: + """Create content end event.""" + return json.dumps({ + "event": { + "contentEnd": { + "promptName": prompt_name, + "contentName": content_name + } + } + }) + + @staticmethod + def prompt_end(prompt_name: str) -> str: + """Create prompt end event.""" + return json.dumps({ + "event": { + "promptEnd": { + "promptName": prompt_name + } + } + }) + + @staticmethod + def session_end() -> str: + """Create session end event.""" + return json.dumps({ + "event": { + "sessionEnd": {} + } + }) + + @staticmethod + def prompt_start(prompt_name: str, voice_id: str, active_agent: str, tools: List[Dict[str, Any]]) -> str: + """Create prompt start event with tool configuration.""" + agent_tools = { + "support": { + "name": "open_ticket_tool", + "description": "Create a support ticket for customer issues", + "inputSchema": { + "json": json.dumps({ + "type": "object", + "properties": { + "issue_description": {"type": "string", "description": "Description of the customer's issue"}, + "customer_name": {"type": "string", "description": "Name of the customer"} + }, + "required": ["issue_description", "customer_name"] + }) + } + }, + "sales": { + "name": "order_computers_tool", + "description": "Place an order for computers", + "inputSchema": { + "json": json.dumps({ + "type": "object", + "properties": { + "computer_type": {"type": "string", "description": "Type of computer", "enum": ["laptop", "desktop"]}, + "customer_name": {"type": "string", "description": "Name of the customer"} + }, + "required": ["computer_type", "customer_name"] + }) + } + }, + "tracking": { + "name": "check_order_location_tool", + "description": "Check order location and status", + "inputSchema": { + "json": json.dumps({ + "type": "object", + "properties": { + "order_id": {"type": "string", "description": "Order ID to check"}, + "customer_name": {"type": "string", "description": "Name of the customer"} + }, + "required": ["order_id", "customer_name"] + }) + } + } + } + + tool_list = [ + { + "toolSpec": { + "name": "switch_agent", + "description": "CRITICAL: Invoke this function IMMEDIATELY when user requests to switch personas, speak with another department, or needs a different type of assistance. This transfers the conversation to a specialized agent with appropriate tools and expertise. Available agents: 'support' (technical issues, complaints, problems - creates support tickets), 'sales' (purchasing, pricing, product info - processes orders), 'tracking' (order status, delivery updates - checks shipment location). Example inputs - Sales requests: 'Can I buy a computer?', 'How much does a laptop cost?', 'I want to purchase a desktop', 'What products do you sell?', 'I'd like to place an order'. Support requests: 'I have issues with my wifi', 'My computer won't turn on', 'I need help with a problem', 'Something is broken', 'I want to file a complaint'. Tracking requests: 'What's my order status?', 'Where is my delivery?', 'When will my order arrive?', 'Can you track my package?', 'Has my order shipped yet?'. 
Direct transfer requests: 'Let me speak with sales', 'Transfer me to support', 'I need to talk to tracking'.", + "inputSchema": { + "json": json.dumps({ + "type": "object", + "properties": { + "role": {"type": "string", "enum": ["support", "sales", "tracking"], "default": "support"} + }, + "required": ["role"] + }) + } + } + } + ] + + if active_agent in agent_tools: + tool_list.append({"toolSpec": agent_tools[active_agent]}) + + return json.dumps({ + "event": { + "promptStart": { + "promptName": prompt_name, + "textOutputConfiguration": {"mediaType": "text/plain"}, + "audioOutputConfiguration": { + "mediaType": "audio/lpcm", + "sampleRateHertz": OUTPUT_SAMPLE_RATE, + "sampleSizeBits": 16, + "channelCount": 1, + "voiceId": voice_id, + "encoding": "base64", + "audioType": "SPEECH" + }, + "toolUseOutputConfiguration": {"mediaType": "application/json"}, + "toolConfiguration": {"tools": tool_list} + } + } + }) diff --git a/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/src/core/stream_manager.py b/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/src/core/stream_manager.py new file mode 100644 index 00000000..f4b5d200 --- /dev/null +++ b/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/src/core/stream_manager.py @@ -0,0 +1,404 @@ +"""Bedrock streaming manager for bidirectional communication.""" +import asyncio +import base64 +import json +import uuid +from typing import List, Dict, Any, Optional + +from aws_sdk_bedrock_runtime.client import BedrockRuntimeClient, InvokeModelWithBidirectionalStreamOperationInput +from aws_sdk_bedrock_runtime.models import InvokeModelWithBidirectionalStreamInputChunk, BidirectionalInputPayloadPart +from aws_sdk_bedrock_runtime.config import Config +from smithy_aws_core.identity.environment import EnvironmentCredentialsResolver + +from src.core.config import MAX_TOKENS, TOP_P, TEMPERATURE +from src.core.utils import debug_print, time_it_async +from src.core.event_templates import EventTemplates +from src.core.tool_processor import ToolProcessor + + +class BedrockStreamManager: + """Manages bidirectional streaming with AWS Bedrock.""" + + def __init__( + self, + model_id: str, + region: str, + voice_id: str = 'matthew', + system_prompt: Optional[str] = None, + conversation_history: Optional[List[Dict[str, str]]] = None, + active_agent: str = 'support' + ): + self.model_id = model_id + self.region = region + self.voice_id = voice_id + self.system_prompt = system_prompt + self.conversation_history = conversation_history or [] + self.active_agent = active_agent + + # Queues + self.audio_input_queue = asyncio.Queue() + self.audio_output_queue = asyncio.Queue() + self.output_queue = asyncio.Queue() + + # State + self.is_active = False + self.barge_in = False + self.switch_requested = False + self.new_voice = None + + # Session IDs + self.prompt_name = str(uuid.uuid4()) + self.content_name = str(uuid.uuid4()) + self.audio_content_name = str(uuid.uuid4()) + + # Tool handling + self.tool_processor = ToolProcessor() + self.pending_tool_tasks = {} + self.tool_use_content = "" + self.tool_use_id = "" + self.tool_name = "" + + # Response tracking + self.display_assistant_text = False + self.role = None + + # Client + self.bedrock_client = None + self.stream_response = None + self.response_task = None + + def _initialize_client(self): + """Initialize Bedrock client.""" + config = Config( + endpoint_uri=f"https://bedrock-runtime.{self.region}.amazonaws.com", + region=self.region, + 
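+            # Credentials are resolved from the process environment
+            # (AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY), matching the README setup.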
aws_credentials_identity_resolver=EnvironmentCredentialsResolver(),
+        )
+        self.bedrock_client = BedrockRuntimeClient(config=config)
+
+    async def initialize_stream(self):
+        """Initialize bidirectional stream."""
+        if not self.bedrock_client:
+            self._initialize_client()
+
+        try:
+            self.stream_response = await time_it_async(
+                "invoke_model_with_bidirectional_stream",
+                lambda: self.bedrock_client.invoke_model_with_bidirectional_stream(
+                    InvokeModelWithBidirectionalStreamOperationInput(model_id=self.model_id)
+                )
+            )
+            self.is_active = True
+
+            # Send initialization sequence
+            await self._send_initialization_events()
+
+            # Start response processing
+            self.response_task = asyncio.create_task(self._process_responses())
+            asyncio.create_task(self._process_audio_input())
+
+            await asyncio.sleep(0.1)
+            debug_print("Stream initialized")
+            return self
+
+        except Exception as e:
+            self.is_active = False
+            print(f"Failed to initialize stream: {e}")
+            raise
+
+    async def _send_initialization_events(self):
+        """Send initialization event sequence."""
+        system_prompt = self.system_prompt or "You are a friend engaging in natural real-time conversation."
+
+        events = [
+            EventTemplates.start_session(),
+            EventTemplates.prompt_start(self.prompt_name, self.voice_id, self.active_agent, []),
+            EventTemplates.text_content_start(self.prompt_name, self.content_name, "SYSTEM"),
+            EventTemplates.text_input(self.prompt_name, self.content_name, system_prompt),
+            EventTemplates.content_end(self.prompt_name, self.content_name)
+        ]
+
+        for event in events:
+            await self.send_raw_event(event)
+            await asyncio.sleep(0.1)
+
+        # Send conversation history
+        if self.conversation_history:
+            print(f"📤 Sending conversation history: {len(self.conversation_history)} messages")
+            debug_print(f"Sending {len(self.conversation_history)} history messages")
+            # Skip the final captured message before replay
+            self.conversation_history = self.conversation_history[:-1]
+            # Remove assistant messages from the start so the replayed history
+            # begins with a USER turn
+            while self.conversation_history and self.conversation_history[0].get('role') == 'ASSISTANT':
+                self.conversation_history.pop(0)
+            for msg in self.conversation_history:
+                await self._send_history_message(msg)
+
+        speak_first_content_name = str(uuid.uuid4())
+        speak_first_events = [
+            EventTemplates.text_content_start(self.prompt_name, content_name=speak_first_content_name, role='USER', interactive=True),
+            EventTemplates.text_input(self.prompt_name, speak_first_content_name, 'Greet the user by name and give a SHORT explanation of your role'),
+            EventTemplates.content_end(self.prompt_name, speak_first_content_name)
+        ]
+        for event in speak_first_events:
+            await self.send_raw_event(event)
+            await asyncio.sleep(0.1)
+
+    async def _send_history_message(self, message: Dict[str, str]):
+        """Send single history message."""
+        history_content_name = str(uuid.uuid4())
+        events = [
+            EventTemplates.text_content_start(self.prompt_name, history_content_name, message["role"]),
+            EventTemplates.text_input(self.prompt_name, history_content_name, message["content"]),
+            EventTemplates.content_end(self.prompt_name, history_content_name)
+        ]
+
+        for event in events:
+            await self.send_raw_event(event)
+            await asyncio.sleep(0.1)
+
+    async def send_raw_event(self, event_json: str):
+        """Send raw event to Bedrock."""
+        if not self.stream_response or not self.is_active:
+            debug_print("Stream not active")
+            return
+
+        event = InvokeModelWithBidirectionalStreamInputChunk(
+            value=BidirectionalInputPayloadPart(bytes_=event_json.encode('utf-8'))
+        )
+
+        try:
+            await
self.stream_response.input_stream.send(event) + if len(event_json) > 200: + event_type = list(json.loads(event_json).get("event", {}).keys()) + debug_print(f"Sent event: {event_type}") + else: + debug_print(f"Sent: {event_json}") + except Exception as e: + debug_print(f"Error sending event: {e}") + + async def _process_audio_input(self): + """Process audio input queue.""" + while self.is_active: + try: + data = await self.audio_input_queue.get() + audio_bytes = data.get('audio_bytes') + if not audio_bytes: + continue + + blob = base64.b64encode(audio_bytes).decode('utf-8') + event = EventTemplates.audio_input(self.prompt_name, self.audio_content_name, blob) + await self.send_raw_event(event) + + except asyncio.CancelledError: + break + except Exception as e: + debug_print(f"Error processing audio: {e}") + + def add_audio_chunk(self, audio_bytes: bytes): + """Add audio chunk to queue.""" + self.audio_input_queue.put_nowait({ + 'audio_bytes': audio_bytes, + 'prompt_name': self.prompt_name, + 'content_name': self.audio_content_name + }) + + async def send_audio_content_start_event(self): + """Send audio content start.""" + event = EventTemplates.content_start(self.prompt_name, self.audio_content_name) + await self.send_raw_event(event) + + async def send_audio_content_end_event(self): + """Send audio content end.""" + if self.is_active: + event = EventTemplates.content_end(self.prompt_name, self.audio_content_name) + await self.send_raw_event(event) + debug_print("Audio ended") + + async def _process_responses(self): + """Process incoming Bedrock responses.""" + try: + while self.is_active and not self.switch_requested: + try: + output = await self.stream_response.await_output() + result = await output[1].receive() + + if result.value and result.value.bytes_: + await self._handle_response(result.value.bytes_.decode('utf-8')) + + except StopAsyncIteration: + break + except Exception as e: + if "InvalidStateError" in str(e) or "CANCELLED" in str(e): + debug_print("Stream cancelled") + break + elif "ValidationException" in str(e): + print(f"Validation error: {e}") + break + else: + print(f"Error receiving response: {e}") + break + except Exception as e: + print(f"Response processing error: {e}") + finally: + self.is_active = False + + async def _handle_response(self, response_data: str): + """Handle single response.""" + try: + json_data = json.loads(response_data) + + if 'event' not in json_data: + await self.output_queue.put({"raw_data": response_data}) + return + + event = json_data['event'] + + if 'completionStart' in event: + debug_print(f"Completion start: {event}") + elif 'contentStart' in event: + self._handle_content_start(event['contentStart']) + elif 'textOutput' in event: + self._handle_text_output(event['textOutput']) + elif 'audioOutput' in event: + await self._handle_audio_output(event['audioOutput']) + elif 'toolUse' in event: + await self._handle_tool_use(event['toolUse']) + elif 'contentEnd' in event: + await self._handle_content_end(event['contentEnd']) + elif 'completionEnd' in event: + debug_print("Completion end") + elif 'usageEvent' in event: + debug_print(f"Usage: {event}") + + await self.output_queue.put(json_data) + + except json.JSONDecodeError: + await self.output_queue.put({"raw_data": response_data}) + + def _handle_content_start(self, content_start: Dict[str, Any]): + """Handle content start event.""" + debug_print("Content start") + self.role = content_start['role'] + + if 'additionalModelFields' in content_start: + try: + fields = 
json.loads(content_start['additionalModelFields']) + self.display_assistant_text = fields.get('generationStage') == 'FINAL' + except json.JSONDecodeError: + debug_print("Error parsing additionalModelFields") + + def _handle_text_output(self, text_output: Dict[str, Any]): + """Handle text output event.""" + content = text_output['content'] + role = text_output['role'] + + if '{ "interrupted" : true }' in content: + debug_print("Barge-in detected") + self.barge_in = True + + if (self.role == "ASSISTANT" and self.display_assistant_text) or self.role == "USER": + self.conversation_history.append({"role": role, "content": content}) + if (self.role == "ASSISTANT" and not self.display_assistant_text) or self.role == "USER": + print(f"{role.title()}: {content}") + + async def _handle_audio_output(self, audio_output: Dict[str, Any]): + """Handle audio output event.""" + audio_bytes = base64.b64decode(audio_output['content']) + await self.audio_output_queue.put(audio_bytes) + + async def _handle_tool_use(self, tool_use: Dict[str, Any]): + """Handle tool use event.""" + self.tool_use_content = tool_use + self.tool_name = tool_use['toolName'] + self.tool_use_id = tool_use['toolUseId'] + + if self.tool_name == 'switch_agent': + content_data = json.loads(tool_use['content']) + self.new_voice = content_data.get("role", "support").lower() + await asyncio.sleep(0.1) + self.switch_requested = True + print(f"๐ŸŽฏ Switching to: {self.new_voice}") + else: + print(f"๐ŸŽฏ Tool use: {self.tool_name}") + debug_print(f"Tool: {self.tool_name}, ID: {self.tool_use_id}") + + async def _handle_content_end(self, content_end: Dict[str, Any]): + """Handle content end event.""" + if content_end.get('type') == 'TOOL': + debug_print("Processing tool") + self._handle_tool_request(self.tool_name, self.tool_use_content, self.tool_use_id) + else: + debug_print("Content end") + + def _handle_tool_request(self, tool_name: str, tool_content: Dict[str, Any], tool_use_id: str): + """Handle tool request asynchronously.""" + content_name = str(uuid.uuid4()) + task = asyncio.create_task( + self._execute_tool_and_send_result(tool_name, tool_content, tool_use_id, content_name) + ) + self.pending_tool_tasks[content_name] = task + task.add_done_callback(lambda t: self._handle_tool_completion(t, content_name)) + + def _handle_tool_completion(self, task, content_name: str): + """Handle tool task completion.""" + self.pending_tool_tasks.pop(content_name, None) + if task.done() and not task.cancelled(): + exception = task.exception() + if exception: + debug_print(f"Tool task failed: {exception}") + + async def _execute_tool_and_send_result( + self, + tool_name: str, + tool_content: Dict[str, Any], + tool_use_id: str, + content_name: str + ): + """Execute tool and send result.""" + try: + debug_print(f"Executing tool: {tool_name}") + result = await self.tool_processor.process_tool_async(tool_name, tool_content) + + await self.send_raw_event(EventTemplates.tool_content_start(self.prompt_name, content_name, tool_use_id)) + await self.send_raw_event(EventTemplates.tool_result(self.prompt_name, content_name, result)) + await self.send_raw_event(EventTemplates.content_end(self.prompt_name, content_name)) + + debug_print(f"Tool complete: {tool_name}") + except Exception as e: + debug_print(f"Tool error: {e}") + try: + error_result = {"error": f"Tool failed: {e}"} + await self.send_raw_event(EventTemplates.tool_content_start(self.prompt_name, content_name, tool_use_id)) + await self.send_raw_event(EventTemplates.tool_result(self.prompt_name, 
content_name, error_result))
+                await self.send_raw_event(EventTemplates.content_end(self.prompt_name, content_name))
+            except Exception as send_error:
+                debug_print(f"Failed to send error: {send_error}")
+
+    async def close(self):
+        """Close stream and cleanup."""
+        if not self.is_active:
+            return
+
+        debug_print("Closing stream")
+        self.is_active = False
+
+        for task in self.pending_tool_tasks.values():
+            task.cancel()
+
+        if self.response_task and not self.response_task.done():
+            self.response_task.cancel()
+
+        try:
+            await self.send_audio_content_end_event()
+            await self.send_raw_event(EventTemplates.prompt_end(self.prompt_name))
+            await self.send_raw_event(EventTemplates.session_end())
+        except Exception as e:
+            debug_print(f"Error during close: {e}")
+
+        if self.stream_response:
+            try:
+                await self.stream_response.input_stream.close()
+            except Exception as e:
+                debug_print(f"Error closing input stream: {e}")
+
+        debug_print("Stream closed")
diff --git a/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/src/core/tool_processor.py b/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/src/core/tool_processor.py
new file mode 100644
index 00000000..e096557e
--- /dev/null
+++ b/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/src/core/tool_processor.py
@@ -0,0 +1,45 @@
+"""Tool processing for async execution."""
+import asyncio
+import json
+import uuid
+from typing import Dict, Any
+from src.core.utils import debug_print
+from src.agents.tools import open_ticket_tool, order_computers_tool, check_order_location_tool
+
+
+class ToolProcessor:
+    """Handles asynchronous tool execution."""
+
+    def __init__(self):
+        self.tasks = {}
+        self._tool_map = {
+            'open_ticket_tool': open_ticket_tool,
+            'order_computers_tool': order_computers_tool,
+            'check_order_location_tool': check_order_location_tool
+        }
+
+    async def process_tool_async(self, tool_name: str, tool_content: Dict[str, Any]) -> Dict[str, Any]:
+        """Process tool call asynchronously."""
+        task_id = str(uuid.uuid4())
+        task = asyncio.create_task(self._run_tool(tool_name, tool_content))
+        self.tasks[task_id] = task
+
+        try:
+            return await task
+        finally:
+            self.tasks.pop(task_id, None)
+
+    async def _run_tool(self, tool_name: str, tool_content: Dict[str, Any]) -> Dict[str, Any]:
+        """Execute tool logic."""
+        debug_print(f"Processing tool: {tool_name}")
+
+        tool_func = self._tool_map.get(tool_name.lower())
+        if not tool_func:
+            return {"error": f"Unknown tool: {tool_name}"}
+
+        try:
+            content = tool_content.get("content", {})
+            params = json.loads(content) if isinstance(content, str) else content
+            return await tool_func(**params)
+        except Exception as e:
+            return {"error": f"Tool execution failed: {str(e)}"}
diff --git a/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/src/core/utils.py b/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/src/core/utils.py
new file mode 100644
index 00000000..f678e727
--- /dev/null
+++ b/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/src/core/utils.py
@@ -0,0 +1,39 @@
+"""Utility functions for logging and performance monitoring."""
+import datetime
+import time
+import inspect
+from src.core import config
+
+
+def debug_print(message: str) -> None:
+    """Print debug message with timestamp and function name."""
+    # Read DEBUG through the config module so the runtime override in main.py
+    # (config.DEBUG = debug) takes effect; a from-import would capture a stale
+    # value at import time.
+    if not config.DEBUG:
+        return
+
+    stack = inspect.stack()
+    func_name = stack[1].function
+
+    # Skip wrapper functions
+    if func_name in ('time_it', 'time_it_async'):
+        func_name = stack[2].function
+
+    timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
+    print(f"{timestamp} {func_name} {message}")
+
+
+def time_it(label: str, func):
+    """Time synchronous function execution."""
+    start = time.perf_counter()
+    result = func()
+    elapsed = time.perf_counter() - start
+    debug_print(f"Execution time for {label}: {elapsed:.4f}s")
+    return result
+
+
+async def time_it_async(label: str, func):
+    """Time asynchronous function execution."""
+    start = time.perf_counter()
+    result = await func()
+    elapsed = time.perf_counter() - start
+    debug_print(f"Execution time for {label}: {elapsed:.4f}s")
+    return result
diff --git a/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/src/multi_agent.py b/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/src/multi_agent.py
new file mode 100644
index 00000000..70dc6c7c
--- /dev/null
+++ b/speech-to-speech/amazon-nova-2-sonic/repeatable-patterns/conversation-transfer/src/multi_agent.py
@@ -0,0 +1,107 @@
+"""Multi-agent orchestrator for Nova 2 Sonic conversations."""
+import asyncio
+import os
+import pygame
+from typing import List, Dict
+from src.core.stream_manager import BedrockStreamManager
+from src.audio.audio_streamer import AudioStreamer
+from src.agents.agent_config import AGENTS
+
+
+class MultiAgentSonic:
+    """Orchestrates multi-agent voice conversations."""
+
+    def __init__(self, model_id: str, region: str, debug: bool = False):
+        self.model_id = model_id
+        self.region = region
+        self.debug = debug
+        self.active_agent = "support"
+        self.conversation_history: List[Dict[str, str]] = []
+        self.agents = AGENTS
+        self.stream_manager = None
+        self.audio_streamer = None
+
+    async def start_conversation(self):
+        """Start voice conversation with agent switching."""
+        while True:
+            try:
+                agent_config = self.agents.get(self.active_agent, self.agents["support"])
+                print(f"🎤 Starting conversation with {self.active_agent.title()}...")
+
+                await asyncio.sleep(2)
+
+                # Create components
+                self.stream_manager = BedrockStreamManager(
+                    model_id=self.model_id,
+                    region=self.region,
+                    voice_id=agent_config.voice_id,
+                    system_prompt=agent_config.instruction,
+                    conversation_history=self.conversation_history,
+                    active_agent=self.active_agent
+                )
+
+                self.audio_streamer = AudioStreamer(self.stream_manager)
+
+                # Initialize and start
+                await self.stream_manager.initialize_stream()
+
+                # Stop transition music
+                self._stop_music()
+
+                # Start conversation
+                await self.audio_streamer.start_streaming()
+
+                # Check for agent switch
+                if self.stream_manager.switch_requested:
+                    self.conversation_history = self.stream_manager.conversation_history
+                    new_agent = self.stream_manager.new_voice
+                    print(f"🔄 Switching: {self.active_agent} → {new_agent}")
+
+                    # Play transition music
+                    self._play_music()
+
+                    # Close connection
+                    await self.stream_manager.close()
+
+                    self.active_agent = new_agent
+                    await self.cleanup()
+                    continue
+                else:
+                    print("👋 Conversation ended")
+                    break
+
+            except KeyboardInterrupt:
+                print("\n👋 Interrupted by user")
+                break
+            except Exception as e:
+                print(f"Error: {e}")
+                if self.debug:
+                    import traceback
+                    traceback.print_exc()
+                break
+
+    def _play_music(self):
+        """Play transition music."""
+        try:
+            pygame.mixer.init()
+            music_path = os.path.join(os.path.dirname(__file__), "..", "music.mp3")
+            if os.path.exists(music_path):
+                pygame.mixer.music.load(music_path)
+                pygame.mixer.music.play(-1)
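+                # play(-1) loops the track until _stop_music() is called once
+                # the next agent's stream is initialized.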
print("๐ŸŽต Playing transition music") + except Exception as e: + print(f"Could not play music: {e}") + + def _stop_music(self): + """Stop transition music.""" + try: + pygame.mixer.music.stop() + print("๐ŸŽต Stopped transition music") + except: + pass + + async def cleanup(self): + """Clean up resources.""" + print("๐Ÿงน Cleaning up...") + if self.audio_streamer: + await self.audio_streamer.stop_streaming()