diff --git a/nova-omni/getting-started/00_setup.ipynb b/nova-omni/getting-started/00_setup.ipynb new file mode 100644 index 00000000..11cbc2a5 --- /dev/null +++ b/nova-omni/getting-started/00_setup.ipynb @@ -0,0 +1,273 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "setup-title", + "metadata": {}, + "source": [ + "# Amazon Nova 2 Omni - Setup and Configuration\n", + "\n", + "This notebook helps you set up your environment for working with Amazon Nova 2 Omni model.\n", + "\n", + "## What is Amazon Nova 2 Omni?\n", + "\n", + "Amazon Nova 2 Omni is a multimodal foundation model that can understand and generate content across text, images, and audio. Key capabilities include:\n", + "\n", + "- **Speech Understanding**: Transcribe, summarize, analyze, and answer questions about audio content\n", + "- **Image Generation**: Create high-quality images from text descriptions\n", + "- **Multimodal Reasoning**: Process and understand multiple input modalities simultaneously\n", + "\n", + "**Supported Audio Formats:** mp3, opus, wav, aac, flac, mp4, ogg, mkv\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "id": "prerequisites", + "metadata": {}, + "source": [ + "## Prerequisites Check\n", + "\n", + "Let's verify that your environment is properly configured." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "check-python", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "print(f\"Python version: {sys.version}\")\n", + "\n", + "# Check Python version\n", + "if sys.version_info >= (3, 12):\n", + " print(\"✅ Python 3.12+ is installed\")\n", + "else:\n", + " print(\"❌ Python 3.12+ is required. Please upgrade your Python version.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "install-dependencies", + "metadata": {}, + "outputs": [], + "source": [ + "# Install required boto3/botocore versions\n", + "!pip install boto3==1.42.4 botocore==1.42.4 --force-reinstall --no-cache-dir -q" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "check-dependencies", + "metadata": {}, + "outputs": [], + "source": [ + "# Verify installed versions\n", + "import boto3\n", + "import botocore\n", + "\n", + "print(f\"boto3 version: {boto3.__version__}\")\n", + "print(f\"botocore version: {botocore.__version__}\")\n", + "\n", + "if boto3.__version__ == '1.42.4' and botocore.__version__ == '1.42.4':\n", + " print(\"✅ Correct boto3/botocore versions installed\")\n", + "else:\n", + " print(\"⚠️ Version mismatch detected\")" + ] + }, + { + "cell_type": "markdown", + "id": "aws-setup", + "metadata": {}, + "source": [ + "## AWS Configuration\n", + "\n", + "Let's verify your AWS credentials and region configuration." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "check-aws-config", + "metadata": {}, + "outputs": [], + "source": [ + "import boto3\n", + "from botocore.exceptions import NoCredentialsError, ClientError\n", + "\n", + "try:\n", + " # Check AWS credentials\n", + " session = boto3.Session()\n", + " credentials = session.get_credentials()\n", + " \n", + " if credentials:\n", + " print(\"✅ AWS credentials are configured\")\n", + " print(f\"Region: {session.region_name or 'Not set (will use us-east-1)'}\")\n", + " else:\n", + " print(\"❌ AWS credentials not found. 
Please configure your AWS CLI or set environment variables.\")\n", + " \n", + "except Exception as e:\n", + " print(f\"❌ Error checking AWS configuration: {e}\")" + ] + }, + { + "cell_type": "markdown", + "id": "bedrock-setup", + "metadata": {}, + "source": [ + "## Amazon Bedrock Setup\n", + "\n", + "Let's test the connection to Amazon Bedrock and verify model access." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "test-bedrock-connection", + "metadata": {}, + "outputs": [], + "source": [ + "from botocore.config import Config\n", + "\n", + "MODEL_ID = \"us.amazon.nova-2-omni-v1:0\"\n", + "REGION_ID = \"us-west-2\"\n", + "\n", + "def test_bedrock_connection():\n", + " \"\"\"Test connection to Amazon Bedrock\"\"\"\n", + " try:\n", + " config = Config(\n", + " read_timeout=2 * 60,\n", + " )\n", + " bedrock = boto3.client(\n", + " service_name=\"bedrock-runtime\",\n", + " region_name=REGION_ID,\n", + " config=config,\n", + " )\n", + " \n", + " # Test with a simple text-only request\n", + " response = bedrock.converse(\n", + " modelId=MODEL_ID,\n", + " messages=[\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": [{\"text\": \"Hello, can you respond with just 'Hello back!'?\"}],\n", + " }\n", + " ],\n", + " inferenceConfig={\"maxTokens\": 50},\n", + " )\n", + " \n", + " print(\"✅ Successfully connected to Amazon Bedrock\")\n", + " print(\"✅ Nova 2 Omni model is accessible\")\n", + " print(f\"Test response: {response['output']['message']['content'][0]['text']}\")\n", + " return True\n", + " \n", + " except ClientError as e:\n", + " error_code = e.response['Error']['Code']\n", + " if error_code == 'AccessDeniedException':\n", + " print(\"❌ Access denied. Please check your IAM permissions include 'bedrock:InvokeModel'\")\n", + " elif error_code == 'ValidationException':\n", + " print(\"❌ Model not found. Please verify the model ID is correct.\")\n", + " else:\n", + " print(f\"❌ Bedrock error: {e}\")\n", + " return False\n", + " \n", + " except Exception as e:\n", + " print(f\"❌ Connection error: {e}\")\n", + " return False\n", + "\n", + "# Test the connection\n", + "connection_success = test_bedrock_connection()" + ] + }, + { + "cell_type": "markdown", + "id": "next-steps", + "metadata": {}, + "source": [ + "## Next Steps\n", + "\n", + "If all checks passed successfully, you're ready to explore Nova 2 Omni capabilities!\n", + "\n", + "### Available Notebooks:\n", + "\n", + "1. **01_speech_understanding_examples.ipynb** - Audio processing:\n", + " - Transcribe audio with speaker diarization\n", + " - Summarize and analyze audio content\n", + " - Call analytics with structured output\n", + "\n", + "2. **02_image_generation_examples.ipynb** - Image generation:\n", + " - Text-to-image with aspect ratio control\n", + " - Image editing and style transfer\n", + " - Text in images and creative control\n", + "\n", + "3. **03_multimodal_understanding_examples.ipynb** - Multimodal analysis:\n", + " - Image and video understanding\n", + " - Video summarization and classification\n", + " - Audio content analysis\n", + "\n", + "4. **04_langchain_multimodal_reasoning.ipynb** - LangChain integration:\n", + " - Tool use with structured outputs\n", + " - Reasoning effort configuration\n", + " - MMMU-style evaluation patterns\n", + "\n", + "5. **05_langgraph_multimodal_reasoning.ipynb** - LangGraph workflows:\n", + " - Stateful reasoning workflows\n", + " - Multi-step reasoning chains\n", + " - Conditional routing with tools\n", + "\n", + "6. 
**06_strands_multimodal_reasoning.ipynb** - Multi-agent systems:\n", + " - Specialized agents for different modalities\n", + " - Agent orchestration and coordination\n", + " - Collaborative reasoning patterns\n", + "\n", + "7. **07_document_understanding_examples.ipynb** - Document processing:\n", + " - OCR and text extraction\n", + " - Key information extraction with JSON\n", + " - Object detection and counting\n", + "\n", + "### Tips for Success:\n", + "\n", + "- Start with the speech understanding examples if you're interested in audio processing\n", + "- The model supports various audio formats: mp3, opus, wav, aac, flac, mp4, ogg, mkv\n", + "- For best results with transcription, use temperature=0.0\n", + "- For creative tasks, experiment with different temperature values (0.1-0.9)\n", + "\n", + "Happy exploring! 🚀" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "99b8b917-12b0-4a43-bdbd-a59778e03930", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_python3", + "language": "python", + "name": "conda_python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.19" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nova-omni/getting-started/01-speech_understanding_examples.ipynb b/nova-omni/getting-started/01-speech_understanding_examples.ipynb new file mode 100644 index 00000000..59239c9d --- /dev/null +++ b/nova-omni/getting-started/01-speech_understanding_examples.ipynb @@ -0,0 +1,557 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c80d6694", + "metadata": {}, + "source": [ + "# Speech Understanding Examples\n", + "\n", + "This notebook demonstrates how to use Amazon Nova 2 Omni for speech understanding tasks. Nova 2 Omni can transcribe, summarize, analyze, answer questions about, and translate speech content in audio files.\n", + "\n", + "**Supported audio formats:** mp3, opus, wav, aac, flac, mp4, ogg, mkv\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "id": "3dfbc101", + "metadata": {}, + "source": [ + "## Setup\n", + "\n", + "### Helper Functions\n", + "\n", + "Run the cell below to establish helper functions used by the examples in this notebook." 
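+ , "\n\n**Note:** The helpers below import the local `nova_utils` module that accompanies these notebooks, so run this notebook from its own directory (an assumption based on the plain `import nova_utils` in the next cell)."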
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05d27624", + "metadata": {}, + "outputs": [], + "source": [ + "import boto3\n", + "from botocore.config import Config\n", + "\n", + "import nova_utils\n", + "\n", + "MODEL_ID = \"us.amazon.nova-2-omni-v1:0\"\n", + "REGION_ID = \"us-west-2\"\n", + "\n", + "def get_bedrock_runtime():\n", + " \"\"\"Returns a properly configured Bedrock Runtime client.\"\"\"\n", + " config = Config(\n", + " read_timeout=2 * 60,\n", + " )\n", + " bedrock = boto3.client(\n", + " service_name=\"bedrock-runtime\",\n", + " region_name=REGION_ID,\n", + " config=config,\n", + " )\n", + " return bedrock\n", + "\n", + "\n", + "def speech_to_text(\n", + " audio_path,\n", + " audio_type,\n", + " text_prompt,\n", + " temperature=None,\n", + " max_tokens=10000,\n", + " reasoning_effort=None,\n", + "):\n", + " \"\"\"\n", + " Generates a text output from a text prompt and a single input audio.\n", + "\n", + " Args:\n", + " audio_path: The path to the input audio.\n", + " audio_type: Type of the audio file (mp3, opus, wav, aac, flac, mp4, ogg, mkv)\n", + " text_prompt: The text prompt to use for speech understanding.\n", + " temperature: Optional temperature parameter (0.0-1.0). If None, uses model default.\n", + " max_tokens: Maximum number of tokens to generate (default: 10000).\n", + " reasoning_effort: Optional reasoning effort level (\"low\", \"medium\", \"high\"). If None, reasoning is disabled.\n", + "\n", + " Returns:\n", + " (Dict) A dictionary with \"text\" and \"request_id\" keys\n", + " \"\"\"\n", + " audio_bytes = nova_utils.load_audio_as_bytes(audio_path)\n", + "\n", + " # Build inference config\n", + " inference_config = {\"maxTokens\": max_tokens}\n", + " if temperature is not None:\n", + " inference_config[\"temperature\"] = temperature\n", + "\n", + " # Build request\n", + " request = {\n", + " \"modelId\": MODEL_ID,\n", + " \"messages\": [\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\"audio\": {\"format\": audio_type, \"source\": {\"bytes\": audio_bytes}}},\n", + " {\"text\": text_prompt},\n", + " ],\n", + " }\n", + " ],\n", + " \"inferenceConfig\": inference_config,\n", + " }\n", + "\n", + " # Add reasoning config if specified\n", + " if reasoning_effort is not None:\n", + " if reasoning_effort.lower() not in [\"low\", \"medium\", \"high\"]:\n", + " raise ValueError(\"reasoning_effort must be 'low', 'medium', or 'high'\")\n", + "\n", + " request[\"additionalModelRequestFields\"] = {\n", + " \"reasoningConfig\": {\n", + " \"type\": \"enabled\",\n", + " \"maxReasoningEffort\": reasoning_effort.lower(),\n", + " }\n", + " }\n", + "\n", + " bedrock_runtime = get_bedrock_runtime()\n", + "\n", + " response = bedrock_runtime.converse(**request)\n", + " import json\n", + "\n", + " return {\n", + " \"text\": nova_utils.extract_response_text(response),\n", + " \"request_id\": response.get(\"ResponseMetadata\", {}).get(\"RequestId\", \"N/A\"),\n", + " }" + ] + }, + { + "cell_type": "markdown", + "id": "97f5a496", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Use Case 1: Transcribing Speech from Audio Files\n", + "\n", + "Nova 2 Omni can transcribe speech content in audio files and can provide annotations indicating who is speaking, known as diarization.\n", + "\n", + "**Recommended inference parameters:**\n", + "* `temperature`: 0 (greedy decoding)\n", + "* Reasoning should not be used\n", + "\n", + "---\n", + "\n", + "### Example 1a: Speech Transcription (Without Diarization)\n", + "\n", + "**Recommended 
prompt template:**\n", + "```\n", + "Transcribe the audio.\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b36fa9bf", + "metadata": {}, + "outputs": [], + "source": [ + "audio_path = \"media/call_1763087723216.wav\"\n", + "audio_type = \"wav\"\n", + "temperature = 0.0\n", + "\n", + "user_prompt = \"Transcribe the audio.\"\n", + "\n", + "try:\n", + " result = speech_to_text(audio_path, audio_type, user_prompt, temperature)\n", + "\n", + " if result[\"text\"]:\n", + " print(f\"Request ID: {result['request_id']}\\n\")\n", + " print(\"== Transcription Output ==\\n\")\n", + " print(result[\"text\"])\n", + "\n", + " # Store for later use in Q&A examples\n", + " transcription = result[\"text\"]\n", + "\n", + "except Exception as e:\n", + " print(f\"Error occurred: {e}\")" + ] + }, + { + "cell_type": "markdown", + "id": "y84ug4xk7xg", + "metadata": {}, + "source": [ + "---\n", + "\n", + "### Example 1b: Speech Transcription (With Diarization)\n", + "\n", + "**Recommended prompt template:**\n", + "```\n", + "For each speaker turn segment, transcribe, assign a speaker label, start and end timestamps. \n", + "You must follow the exact XML format shown in the example below: \n", + "'transcription_text\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "g8p0osrp35q", + "metadata": {}, + "outputs": [], + "source": [ + "audio_path = \"media/call_1763087723216.wav\"\n", + "audio_type = \"wav\"\n", + "temperature = 0.0\n", + "\n", + "user_prompt = \"\"\"For each speaker turn segment, transcribe, assign a speaker label, start and end timestamps. You must follow the exact XML format shown in the example below: 'transcription_text'\"\"\"\n", + "\n", + "try:\n", + " result = speech_to_text(audio_path, audio_type, user_prompt, temperature)\n", + "\n", + " if result[\"text\"]:\n", + " print(f\"Request ID: {result['request_id']}\\n\")\n", + " print(\"== Transcription with Diarization Output ==\\n\")\n", + " print(result[\"text\"])\n", + "\n", + "except Exception as e:\n", + " print(f\"Error occurred: {e}\")" + ] + }, + { + "cell_type": "markdown", + "id": "2byv2ph01dq", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Use Case 2: Summarizing Speech in Audio Files\n", + "\n", + "The Nova 2 Omni model is capable of understanding speech in audio files and generating concise summaries.\n", + "\n", + "**Recommended inference parameters:**\n", + "* `temperature`: text default parameters\n", + "* `topP`: text default parameters\n", + "* Some use cases may benefit from enabling model reasoning; however, we recommend starting without reasoning first\n", + "\n", + "---\n", + "\n", + "### Example 2: Summarize Audio Content\n", + "\n", + "**Recommended prompt template:**\n", + "```\n", + "Extract and summarize the essential details from the audio.\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ho2i85qkmih", + "metadata": {}, + "outputs": [], + "source": [ + "audio_path = \"media/call_1763087723216.wav\"\n", + "audio_type = \"wav\"\n", + "\n", + "user_prompt = \"Extract and summarize the essential details from the audio.\"\n", + "\n", + "try:\n", + " result = speech_to_text(audio_path, audio_type, user_prompt)\n", + "\n", + " if result[\"text\"]:\n", + " print(f\"Request ID: {result['request_id']}\\n\")\n", + " print(\"== Summary Output ==\\n\")\n", + " print(result[\"text\"])\n", + "\n", + "except Exception as e:\n", + " print(f\"Error occurred: {e}\")" + ] + }, + { + "cell_type": "markdown", + "id": 
"yryhbc7vevk", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Use Case 3: Analyzing Audio Calls\n", + "\n", + "The Nova 2 Omni model is capable of understanding speech in audio files and generating structured call analytics based on your business needs.\n", + "\n", + "**Recommended inference parameters:**\n", + "* `temperature`: text default parameters\n", + "* `topP`: text default parameters\n", + "* Some use cases may benefit from enabling model reasoning; however, we recommend starting without reasoning first\n", + "\n", + "---\n", + "\n", + "### Example 3: Call Analytics with Structured JSON Output\n", + "\n", + "**Example prompt:**\n", + "```\n", + "Analyze the call and return JSON:\n", + "{\n", + " \"call_summary\": \"Summarize the call\",\n", + " \"customer_intent\": \"What the customer wanted\",\n", + " \"resolution_status\": \"resolved/pending/escalated\",\n", + " \"key_topics\": [\"topic1\", \"topic2\"],\n", + " \"action_items\": [\n", + " {\"task\": \"description\", \"owner\": \"agent/customer\", \"priority\": \"high/medium/low\"}\n", + " ],\n", + " \"sentiment_analysis\": {\n", + " \"overall\": \"positive/neutral/negative\"\n", + " },\n", + " \"follow_up_required\": true/false\n", + "}\n", + "```\n", + "\n", + "**Note:** You can customize the JSON structure based on your specific business needs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bkuhqtgzp0j", + "metadata": {}, + "outputs": [], + "source": [ + "audio_path = \"media/call_1763087723216.wav\"\n", + "audio_type = \"wav\"\n", + "\n", + "user_prompt = \"\"\"Analyze the call and return JSON:\n", + "{\n", + " \"call_summary\": \"Summarize the call\",\n", + " \"customer_intent\": \"What the customer wanted\",\n", + " \"resolution_status\": \"resolved/pending/escalated\",\n", + " \"key_topics\": [\"topic1\", \"topic2\"],\n", + " \"action_items\": [\n", + " {\"task\": \"description\", \"owner\": \"agent/customer\", \"priority\": \"high/medium/low\"}\n", + " ],\n", + " \"sentiment_analysis\": {\n", + " \"overall\": \"positive/neutral/negative\"\n", + " },\n", + " \"follow_up_required\": true/false\n", + "}\"\"\"\n", + "\n", + "try:\n", + " result = speech_to_text(audio_path, audio_type, user_prompt)\n", + "\n", + " if result[\"text\"]:\n", + " print(f\"Request ID: {result['request_id']}\\n\")\n", + " print(\"== Call Analytics Output ==\\n\")\n", + " print(result[\"text\"])\n", + "\n", + "except Exception as e:\n", + " print(f\"Error occurred: {e}\")" + ] + }, + { + "cell_type": "markdown", + "id": "cvu5w46tjdl", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Use Case 4: Asking Questions About Audio File Content\n", + "\n", + "You can leverage Nova 2 Omni's speech understanding capabilities for question and answer use cases.\n", + "\n", + "**Recommended inference parameters:**\n", + "* `temperature`: text default parameters\n", + "* `topP`: text default parameters\n", + "* Some use cases may benefit from enabling model reasoning; however, we recommend starting without reasoning first\n", + "\n", + "**Note:** No specific prompting template is required. 
Simply ask your question naturally.\n", + "\n", + "---\n", + "\n", + "### Example 4a: Count Speakers in Audio" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3svnjv7ntq9", + "metadata": {}, + "outputs": [], + "source": [ + "audio_path = \"media/call_1763087723216.wav\"\n", + "audio_type = \"wav\"\n", + "\n", + "user_prompt = \"How many speakers are in the audio?\"\n", + "\n", + "try:\n", + " result = speech_to_text(audio_path, audio_type, user_prompt)\n", + "\n", + " if result[\"text\"]:\n", + " print(f\"Request ID: {result['request_id']}\\n\")\n", + " print(\"== Q&A Output ==\\n\")\n", + " print(f\"Question: {user_prompt}\")\n", + " print(f\"Answer: {result['text']}\")\n", + "\n", + "except Exception as e:\n", + " print(f\"Error occurred: {e}\")" + ] + }, + { + "cell_type": "markdown", + "id": "02b8g44i9yav", + "metadata": {}, + "source": [ + "### Example 4b: Analyze Emotional Tone" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ytigscnws1", + "metadata": {}, + "outputs": [], + "source": [ + "audio_path = \"media/call_1763087723216.wav\"\n", + "audio_type = \"wav\"\n", + "\n", + "user_prompt = \"What was the overall emotional tone of the speaker (e.g., frustrated, calm, excited)?\"\n", + "\n", + "try:\n", + " result = speech_to_text(audio_path, audio_type, user_prompt)\n", + "\n", + " if result[\"text\"]:\n", + " print(f\"Request ID: {result['request_id']}\\n\")\n", + " print(\"== Q&A Output ==\\n\")\n", + " print(f\"Question: {user_prompt}\")\n", + " print(f\"Answer: {result['text']}\")\n", + "\n", + "except Exception as e:\n", + " print(f\"Error occurred: {e}\")" + ] + }, + { + "cell_type": "markdown", + "id": "k2ppj1c642i", + "metadata": {}, + "source": [ + "### Example 4c: List People Mentioned" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "nc3fjru4vkh", + "metadata": {}, + "outputs": [], + "source": [ + "audio_path = \"media/call_1763087723216.wav\"\n", + "audio_type = \"wav\"\n", + "\n", + "user_prompt = \"List the people mentioned in the audio.\"\n", + "\n", + "try:\n", + " result = speech_to_text(audio_path, audio_type, user_prompt)\n", + "\n", + " if result[\"text\"]:\n", + " print(f\"Request ID: {result['request_id']}\\n\")\n", + " print(\"== Q&A Output ==\\n\")\n", + " print(f\"Question: {user_prompt}\")\n", + " print(f\"Answer: {result['text']}\")\n", + "\n", + "except Exception as e:\n", + " print(f\"Error occurred: {e}\")" + ] + }, + { + "cell_type": "markdown", + "id": "18s3r2o71c9i", + "metadata": {}, + "source": [ + "### Example 4d: Detect Background Noise" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "kkbrhrci0qi", + "metadata": {}, + "outputs": [], + "source": [ + "audio_path = \"media/call_1763087723216.wav\"\n", + "audio_type = \"wav\"\n", + "\n", + "user_prompt = \"Is there background noise in the audio?\"\n", + "\n", + "try:\n", + " result = speech_to_text(audio_path, audio_type, user_prompt)\n", + "\n", + " if result[\"text\"]:\n", + " print(f\"Request ID: {result['request_id']}\\n\")\n", + " print(\"== Q&A Output ==\\n\")\n", + " print(f\"Question: {user_prompt}\")\n", + " print(f\"Answer: {result['text']}\")\n", + "\n", + "except Exception as e:\n", + " print(f\"Error occurred: {e}\")" + ] + }, + { + "cell_type": "markdown", + "id": "8krrufcof2y", + "metadata": {}, + "source": [ + "### Example 4e: Describe Speakers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "diuabbs86uq", + "metadata": {}, + "outputs": [], + "source": [ + "audio_path 
= \"media/call_1763087723216.wav\"\n", + "audio_type = \"wav\"\n", + "\n", + "user_prompt = \"Describe the speakers in the audio.\"\n", + "\n", + "try:\n", + " result = speech_to_text(audio_path, audio_type, user_prompt)\n", + "\n", + " if result[\"text\"]:\n", + " print(f\"Request ID: {result['request_id']}\\n\")\n", + " print(\"== Q&A Output ==\\n\")\n", + " print(f\"Question: {user_prompt}\")\n", + " print(f\"Answer: {result['text']}\")\n", + "\n", + "except Exception as e:\n", + " print(f\"Error occurred: {e}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2ad0bc6d-f356-460e-a567-6bbbec08076c", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_python3", + "language": "python", + "name": "conda_python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.19" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nova-omni/getting-started/02-image_generation_examples.ipynb b/nova-omni/getting-started/02-image_generation_examples.ipynb new file mode 100644 index 00000000..ef3e3ae3 --- /dev/null +++ b/nova-omni/getting-started/02-image_generation_examples.ipynb @@ -0,0 +1,501 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "6af6f4e6", + "metadata": {}, + "source": [ + "# Image Generation Examples\n", + "\n", + "The Amazon Nova 2 Omni model can generate an image from a simple text prompt or from a combination of text and a single input image. All image generation is achieved through text prompting alone - no complex image generation parameters needed!\n", + "\n", + "## Supported Resolutions and Aspect Ratios\n", + "\n", + "**Text-to-Image:**\n", + "- Maximum output size: 4,194,304 pixels (4 megapixels)\n", + "- Default aspect ratio: 16:9\n", + "- Landscape: 2:1 (2880x1440), 16:9 (2704x1520), 3:2 (2496x1664), 4:3 (2352x1760)\n", + "- Square: 1:1 (2048x2048)\n", + "- Portrait: 1:2 (1440x2880), 9:16 (1520x2704), 2:3 (1664x2496), 3:4 (1760x2352)\n", + "\n", + "**Image Editing:**\n", + "- Maximum output size: 1,048,576 pixels (1 megapixel)\n", + "- Output matches input aspect ratio unless explicitly requested otherwise\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "id": "3e0c580c", + "metadata": {}, + "source": [ + "### Helper Functions\n", + "\n", + "Run the cell below to establish some helper functions used by the examples." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e3399ca7", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import os\n", + "\n", + "import boto3\n", + "from botocore.config import Config\n", + "from botocore.exceptions import ClientError\n", + "\n", + "import nova_notebook_utils\n", + "import nova_utils\n", + "\n", + "MODEL_ID = \"us.amazon.nova-2-omni-v1:0\"\n", + "REGION_ID = \"us-west-2\"\n", + "\n", + "# Configure the Bedrock Runtime client with an extended read timeout.\n", + "config = Config(\n", + " read_timeout=60 * 5,\n", + " retries={\"max_attempts\": 0},\n", + ")\n", + "bedrock_runtime = boto3.client(\n", + " service_name=\"bedrock-runtime\",\n", + " region_name=REGION_ID,\n", + " config=config,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "text-to-image-section", + "metadata": {}, + "source": [ + "## Text-to-Image Generation\n", + "\n", + "Generate images from text prompts. 
At minimum, include a phrase that makes it clear you want an image generated.\n", + "\n", + "---\n", + "\n", + "### Example 1: Basic Text-to-Image\n", + "\n", + "Simple prompts work well with Nova 2 Omni." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3aaeb43e", + "metadata": {}, + "outputs": [], + "source": [ + "new_message = {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\"text\": \"Create an image of a cozy campfire at night\"},\n", + " ],\n", + "}\n", + "\n", + "request = {\n", + " \"modelId\": MODEL_ID,\n", + " \"messages\": [new_message],\n", + "}\n", + "\n", + "# == Uncomment below to include a system prompt ==\n", + "# request[\"system\"] = [{\"text\": \"Always generate images using 1:1 aspect ratio\"}]\n", + "\n", + "# == Uncomment below to enable reasoning ==\n", + "# request[\"additionalModelRequestFields\"] = {\n", + "# \"reasoningConfig\": {\n", + "# \"type\": \"enabled\",\n", + "# \"maxReasoningEffort\": \"low\", # \"low\" | \"medium\" | \"high\"\n", + "# }\n", + "# }\n", + "\n", + "# == Uncomment below to set your own inference params ==\n", + "# request[\"inferenceConfig\"] = {\"temperature\": 0, \"topP\": 1, \"maxTokens\": 10000}\n", + "\n", + "try:\n", + " response = nova_utils.converse(\n", + " bedrock_runtime=bedrock_runtime, request=request, output_dir=\"output\"\n", + " )\n", + " nova_notebook_utils.render_response(response)\n", + "except ClientError as e:\n", + " print(f\"Error: {json.dumps(e.response, indent=4)}\")\n", + "except Exception as e:\n", + " print(f\"Error: {e}\")" + ] + }, + { + "cell_type": "markdown", + "id": "aspect-ratio-example", + "metadata": {}, + "source": [ + "---\n", + "\n", + "### Example 2: Setting Aspect Ratio\n", + "\n", + "Request specific aspect ratios by mentioning them in your prompt. Default is 16:9." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aspect-ratio-code", + "metadata": {}, + "outputs": [], + "source": [ + "new_message = {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\"text\": \"Create an image of a pyramid of ice cream scoops on a plain yellow background. Make it square.\"},\n", + " ],\n", + "}\n", + "\n", + "request = {\"modelId\": MODEL_ID, \"messages\": [new_message]}\n", + "\n", + "try:\n", + " response = nova_utils.converse(bedrock_runtime=bedrock_runtime, request=request, output_dir=\"output\")\n", + " nova_notebook_utils.render_response(response)\n", + "except ClientError as e:\n", + " print(f\"Error: {json.dumps(e.response, indent=4)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "visual-style-example", + "metadata": {}, + "source": [ + "---\n", + "\n", + "### Example 3: Setting Visual Style\n", + "\n", + "Dictate the visual style by describing it in your prompt." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "visual-style-code", + "metadata": {}, + "outputs": [], + "source": [ + "new_message = {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\"text\": \"Create a whimsical storybook illustration of a fox character wearing a backpack\"},\n", + " ],\n", + "}\n", + "\n", + "request = {\"modelId\": MODEL_ID, \"messages\": [new_message]}\n", + "\n", + "try:\n", + " response = nova_utils.converse(bedrock_runtime=bedrock_runtime, request=request, output_dir=\"output\")\n", + " nova_notebook_utils.render_response(response)\n", + "except ClientError as e:\n", + " print(f\"Error: {json.dumps(e.response, indent=4)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "detailed-prompt-example", + "metadata": {}, + "source": [ + "---\n", + "\n", + "### Example 4: Detailed Creative Control\n", + "\n", + "Be more descriptive about subject, environment, and action for greater control." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "detailed-prompt-code", + "metadata": {}, + "outputs": [], + "source": [ + "new_message = {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\"text\": \"Create an image of a young girl detective. She should be wearing a beret and a checkered shirt. She is in an alley examining the ground with a magnifying glass where some dark footprints are visible. It's daytime with stark shadows. The image should be in a colorful graphic novel style with strong line work.\"},\n", + " ],\n", + "}\n", + "\n", + "request = {\"modelId\": MODEL_ID, \"messages\": [new_message]}\n", + "\n", + "try:\n", + " response = nova_utils.converse(bedrock_runtime=bedrock_runtime, request=request, output_dir=\"output\")\n", + " nova_notebook_utils.render_response(response)\n", + "except ClientError as e:\n", + " print(f\"Error: {json.dumps(e.response, indent=4)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "text-in-image-example", + "metadata": {}, + "source": [ + "---\n", + "\n", + "### Example 5: Generating Text in Images\n", + "\n", + "For single line text, wrap it in double quotes. For multiline text, clearly set it apart." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "text-in-image-code", + "metadata": {}, + "outputs": [], + "source": [ + "new_message = {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\"text\": \"\"\"Generate a medium closeup photo of the front window of a charming restaurant. The words \"Trattoria Nova\" can be seen on the glass, written in a gold calligraphy font that conveys elegance\"\"\"},\n", + " ],\n", + "}\n", + "\n", + "request = {\"modelId\": MODEL_ID, \"messages\": [new_message]}\n", + "\n", + "try:\n", + " response = nova_utils.converse(bedrock_runtime=bedrock_runtime, request=request, output_dir=\"output\")\n", + " nova_notebook_utils.render_response(response)\n", + "except ClientError as e:\n", + " print(f\"Error: {json.dumps(e.response, indent=4)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "image-editing-section", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Image Editing with Input Images\n", + "\n", + "Nova 2 Omni allows you to include a single image as input for image generation. You can edit the image or use it as a reference.\n", + "\n", + "---\n", + "\n", + "### Example 6: Adding/Removing/Replacing Items\n", + "\n", + "Edit images by adding, removing, or replacing items." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3734d7a9", + "metadata": {}, + "outputs": [], + "source": [ + "input_image_path = \"media/man_crossing_street.png\"\n", + "image_bytes, image_format = nova_utils.load_image_as_bytes(input_image_path)\n", + "\n", + "new_message = {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\"image\": {\"format\": image_format, \"source\": {\"bytes\": image_bytes}}},\n", + " {\"text\": \"Add a cup of tea to the scene\"},\n", + " ],\n", + "}\n", + "\n", + "request = {\"modelId\": MODEL_ID, \"messages\": [new_message]}\n", + "\n", + "try:\n", + " response = nova_utils.converse(bedrock_runtime=bedrock_runtime, request=request, output_dir=\"output\")\n", + " nova_notebook_utils.render_response(response)\n", + "except ClientError as e:\n", + " print(f\"Error: {json.dumps(e.response, indent=4)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "background-change-example", + "metadata": {}, + "source": [ + "---\n", + "\n", + "### Example 7: Changing Background\n", + "\n", + "Replace or modify the background of an image." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "background-change-code", + "metadata": {}, + "outputs": [], + "source": [ + "input_image_path = \"media/man_crossing_street.png\"\n", + "image_bytes, image_format = nova_utils.load_image_as_bytes(input_image_path)\n", + "\n", + "new_message = {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\"image\": {\"format\": image_format, \"source\": {\"bytes\": image_bytes}}},\n", + " {\"text\": \"Replace the background with a view of the mountains\"},\n", + " ],\n", + "}\n", + "\n", + "request = {\"modelId\": MODEL_ID, \"messages\": [new_message]}\n", + "\n", + "try:\n", + " response = nova_utils.converse(bedrock_runtime=bedrock_runtime, request=request, output_dir=\"output\")\n", + " nova_notebook_utils.render_response(response)\n", + "except ClientError as e:\n", + " print(f\"Error: {json.dumps(e.response, indent=4)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "restyle-example", + "metadata": {}, + "source": [ + "---\n", + "\n", + "### Example 8: Restyling an Image\n", + "\n", + "Change the artistic style of an image." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "restyle-code", + "metadata": {}, + "outputs": [], + "source": [ + "input_image_path = \"media/man_crossing_street.png\"\n", + "image_bytes, image_format = nova_utils.load_image_as_bytes(input_image_path)\n", + "\n", + "new_message = {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\"image\": {\"format\": image_format, \"source\": {\"bytes\": image_bytes}}},\n", + " {\"text\": \"Change the style of the image to a 3D animated film\"},\n", + " ],\n", + "}\n", + "\n", + "request = {\"modelId\": MODEL_ID, \"messages\": [new_message]}\n", + "\n", + "try:\n", + " response = nova_utils.converse(bedrock_runtime=bedrock_runtime, request=request, output_dir=\"output\")\n", + " nova_notebook_utils.render_response(response)\n", + "except ClientError as e:\n", + " print(f\"Error: {json.dumps(e.response, indent=4)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "aspect-ratio-change-example", + "metadata": {}, + "source": [ + "---\n", + "\n", + "### Example 9: Changing Aspect Ratio\n", + "\n", + "Expand or change the aspect ratio of an existing image." 
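+ , "\n\nBecause an input image is provided, this is an image-editing request, so the output is capped at roughly 1 megapixel (see the resolution notes at the top of this notebook)."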
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aspect-ratio-change-code", + "metadata": {}, + "outputs": [], + "source": [ + "input_image_path = \"media/man_crossing_street.png\"\n", + "image_bytes, image_format = nova_utils.load_image_as_bytes(input_image_path)\n", + "\n", + "new_message = {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\"image\": {\"format\": image_format, \"source\": {\"bytes\": image_bytes}}},\n", + " {\"text\": \"Change the aspect ratio to square\"},\n", + " ],\n", + "}\n", + "\n", + "request = {\"modelId\": MODEL_ID, \"messages\": [new_message]}\n", + "\n", + "try:\n", + " response = nova_utils.converse(bedrock_runtime=bedrock_runtime, request=request, output_dir=\"output\")\n", + " nova_notebook_utils.render_response(response)\n", + "except ClientError as e:\n", + " print(f\"Error: {json.dumps(e.response, indent=4)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "extract-element-example", + "metadata": {}, + "source": [ + "---\n", + "\n", + "### Example 10: Extracting Elements\n", + "\n", + "Extract specific elements from an image." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "extract-element-code", + "metadata": {}, + "outputs": [], + "source": [ + "input_image_path = \"media/man_crossing_street.png\"\n", + "image_bytes, image_format = nova_utils.load_image_as_bytes(input_image_path)\n", + "\n", + "new_message = {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\"image\": {\"format\": image_format, \"source\": {\"bytes\": image_bytes}}},\n", + " {\"text\": \"Extract the foreground object in this image and place it on a white background\"},\n", + " ],\n", + "}\n", + "\n", + "request = {\"modelId\": MODEL_ID, \"messages\": [new_message]}\n", + "\n", + "try:\n", + " response = nova_utils.converse(bedrock_runtime=bedrock_runtime, request=request, output_dir=\"output\")\n", + " nova_notebook_utils.render_response(response)\n", + "except ClientError as e:\n", + " print(f\"Error: {json.dumps(e.response, indent=4)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9821b8ae-46b6-4837-b8d5-5f1bb7eb5ae0", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_python3", + "language": "python", + "name": "conda_python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.19" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nova-omni/getting-started/03-video_understanding_examples.ipynb b/nova-omni/getting-started/03-video_understanding_examples.ipynb new file mode 100644 index 00000000..656b3609 --- /dev/null +++ b/nova-omni/getting-started/03-video_understanding_examples.ipynb @@ -0,0 +1,486 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "title", + "metadata": {}, + "source": [ + "# Video Understanding Examples\n", + "\n", + "This notebook demonstrates how to use Amazon Nova 2 Omni for understanding video content. Nova 2 Omni can analyze videos, understand actions, extract insights, and classify video types.\n", + "\n", + "**Supported video formats:** mp4, mov, avi, mkv, webm\n", + "\n", + "**Note:** For audio understanding examples, see **01_speech_understanding_examples.ipynb**. 
For image generation examples, see **02_image_generation_examples.ipynb**.\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "id": "setup", + "metadata": {}, + "source": [ + "## Setup\n", + "\n", + "### Helper Functions\n", + "\n", + "Run the cell below to establish helper functions used by the examples in this notebook." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "imports", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import boto3\n", + "from botocore.config import Config\n", + "from botocore.exceptions import ClientError\n", + "from IPython.display import Image, display\n", + "\n", + "import nova_utils\n", + "\n", + "MODEL_ID = \"us.amazon.nova-2-omni-v1:0\"\n", + "REGION_ID = \"us-west-2\"\n", + "\n", + "def get_bedrock_runtime():\n", + " \"\"\"Returns a properly configured Bedrock Runtime client.\"\"\"\n", + " config = Config(read_timeout=2 * 60)\n", + " bedrock = boto3.client(\n", + " service_name=\"bedrock-runtime\",\n", + " region_name=REGION_ID,\n", + " config=config,\n", + " )\n", + " return bedrock" + ] + }, + { + "cell_type": "markdown", + "id": "video-understanding-section", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Video Understanding\n", + "\n", + "Nova 2 Omni can analyze video content, understand actions, classify video types, and extract insights from moving images.\n", + "\n", + "---\n", + "\n", + "### Example 1a: Video Summarization\n", + "\n", + "Create executive summaries of video content.\n", + "\n", + "**Recommended inference parameters:**\n", + "* `temperature`: 0\n", + "* `topP`: 1\n", + "* Some use cases may benefit from enabling model reasoning" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "video-summarize", + "metadata": {}, + "outputs": [], + "source": [ + "INPUT_VIDEO_PATH = \"media/Cheesecake.mp4\"\n", + "user_prompt = \"Can you create an executive summary of this video's content?\"\n", + "\n", + "with open(INPUT_VIDEO_PATH, \"rb\") as video_file:\n", + " video_bytes = video_file.read()\n", + "\n", + "request = {\n", + " \"modelId\": MODEL_ID,\n", + " \"messages\": [\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\"video\": {\"format\": \"mp4\", \"source\": {\"bytes\": video_bytes}}},\n", + " {\"text\": user_prompt},\n", + " ],\n", + " }\n", + " ],\n", + " \"inferenceConfig\": {\"temperature\": 0, \"topP\": 1, \"maxTokens\": 10000},\n", + "}\n", + "\n", + "bedrock_runtime = get_bedrock_runtime()\n", + "\n", + "try:\n", + " response = bedrock_runtime.converse(**request)\n", + " text_content = next((item for item in response[\"output\"][\"message\"][\"content\"] if \"text\" in item), None)\n", + " \n", + " if text_content:\n", + " print(\"== Video Summary ==\")\n", + " print(text_content[\"text\"])\n", + "\n", + "except ClientError as err:\n", + " print(f\"Error occurred: {err}\")" + ] + }, + { + "cell_type": "markdown", + "id": "video-captioning", + "metadata": {}, + "source": [ + "---\n", + "\n", + "### Example 1b: Step-by-Step Recipe Extraction\n", + "\n", + "Extract structured recipe information from cooking videos.\n", + "\n", + "**Recommended inference parameters:**\n", + "* `temperature`: 0\n", + "* `topP`: 1\n", + "* Some use cases may benefit from enabling model reasoning" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "video-caption", + "metadata": {}, + "outputs": [], + "source": [ + "INPUT_VIDEO_PATH = \"media/Cheesecake.mp4\"\n", + "user_prompt = \"\"\"Extract the recipe from this video. 
Provide:\n", + "1. Recipe name\n", + "2. Ingredients list with measurements\n", + "3. Step-by-step instructions\n", + "\n", + "Format as a clear, structured recipe.\"\"\"\n", + "\n", + "with open(INPUT_VIDEO_PATH, \"rb\") as video_file:\n", + " video_bytes = video_file.read()\n", + "\n", + "request = {\n", + " \"modelId\": MODEL_ID,\n", + " \"messages\": [\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\"video\": {\"format\": \"mp4\", \"source\": {\"bytes\": video_bytes}}},\n", + " {\"text\": user_prompt},\n", + " ],\n", + " }\n", + " ],\n", + " \"inferenceConfig\": {\"temperature\": 0, \"topP\": 1, \"maxTokens\": 10000},\n", + "}\n", + "\n", + "bedrock_runtime = get_bedrock_runtime()\n", + "\n", + "try:\n", + " response = bedrock_runtime.converse(**request)\n", + " text_content = next((item for item in response[\"output\"][\"message\"][\"content\"] if \"text\" in item), None)\n", + " \n", + " if text_content:\n", + " print(\"== Extracted Recipe ==\")\n", + " print(text_content[\"text\"])\n", + "\n", + "except ClientError as err:\n", + " print(f\"Error occurred: {err}\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "video-detailed-description", + "metadata": {}, + "source": [ + "---\n", + "\n", + "### Example 1c: Rich Visual Description\n", + "\n", + "Generate detailed descriptions focusing on visual elements, colors, composition, and cinematography.\n", + "\n", + "**Recommended inference parameters:**\n", + "* `temperature`: 0\n", + "* `topP`: 1\n", + "* Some use cases may benefit from enabling model reasoning" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9719c6c0", + "metadata": {}, + "outputs": [], + "source": [ + "INPUT_VIDEO_PATH = \"media/Cheesecake.mp4\"\n", + "user_prompt = \"\"\"Provide a rich visual description of this video. 
Focus on:\n", + "- Camera angles and framing (top-down, close-up, etc.)\n", + "- Color palette and lighting\n", + "- Visual composition and layout\n", + "- Movement and transitions\n", + "- Text overlays and their styling\n", + "- Overall aesthetic and production quality\n", + "\n", + "Describe what makes this video visually engaging.\"\"\"\n", + "\n", + "with open(INPUT_VIDEO_PATH, \"rb\") as video_file:\n", + " video_bytes = video_file.read()\n", + "\n", + "request = {\n", + " \"modelId\": MODEL_ID,\n", + " \"messages\": [\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\"video\": {\"format\": \"mp4\", \"source\": {\"bytes\": video_bytes}}},\n", + " {\"text\": user_prompt},\n", + " ],\n", + " }\n", + " ],\n", + " \"inferenceConfig\": {\"temperature\": 0, \"topP\": 1, \"maxTokens\": 10000},\n", + "}\n", + "\n", + "bedrock_runtime = get_bedrock_runtime()\n", + "\n", + "try:\n", + " response = bedrock_runtime.converse(**request)\n", + " text_content = next((item for item in response[\"output\"][\"message\"][\"content\"] if \"text\" in item), None)\n", + " \n", + " if text_content:\n", + " print(\"== Rich Visual Description ==\")\n", + " print(text_content[\"text\"])\n", + "\n", + "except ClientError as err:\n", + " print(f\"Error occurred: {err}\")" + ] + }, + { + "cell_type": "markdown", + "id": "video-timestamps", + "metadata": {}, + "source": [ + "---\n", + "\n", + "### Example 1d: Event Timestamp Extraction\n", + "\n", + "Extract timestamps for specific events in videos.\n", + "\n", + "**Recommended inference parameters:**\n", + "* `temperature`: 0\n", + "* `topP`: 1\n", + "* Reasoning should not be used" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f80eadf", + "metadata": {}, + "outputs": [], + "source": [ + "INPUT_VIDEO_PATH = \"media/Cheesecake.mp4\"\n", + "event_query = \"mixing ingredients\"\n", + "user_prompt = f\"Please localize the moment that the event '{event_query}' happens in the video. Answer with the starting and ending time of the event in seconds. e.g. [[72, 82]]. If the event happen multiple times, list all of them. e.g. 
[[40, 50], [72, 82]]\"\n", + "\n", + "with open(INPUT_VIDEO_PATH, \"rb\") as video_file:\n", + " video_bytes = video_file.read()\n", + "\n", + "request = {\n", + " \"modelId\": MODEL_ID,\n", + " \"messages\": [\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\"video\": {\"format\": \"mp4\", \"source\": {\"bytes\": video_bytes}}},\n", + " {\"text\": user_prompt},\n", + " ],\n", + " }\n", + " ],\n", + " \"inferenceConfig\": {\"temperature\": 0, \"topP\": 1, \"maxTokens\": 10000},\n", + "}\n", + "\n", + "bedrock_runtime = get_bedrock_runtime()\n", + "\n", + "try:\n", + " response = bedrock_runtime.converse(**request)\n", + " text_content = next((item for item in response[\"output\"][\"message\"][\"content\"] if \"text\" in item), None)\n", + " \n", + " if text_content:\n", + " print(f\"== Event Timestamps for '{event_query}' ==\")\n", + " print(text_content[\"text\"])\n", + "\n", + "except ClientError as err:\n", + " print(f\"Error occurred: {err}\")" + ] + }, + { + "cell_type": "markdown", + "id": "video-segmentation", + "metadata": {}, + "source": [ + "---\n", + "\n", + "### Example 1e: Video Segmentation with Timestamps\n", + "\n", + "Generate a log of video segments with timestamps and captions.\n", + "\n", + "**Recommended inference parameters:**\n", + "* `temperature`: 0\n", + "* `topP`: 1\n", + "* Reasoning should not be used" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff981189", + "metadata": {}, + "outputs": [], + "source": [ + "INPUT_VIDEO_PATH = \"media/Cheesecake.mp4\"\n", + "user_prompt = \"Segment a video into different scenes and generate caption per scene. The output should be in the format: [STARTING TIME-ENDING TIMESTAMP] CAPTION. Timestamp in MM:SS format\"\n", + "\n", + "with open(INPUT_VIDEO_PATH, \"rb\") as video_file:\n", + " video_bytes = video_file.read()\n", + "\n", + "request = {\n", + " \"modelId\": MODEL_ID,\n", + " \"messages\": [\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\"video\": {\"format\": \"mp4\", \"source\": {\"bytes\": video_bytes}}},\n", + " {\"text\": user_prompt},\n", + " ],\n", + " }\n", + " ],\n", + " \"inferenceConfig\": {\"temperature\": 0, \"topP\": 1, \"maxTokens\": 10000},\n", + "}\n", + "\n", + "bedrock_runtime = get_bedrock_runtime()\n", + "\n", + "try:\n", + " response = bedrock_runtime.converse(**request)\n", + " text_content = next((item for item in response[\"output\"][\"message\"][\"content\"] if \"text\" in item), None)\n", + " \n", + " if text_content:\n", + " print(\"== Video Segmentation ==\")\n", + " print(text_content[\"text\"])\n", + "\n", + "except ClientError as err:\n", + " print(f\"Error occurred: {err}\")" + ] + }, + { + "cell_type": "markdown", + "id": "video-classification", + "metadata": {}, + "source": [ + "---\n", + "\n", + "### Example 1f: Video Classification\n", + "\n", + "Classify videos based on predefined categories.\n", + "\n", + "**Recommended inference parameters:**\n", + "* `temperature`: 0\n", + "* `topP`: 1\n", + "* Reasoning should not be used" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31a63729", + "metadata": {}, + "outputs": [], + "source": [ + "INPUT_VIDEO_PATH = \"media/Cheesecake.mp4\"\n", + "user_prompt = \"\"\"What is the most appropriate category for this video? 
Select your answer from the options provided:\n", + "Cooking Tutorial\n", + "Home Repair\n", + "Makeup Tutorial\n", + "Other\"\"\"\n", + "\n", + "with open(INPUT_VIDEO_PATH, \"rb\") as video_file:\n", + " video_bytes = video_file.read()\n", + "\n", + "request = {\n", + " \"modelId\": MODEL_ID,\n", + " \"messages\": [\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\"video\": {\"format\": \"mp4\", \"source\": {\"bytes\": video_bytes}}},\n", + " {\"text\": user_prompt},\n", + " ],\n", + " }\n", + " ],\n", + " \"inferenceConfig\": {\"temperature\": 0, \"topP\": 1, \"maxTokens\": 100},\n", + "}\n", + "\n", + "bedrock_runtime = get_bedrock_runtime()\n", + "\n", + "try:\n", + " response = bedrock_runtime.converse(**request)\n", + " text_content = next((item for item in response[\"output\"][\"message\"][\"content\"] if \"text\" in item), None)\n", + " \n", + " if text_content:\n", + " print(\"== Video Classification ==\")\n", + " print(text_content[\"text\"])\n", + "\n", + "except ClientError as err:\n", + " print(f\"Error occurred: {err}\")" + ] + }, + { + "cell_type": "markdown", + "id": "next-steps", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Key Takeaways\n", + "\n", + "- **Video Summarization**: Create executive summaries of video content\n", + "- **Recipe Extraction**: Extract structured recipe information from cooking videos\n", + "- **Visual Description**: Analyze cinematography, colors, and composition\n", + "- **Timestamp Extraction**: Locate specific events within videos\n", + "- **Video Segmentation**: Break videos into timestamped segments with captions\n", + "- **Video Classification**: Categorize videos based on content\n", + "- **Temperature Settings**: Use temperature 0 for factual, consistent responses\n", + "\n", + "## Next Steps\n", + "\n", + "- Explore **01_speech_understanding_examples.ipynb** for comprehensive audio processing examples\n", + "- Check out **02_image_generation_examples.ipynb** to learn about image generation capabilities\n", + "- Experiment with different prompts and inference parameters to optimize for your use case" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a76f2685-aa4a-4cf3-b334-103d69773532", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_python3", + "language": "python", + "name": "conda_python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.19" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nova-omni/getting-started/04-langchain_multimodal_reasoning.ipynb b/nova-omni/getting-started/04-langchain_multimodal_reasoning.ipynb new file mode 100644 index 00000000..a1ceb2f6 --- /dev/null +++ b/nova-omni/getting-started/04-langchain_multimodal_reasoning.ipynb @@ -0,0 +1,415 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "title", + "metadata": {}, + "source": [ + "# LangChain Integration with Nova 2 Omni - Multimodal Reasoning\n", + "\n", + "This notebook demonstrates how to use Amazon Nova 2 Omni with LangChain tool definitions and direct boto3 calls for reasoning. 
We combine LangChain's tool schema with boto3's Bedrock API to enable reasoning configuration.\n", + "\n", + "**Key Features:**\n", + "- Multimodal input processing (image, video, audio)\n", + "- Reasoning effort configuration (low, medium, high)\n", + "- LangChain tool definitions with Pydantic schemas\n", + "- Direct boto3 calls for full API control\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "id": "setup", + "metadata": {}, + "source": [ + "## Setup and Installation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "install", + "metadata": {}, + "outputs": [], + "source": [ + "# Install required packages\n", + "!pip install langchain langchain-aws -q" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "imports", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import base64\n", + "import boto3\n", + "from typing import Literal\n", + "from botocore.config import Config\n", + "from botocore.exceptions import ClientError\n", + "from langchain_core.tools import tool\n", + "from pydantic import BaseModel, Field\n", + "\n", + "import nova_utils\n", + "\n", + "MODEL_ID = \"us.amazon.nova-2-omni-v1:0\"\n", + "REGION_ID = \"us-west-2\"\n", + "\n", + "def get_bedrock_runtime():\n", + " config = Config(read_timeout=2 * 60)\n", + " return boto3.client(\n", + " service_name=\"bedrock-runtime\",\n", + " region_name=REGION_ID,\n", + " config=config\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "tool-definition", + "metadata": {}, + "source": [ + "## Define Tools with LangChain\n", + "\n", + "Use LangChain to define tool schemas, then convert them to Bedrock format." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "define-tool", + "metadata": {}, + "outputs": [], + "source": [ + "class SafetyAssessmentInput(BaseModel):\n", + " \"\"\"Input schema for safety risk assessment.\"\"\"\n", + " identified_hazards: list[str] = Field(description=\"List of potential hazards or risks\")\n", + " risk_level: Literal[\"low\", \"medium\", \"high\", \"critical\"] = Field(\n", + " description=\"Overall risk level assessment\"\n", + " )\n", + " recommended_actions: list[str] = Field(\n", + " description=\"List of recommended safety actions or precautions\"\n", + " )\n", + "\n", + "@tool(args_schema=SafetyAssessmentInput)\n", + "def assess_safety_risks(identified_hazards: list[str], risk_level: str, recommended_actions: list[str]) -> dict:\n", + " \"\"\"Assess safety risks and hazards in a scene or situation.\n", + " \n", + " Use this tool to identify potential dangers, evaluate risk levels,\n", + " and recommend appropriate safety measures or precautions.\n", + " \"\"\"\n", + " return {\n", + " \"status\": \"assessment_complete\",\n", + " \"hazards\": identified_hazards,\n", + " \"risk_level\": risk_level,\n", + " \"actions\": recommended_actions\n", + " }\n", + "\n", + "class RecipeExtractionInput(BaseModel):\n", + " \"\"\"Input schema for extracting recipe information.\"\"\"\n", + " dish_name: str = Field(description=\"Name of the dish being prepared\")\n", + " ingredients: list[str] = Field(description=\"List of ingredients used\")\n", + " steps: list[str] = Field(description=\"Ordered list of preparation steps\")\n", + " cooking_time: str = Field(description=\"Estimated total cooking time\")\n", + " difficulty: Literal[\"easy\", \"medium\", \"hard\"] = Field(\n", + " description=\"Difficulty level of the recipe\"\n", + " )\n", + "\n", + "@tool(args_schema=RecipeExtractionInput)\n", + "def 
extract_recipe(dish_name: str, ingredients: list[str], steps: list[str], cooking_time: str, difficulty: str) -> dict:\n", + " \"\"\"Extract structured recipe information from cooking videos or images.\n", + " \n", + " Use this tool to parse cooking demonstrations and create structured\n", + " recipe data including ingredients, steps, timing, and difficulty.\n", + " \"\"\"\n", + " return {\n", + " \"status\": \"recipe_extracted\",\n", + " \"dish\": dish_name,\n", + " \"ingredients\": ingredients,\n", + " \"steps\": steps,\n", + " \"time\": cooking_time,\n", + " \"difficulty\": difficulty\n", + " }\n", + "\n", + "# Convert LangChain tool to Bedrock format\n", + "def langchain_tool_to_bedrock(lc_tool):\n", + " schema = lc_tool.args_schema.model_json_schema()\n", + " return {\n", + " \"toolSpec\": {\n", + " \"name\": lc_tool.name,\n", + " \"description\": lc_tool.description,\n", + " \"inputSchema\": {\n", + " \"json\": schema\n", + " }\n", + " }\n", + " }\n", + "\n", + "safety_tools = [langchain_tool_to_bedrock(assess_safety_risks)]\n", + "recipe_tools = [langchain_tool_to_bedrock(extract_recipe)]" + ] + }, + { + "cell_type": "markdown", + "id": "image-reasoning", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Example 1: Image Understanding with Medium Reasoning\n", + "\n", + "Analyze an image using medium reasoning effort." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "image-example", + "metadata": {}, + "outputs": [], + "source": [ + "# Load image\n", + "image_path = \"media/man_crossing_street.png\"\n", + "image_bytes, image_format = nova_utils.load_image_as_bytes(image_path)\n", + "\n", + "# Create request with reasoning config\n", + "request = {\n", + " \"modelId\": MODEL_ID,\n", + " \"messages\": [\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\"image\": {\"format\": image_format, \"source\": {\"bytes\": image_bytes}}},\n", + " {\"text\": \"Analyze this image for safety risks. Identify any hazards, assess the overall risk level, and recommend appropriate safety actions. Use the assess_safety_risks tool to provide your assessment.\"}\n", + " ]\n", + " }\n", + " ],\n", + " \"toolConfig\": {\"tools\": safety_tools},\n", + " \"additionalModelRequestFields\": {\n", + " \"reasoningConfig\": {\n", + " \"type\": \"enabled\",\n", + " \"maxReasoningEffort\": \"medium\"\n", + " }\n", + " }\n", + "}\n", + "\n", + "bedrock = get_bedrock_runtime()\n", + "response = bedrock.converse(**request)\n", + "\n", + "print(\"=== Image Analysis Response ===\")\n", + "for content in response[\"output\"][\"message\"][\"content\"]:\n", + " if \"text\" in content:\n", + " print(f\"Text: {content['text']}\")\n", + " elif \"toolUse\" in content:\n", + " tool_use = content[\"toolUse\"]\n", + " print(f\"\\nTool: {tool_use['name']}\")\n", + " print(f\"Arguments: {json.dumps(tool_use['input'], indent=2)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "video-reasoning", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Example 2: Video Analysis with High Reasoning\n", + "\n", + "Analyze video content with high reasoning effort." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "video-example", + "metadata": {}, + "outputs": [], + "source": [ + "# Load video\n", + "video_path = \"media/Cheesecake.mp4\"\n", + "with open(video_path, \"rb\") as f:\n", + " video_bytes = f.read()\n", + "\n", + "request = {\n", + " \"modelId\": MODEL_ID,\n", + " \"messages\": [\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\"video\": {\"format\": \"mp4\", \"source\": {\"bytes\": video_bytes}}},\n", + " {\"text\": \"Watch this cooking video and extract the complete recipe. Identify the dish name, all ingredients used, step-by-step instructions, total cooking time, and difficulty level. Use the extract_recipe tool to provide the structured recipe data.\"}\n", + " ]\n", + " }\n", + " ],\n", + " \"toolConfig\": {\"tools\": recipe_tools},\n", + " \"additionalModelRequestFields\": {\n", + " \"reasoningConfig\": {\n", + " \"type\": \"enabled\",\n", + " \"maxReasoningEffort\": \"high\"\n", + " }\n", + " }\n", + "}\n", + "\n", + "bedrock = get_bedrock_runtime()\n", + "response = bedrock.converse(**request)\n", + "\n", + "print(\"=== Video Analysis Response ===\")\n", + "for content in response[\"output\"][\"message\"][\"content\"]:\n", + " if \"text\" in content:\n", + " print(f\"Text: {content['text']}\")\n", + " elif \"toolUse\" in content:\n", + " tool_use = content[\"toolUse\"]\n", + " print(f\"\\nTool: {tool_use['name']}\")\n", + " print(f\"Arguments: {json.dumps(tool_use['input'], indent=2)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "mmmu-pattern", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Example 3: MMMU-Style Multiple Choice\n", + "\n", + "Implement MMMU-style evaluation with multiple choice questions." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "mmmu-tool", + "metadata": {}, + "outputs": [], + "source": [ + "class MultipleChoiceAnswerInput(BaseModel):\n", + " \"\"\"Input schema for multiple choice answer submission.\"\"\"\n", + " selected_option: Literal[\"A\", \"B\", \"C\", \"D\"] = Field(\n", + " description=\"The selected answer option (A, B, C, or D)\"\n", + " )\n", + " reasoning_steps: str = Field(\n", + " description=\"Step-by-step reasoning that led to this answer\"\n", + " )\n", + "\n", + "@tool(args_schema=MultipleChoiceAnswerInput)\n", + "def submit_multiple_choice_answer(selected_option: str, reasoning_steps: str) -> dict:\n", + " \"\"\"Submit the final answer for a multiple choice question.\n", + " \n", + " Use this tool after carefully analyzing the question and all options.\n", + " Provide your reasoning steps before selecting the final answer.\n", + " \"\"\"\n", + " return {\n", + " \"status\": \"submitted\",\n", + " \"answer\": selected_option,\n", + " \"reasoning\": reasoning_steps\n", + " }\n", + "\n", + "mmmu_bedrock_tools = [langchain_tool_to_bedrock(submit_multiple_choice_answer)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "mmmu-example", + "metadata": {}, + "outputs": [], + "source": [ + "# Load image\n", + "image_path = \"media/man_crossing_street.png\"\n", + "image_bytes, image_format = nova_utils.load_image_as_bytes(image_path)\n", + "\n", + "question = \"\"\"Based on the image, what is the most appropriate action for the person to take?\n", + "\n", + "A) Continue walking without looking\n", + "B) Check for traffic before crossing\n", + "C) Run across the street quickly\n", + "D) Wait for a green light signal\n", + "\n", + "Use the submit_multiple_choice_answer tool to provide your 
answer.\"\"\"\n", + "\n", + "request = {\n", + " \"modelId\": MODEL_ID,\n", + " \"messages\": [\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\"image\": {\"format\": image_format, \"source\": {\"bytes\": image_bytes}}},\n", + " {\"text\": question}\n", + " ]\n", + " }\n", + " ],\n", + " \"toolConfig\": {\"tools\": mmmu_bedrock_tools},\n", + " \"additionalModelRequestFields\": {\n", + " \"reasoningConfig\": {\n", + " \"type\": \"enabled\",\n", + " \"maxReasoningEffort\": \"medium\"\n", + " }\n", + " }\n", + "}\n", + "\n", + "bedrock = get_bedrock_runtime()\n", + "response = bedrock.converse(**request)\n", + "\n", + "print(\"=== MMMU-Style Question Response ===\")\n", + "for content in response[\"output\"][\"message\"][\"content\"]:\n", + " if \"toolUse\" in content:\n", + " tool_use = content[\"toolUse\"]\n", + " args = tool_use[\"input\"]\n", + " print(f\"Selected Option: {args.get('selected_option')}\")\n", + " print(f\"Reasoning: {args.get('reasoning_steps')}\")" + ] + }, + { + "cell_type": "markdown", + "id": "summary", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Key Takeaways\n", + "\n", + "- **Hybrid Approach**: Use LangChain for tool definitions, boto3 for API calls with reasoning\n", + "- **Reasoning Effort**: Use `low`, `medium`, or `high` based on task complexity\n", + "- **Tool Conversion**: Convert LangChain tools to Bedrock format with `langchain_tool_to_bedrock`\n", + "- **Full API Control**: Direct boto3 calls enable all Bedrock features including reasoning\n", + "- **Temperature Settings**: Use 0.0-0.1 for factual tasks\n", + "\n", + "## Next Steps\n", + "\n", + "- Explore **05_langgraph_multimodal_reasoning.ipynb** for stateful workflows\n", + "- Check **06_strands_multimodal_reasoning.ipynb** for multi-agent patterns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c04951d-3add-49e1-b1f5-a29f967ec9fe", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_python3", + "language": "python", + "name": "conda_python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.19" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nova-omni/getting-started/05-langgraph_multimodal_reasoning.ipynb b/nova-omni/getting-started/05-langgraph_multimodal_reasoning.ipynb new file mode 100644 index 00000000..0606fe76 --- /dev/null +++ b/nova-omni/getting-started/05-langgraph_multimodal_reasoning.ipynb @@ -0,0 +1,303 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "title", + "metadata": {}, + "source": [ + "# LangGraph Integration with Nova 2 Omni - Stateful Multimodal Reasoning\n", + "\n", + "This notebook demonstrates using Amazon Nova 2 Omni with LangGraph for stateful workflows. 
We use direct boto3 calls for reasoning configuration and LangGraph for state management.\n", + "\n", + "**Key Features:**\n", + "- Stateful workflow management with LangGraph\n", + "- Direct boto3 calls with reasoning configuration\n", + "- Multi-step reasoning with state persistence\n", + "- Conditional routing based on outputs\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "id": "setup", + "metadata": {}, + "source": [ + "## Setup and Installation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "install", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install langgraph langchain-core -q" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "imports", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import boto3\n", + "from typing import Annotated, Literal, TypedDict\n", + "from botocore.config import Config\n", + "from langgraph.graph import StateGraph, END\n", + "from langgraph.graph.message import add_messages\n", + "from pydantic import BaseModel, Field\n", + "from langchain_core.tools import tool\n", + "\n", + "import nova_utils\n", + "\n", + "MODEL_ID = \"us.amazon.nova-2-omni-v1:0\"\n", + "REGION_ID = \"us-west-2\"\n", + "\n", + "def get_bedrock_runtime():\n", + " config = Config(read_timeout=2 * 60)\n", + " return boto3.client(\n", + " service_name=\"bedrock-runtime\",\n", + " region_name=REGION_ID,\n", + " config=config\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "state-tools", + "metadata": {}, + "source": [ + "## Define State and Tools\n", + "\n", + "Create state schema and tool definitions for the workflow." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "define-state-tools", + "metadata": {}, + "outputs": [], + "source": [ + "class ReasoningState(TypedDict):\n", + " \"\"\"State for multimodal reasoning workflow.\"\"\"\n", + " messages: list\n", + " analysis_complete: bool\n", + " final_answer: str\n", + "\n", + "class VideoSegmentInput(BaseModel):\n", + " \"\"\"Schema for video segment analysis.\"\"\"\n", + " timestamp_range: str = Field(description=\"Time range in format MM:SS-MM:SS\")\n", + " action_description: str = Field(description=\"What action is happening\")\n", + " key_objects: list[str] = Field(description=\"Key objects visible in segment\")\n", + "\n", + "@tool(args_schema=VideoSegmentInput)\n", + "def log_video_segment(timestamp_range: str, action_description: str, key_objects: list[str]) -> dict:\n", + " \"\"\"Log analysis of a video segment with timestamp and details.\"\"\"\n", + " return {\n", + " \"status\": \"segment_logged\",\n", + " \"timestamp\": timestamp_range,\n", + " \"action\": action_description,\n", + " \"objects\": key_objects\n", + " }\n", + "\n", + "class VideoSummaryInput(BaseModel):\n", + " \"\"\"Schema for complete video summary.\"\"\"\n", + " title: str = Field(description=\"Title or main topic of video\")\n", + " total_segments: int = Field(description=\"Number of segments analyzed\")\n", + " key_takeaways: list[str] = Field(description=\"Main takeaways from video\")\n", + "\n", + "@tool(args_schema=VideoSummaryInput)\n", + "def submit_video_summary(title: str, total_segments: int, key_takeaways: list[str]) -> dict:\n", + " \"\"\"Submit final video summary after analyzing all segments.\"\"\"\n", + " return {\n", + " \"status\": \"summary_complete\",\n", + " \"title\": title,\n", + " \"segments\": total_segments,\n", + " \"takeaways\": key_takeaways\n", + " }\n", + "\n", + "def 
langchain_tool_to_bedrock(lc_tool):\n", + " schema = lc_tool.args_schema.model_json_schema()\n", + " return {\n", + " \"toolSpec\": {\n", + " \"name\": lc_tool.name,\n", + " \"description\": lc_tool.description,\n", + " \"inputSchema\": {\"json\": schema}\n", + " }\n", + " }\n", + "\n", + "bedrock_tools = [langchain_tool_to_bedrock(log_video_segment), langchain_tool_to_bedrock(submit_video_summary)]" + ] + }, + { + "cell_type": "markdown", + "id": "graph-definition", + "metadata": {}, + "source": [ + "## Build the Reasoning Graph\n", + "\n", + "Create a stateful graph with reasoning nodes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "build-graph", + "metadata": {}, + "outputs": [], + "source": [ + "bedrock = get_bedrock_runtime()\n", + "\n", + "def reasoning_node(state: ReasoningState):\n", + " \"\"\"Main reasoning node using direct boto3 calls.\"\"\"\n", + " # Only use the first user message for reasoning (no assistant prefill)\n", + " user_messages = [msg for msg in state[\"messages\"] if msg[\"role\"] == \"user\"]\n", + " request = {\n", + " \"modelId\": MODEL_ID,\n", + " \"messages\": user_messages,\n", + " \"toolConfig\": {\"tools\": bedrock_tools},\n", + " \"additionalModelRequestFields\": {\n", + " \"reasoningConfig\": {\n", + " \"type\": \"enabled\",\n", + " \"maxReasoningEffort\": \"medium\"\n", + " }\n", + " }\n", + " }\n", + " \n", + " response = bedrock.converse(**request)\n", + " \n", + " # Extract content, filtering out reasoningContent blocks\n", + " content = []\n", + " for item in response[\"output\"][\"message\"][\"content\"]:\n", + " if \"reasoningContent\" not in item:\n", + " content.append(item)\n", + " \n", + " new_message = {\n", + " \"role\": \"assistant\",\n", + " \"content\": content\n", + " }\n", + " \n", + " return {\"messages\": state[\"messages\"] + [new_message]}\n", + "\n", + "def should_continue(state: ReasoningState) -> str:\n", + " \"\"\"Determine if workflow should continue.\"\"\"\n", + " last_message = state[\"messages\"][-1]\n", + " \n", + " for content in last_message.get(\"content\", []):\n", + " if \"toolUse\" in content:\n", + " if content[\"toolUse\"][\"name\"] == \"submit_video_summary\":\n", + " return \"end\"\n", + " return \"continue\"\n", + "\n", + "workflow = StateGraph(ReasoningState)\n", + "workflow.add_node(\"reasoning\", reasoning_node)\n", + "workflow.set_entry_point(\"reasoning\")\n", + "workflow.add_conditional_edges(\n", + " \"reasoning\",\n", + " should_continue,\n", + " {\"continue\": \"reasoning\", \"end\": END}\n", + ")\n", + "\n", + "app = workflow.compile()\n", + "print(\"✅ Reasoning graph compiled\")" + ] + }, + { + "cell_type": "markdown", + "id": "example-1", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Example: Video Analysis Workflow\n", + "\n", + "Run a stateful reasoning workflow for video analysis." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "video-workflow", + "metadata": {}, + "outputs": [], + "source": [ + "# Load image\n", + "image_path = \"media/man_crossing_street.png\"\n", + "image_bytes, image_format = nova_utils.load_image_as_bytes(image_path)\n", + "\n", + "initial_state = {\n", + " \"messages\": [\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\"image\": {\"format\": image_format, \"source\": {\"bytes\": image_bytes}}},\n", + " {\"text\": \"Analyze this image and provide a summary. 
Use submit_video_summary with a title describing the scene, number of key elements (3-5), and key observations.\"}\n", + " ]\n", + " }\n", + " ],\n", + " \"analysis_complete\": False,\n", + " \"final_answer\": \"\"\n", + "}\n", + "\n", + "print(\"=== Running Workflow ===\")\n", + "final_state = app.invoke(initial_state)\n", + "\n", + "print(\"\\n=== Final Answer ===\")\n", + "for message in final_state[\"messages\"]:\n", + " if message.get(\"role\") == \"assistant\":\n", + " for content in message.get(\"content\", []):\n", + " if \"toolUse\" in content:\n", + " tool_use = content[\"toolUse\"]\n", + " print(f\"Tool: {tool_use['name']}\")\n", + " print(json.dumps(tool_use[\"input\"], indent=2))" + ] + }, + { + "cell_type": "markdown", + "id": "summary", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Key Takeaways\n", + "\n", + "- **Stateful Workflows**: LangGraph maintains state across reasoning steps\n", + "- **Direct boto3 Calls**: Enable reasoning configuration with additionalModelRequestFields\n", + "- **Conditional Routing**: Route based on tool calls and outputs\n", + "- **Hybrid Approach**: Combine LangGraph structure with boto3 API control\n", + "\n", + "## Next Steps\n", + "\n", + "- Explore **06_strands_multimodal_reasoning.ipynb** for multi-agent patterns\n", + "- Experiment with different reasoning effort levels\n", + "- Build custom workflows for your use cases" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_python3", + "language": "python", + "name": "conda_python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.19" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nova-omni/getting-started/06-strands_multimodal_reasoning.ipynb b/nova-omni/getting-started/06-strands_multimodal_reasoning.ipynb new file mode 100644 index 00000000..1e8ce905 --- /dev/null +++ b/nova-omni/getting-started/06-strands_multimodal_reasoning.ipynb @@ -0,0 +1,329 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "title", + "metadata": {}, + "source": [ + "# Multi-Agent Multimodal Reasoning with Nova 2 Omni\n", + "\n", + "This notebook demonstrates multi-agent patterns for multimodal reasoning using Amazon Nova 2 Omni. 
We use direct boto3 calls with reasoning configuration and implement agent coordination patterns.\n", + "\n", + "**Key Features:**\n", + "- Multi-agent orchestration patterns\n", + "- Specialized agents for different modalities\n", + "- Direct boto3 calls with reasoning configuration\n", + "- Agent coordination and result synthesis\n", + "\n", + "**Note:** This notebook demonstrates multi-agent patterns without requiring the Strands framework.\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "id": "setup", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "imports", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import boto3\n", + "from typing import Literal\n", + "from botocore.config import Config\n", + "from pydantic import BaseModel, Field\n", + "from langchain_core.tools import tool\n", + "\n", + "import nova_utils\n", + "\n", + "MODEL_ID = \"us.amazon.nova-2-omni-v1:0\"\n", + "REGION_ID = \"us-west-2\"\n", + "\n", + "def get_bedrock_runtime():\n", + " config = Config(read_timeout=2 * 60)\n", + " return boto3.client(\n", + " service_name=\"bedrock-runtime\",\n", + " region_name=REGION_ID,\n", + " config=config\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "tools-definition", + "metadata": {}, + "source": [ + "## Define Agent Tools\n", + "\n", + "Create tools for agents to submit their findings." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "define-tools", + "metadata": {}, + "outputs": [], + "source": [ + "class SafetyAssessmentResult(BaseModel):\n", + " \"\"\"Schema for safety assessment results.\"\"\"\n", + " identified_hazards: list[str] = Field(description=\"List of hazards identified\")\n", + " risk_level: Literal[\"low\", \"medium\", \"high\", \"critical\"] = Field(description=\"Overall risk level\")\n", + " recommended_actions: list[str] = Field(description=\"Recommended safety actions\")\n", + "\n", + "@tool(args_schema=SafetyAssessmentResult)\n", + "def submit_safety_assessment(identified_hazards: list[str], risk_level: str, recommended_actions: list[str]) -> dict:\n", + " \"\"\"Submit safety assessment results.\"\"\"\n", + " return {\n", + " \"agent\": \"safety_analyzer\",\n", + " \"hazards\": identified_hazards,\n", + " \"risk_level\": risk_level,\n", + " \"actions\": recommended_actions\n", + " }\n", + "\n", + "class ComprehensiveReport(BaseModel):\n", + " \"\"\"Schema for comprehensive report.\"\"\"\n", + " summary: str = Field(description=\"Overall summary of findings\")\n", + " key_insights: list[str] = Field(description=\"Key insights from all agents\")\n", + " recommendations: list[str] = Field(description=\"Final recommendations\")\n", + "\n", + "@tool(args_schema=ComprehensiveReport)\n", + "def submit_comprehensive_report(summary: str, key_insights: list[str], recommendations: list[str]) -> dict:\n", + " \"\"\"Submit comprehensive report synthesizing all agent findings.\"\"\"\n", + " return {\n", + " \"agent\": \"coordinator\",\n", + " \"summary\": summary,\n", + " \"insights\": key_insights,\n", + " \"recommendations\": recommendations\n", + " }\n", + "\n", + "def langchain_tool_to_bedrock(lc_tool):\n", + " schema = lc_tool.args_schema.model_json_schema()\n", + " return {\n", + " \"toolSpec\": {\n", + " \"name\": lc_tool.name,\n", + " \"description\": lc_tool.description,\n", + " \"inputSchema\": {\"json\": schema}\n", + " }\n", + " }" + ] + }, + { + "cell_type": "markdown", + "id": "agent-class", + "metadata": {}, + 
"source": [ + "## Define Agent Class\n", + "\n", + "Create a simple agent class using direct boto3 calls." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "define-agent", + "metadata": {}, + "outputs": [], + "source": [ + "class MultimodalAgent:\n", + " \"\"\"Agent using direct boto3 calls with reasoning.\"\"\"\n", + " \n", + " def __init__(self, name: str, system_prompt: str, tools: list, reasoning_effort: str = \"medium\"):\n", + " self.name = name\n", + " self.system_prompt = system_prompt\n", + " self.bedrock_tools = [langchain_tool_to_bedrock(t) for t in tools]\n", + " self.reasoning_effort = reasoning_effort\n", + " self.bedrock = get_bedrock_runtime()\n", + " \n", + " def analyze(self, content: list) -> dict:\n", + " \"\"\"Analyze content and return results.\"\"\"\n", + " request = {\n", + " \"modelId\": MODEL_ID,\n", + " \"messages\": [\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": content\n", + " }\n", + " ],\n", + " \"system\": [{\"text\": self.system_prompt}],\n", + " \"toolConfig\": {\"tools\": self.bedrock_tools},\n", + " \"additionalModelRequestFields\": {\n", + " \"reasoningConfig\": {\n", + " \"type\": \"enabled\",\n", + " \"maxReasoningEffort\": self.reasoning_effort\n", + " }\n", + " }\n", + " }\n", + " \n", + " response = self.bedrock.converse(**request)\n", + " return response\n", + "\n", + "safety_agent = MultimodalAgent(\n", + " name=\"SafetyAnalyzer\",\n", + " system_prompt=\"You are a safety assessment expert. Analyze images for hazards, evaluate risk levels, and recommend safety actions. Use submit_safety_assessment to report findings.\",\n", + " tools=[submit_safety_assessment],\n", + " reasoning_effort=\"high\"\n", + ")\n", + "\n", + "coordinator_agent = MultimodalAgent(\n", + " name=\"Coordinator\",\n", + " system_prompt=\"You synthesize analyses from multiple agents. Review findings and provide comprehensive report. Use submit_comprehensive_report.\",\n", + " tools=[submit_comprehensive_report],\n", + " reasoning_effort=\"high\"\n", + ")\n", + "\n", + "print(\"✅ Agents initialized\")" + ] + }, + { + "cell_type": "markdown", + "id": "orchestrator", + "metadata": {}, + "source": [ + "## Multi-Agent Orchestrator\n", + "\n", + "Coordinate multiple agents for collaborative reasoning." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "define-orchestrator", + "metadata": {}, + "outputs": [], + "source": [ + "class MultiAgentOrchestrator:\n", + " \"\"\"Orchestrates multiple agents.\"\"\"\n", + " \n", + " def __init__(self, agents: dict, coordinator: MultimodalAgent):\n", + " self.agents = agents\n", + " self.coordinator = coordinator\n", + " \n", + " def run(self, tasks: dict) -> dict:\n", + " \"\"\"Run agents and synthesize results.\"\"\"\n", + " agent_results = {}\n", + " \n", + " for agent_name, task_content in tasks.items():\n", + " if agent_name in self.agents:\n", + " print(f\"\\n=== Running {agent_name} ===\")\n", + " agent = self.agents[agent_name]\n", + " response = agent.analyze(task_content)\n", + " \n", + " for content in response[\"output\"][\"message\"][\"content\"]:\n", + " if \"toolUse\" in content:\n", + " agent_results[agent_name] = content[\"toolUse\"][\"input\"]\n", + " print(f\"Results: {json.dumps(agent_results[agent_name], indent=2)}\")\n", + " \n", + " print(\"\\n=== Running Coordinator ===\")\n", + " synthesis_prompt = f\"\"\"Synthesize these analyses:\n", + "\n", + "{json.dumps(agent_results, indent=2)}\n", + "\n", + "Provide final answer integrating all findings.\"\"\"\n", + " \n", + " coordinator_response = self.coordinator.analyze([{\"text\": synthesis_prompt}])\n", + " \n", + " for content in coordinator_response[\"output\"][\"message\"][\"content\"]:\n", + " if \"toolUse\" in content:\n", + " final_result = content[\"toolUse\"][\"input\"]\n", + " print(f\"Final Answer: {json.dumps(final_result, indent=2)}\")\n", + " return final_result\n", + " \n", + " return {\"error\": \"No final answer\"}\n", + "\n", + "orchestrator = MultiAgentOrchestrator(\n", + " agents={\"safety\": safety_agent},\n", + " coordinator=coordinator_agent\n", + ")\n", + "print(\"✅ Orchestrator initialized\")" + ] + }, + { + "cell_type": "markdown", + "id": "example", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Example: Multi-Agent Image Analysis\n", + "\n", + "Use multiple agents to analyze an image collaboratively." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "image-example", + "metadata": {}, + "outputs": [], + "source": [ + "# Load image\n", + "image_path = \"media/man_crossing_street.png\"\n", + "image_bytes, image_format = nova_utils.load_image_as_bytes(image_path)\n", + "\n", + "tasks = {\n", + " \"safety\": [\n", + " {\"image\": {\"format\": image_format, \"source\": {\"bytes\": image_bytes}}},\n", + " {\"text\": \"Analyze this image for safety risks. 
Identify all hazards, evaluate the overall risk level, and recommend appropriate safety actions.\"}\n", + " ]\n", + "}\n", + "\n", + "print(\"=== Starting Multi-Agent Analysis ===\")\n", + "result = orchestrator.run(tasks)\n", + "\n", + "print(\"\\n=== Final Synthesized Result ===\")\n", + "print(json.dumps(result, indent=2))" + ] + }, + { + "cell_type": "markdown", + "id": "summary", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Key Takeaways\n", + "\n", + "- **Multi-Agent Patterns**: Coordinate specialized agents for complex tasks\n", + "- **Direct boto3 Calls**: Enable reasoning configuration with additionalModelRequestFields\n", + "- **Agent Specialization**: Agents focus on specific modalities or tasks\n", + "- **Result Synthesis**: Coordinator agent integrates findings from all agents\n", + "\n", + "## Next Steps\n", + "\n", + "- Add more specialized agents for different modalities\n", + "- Implement voting mechanisms for consensus\n", + "- Build custom orchestration patterns" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_python3", + "language": "python", + "name": "conda_python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.19" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nova-omni/getting-started/07-financial_document_analysis.ipynb b/nova-omni/getting-started/07-financial_document_analysis.ipynb new file mode 100644 index 00000000..815dc2fd --- /dev/null +++ b/nova-omni/getting-started/07-financial_document_analysis.ipynb @@ -0,0 +1,486 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Financial Document Analysis with Amazon Nova Omni\n", + "\n", + "This notebook demonstrates Nova Omni's advanced multimodal capabilities for analyzing complex financial documents containing:\n", + "- Financial tables and metrics\n", + "- Charts and graphs\n", + "- Multi-page structured data\n", + "- Cross-referencing between text and visuals\n", + "\n", + "We'll use Amazon's Q3 2025 Earnings Release as our example document." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import boto3\n", + "import json\n", + "from pathlib import Path\n", + "\n", + "bedrock_runtime = boto3.client('bedrock-runtime', region_name='us-west-2')\n", + "model_id = \"us.amazon.nova-2-omni-v1:0\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def load_document_as_bytes(file_path):\n", + " \"\"\"Load document file as bytes\"\"\"\n", + " with open(file_path, 'rb') as f:\n", + " return f.read()\n", + "\n", + "# Load the earnings report\n", + "earnings_doc = load_document_as_bytes('media/AMZN-Q3-2025-Earnings-Release.pdf')\n", + "print(f\"Loaded document: {len(earnings_doc)} bytes\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example 1: Extract Key Financial Metrics\n", + "\n", + "Extract structured financial data from tables across multiple pages." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response = bedrock_runtime.converse(\n", + " modelId=model_id,\n", + " messages=[\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\n", + " \"document\": {\n", + " \"format\": \"pdf\",\n", + " \"name\": \"earnings_report\",\n", + " \"source\": {\"bytes\": earnings_doc}\n", + " }\n", + " },\n", + " {\n", + " \"text\": \"\"\"Extract the following key financial metrics from this earnings report:\n", + " - Net sales (Q3 2025 and Q3 2024)\n", + " - Operating income\n", + " - Net income\n", + " - Earnings per share (diluted)\n", + " - Operating cash flow\n", + " - Free cash flow\n", + " \n", + " Return as JSON with year-over-year comparison.\"\"\"\n", + " }\n", + " ]\n", + " }\n", + " ]\n", + ")\n", + "\n", + "print(response['output']['message']['content'][0]['text'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example 2: Segment-Level Revenue Analysis\n", + "\n", + "Extract and analyze revenue breakdown by business segment (AWS, North America, International)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response = bedrock_runtime.converse(\n", + " modelId=model_id,\n", + " messages=[\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\n", + " \"document\": {\n", + " \"format\": \"pdf\",\n", + " \"name\": \"earnings_report\",\n", + " \"source\": {\"bytes\": earnings_doc}\n", + " }\n", + " },\n", + " {\n", + " \"text\": \"\"\"Analyze the segment information:\n", + " 1. Extract revenue for each segment (North America, International, AWS)\n", + " 2. Calculate year-over-year growth rate for each segment\n", + " 3. Identify which segment had the highest growth\n", + " 4. Extract operating income by segment\n", + " \n", + " Present as a structured analysis.\"\"\"\n", + " }\n", + " ]\n", + " }\n", + " ]\n", + ")\n", + "\n", + "print(response['output']['message']['content'][0]['text'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example 3: Table Extraction to Structured JSON\n", + "\n", + "Extract complex financial tables and convert to machine-readable JSON format." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response = bedrock_runtime.converse(\n", + " modelId=model_id,\n", + " messages=[\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\n", + " \"document\": {\n", + " \"format\": \"pdf\",\n", + " \"name\": \"earnings_report\",\n", + " \"source\": {\"bytes\": earnings_doc}\n", + " }\n", + " },\n", + " {\n", + " \"text\": \"\"\"Find the consolidated statements of operations table and convert it to JSON format.\n", + " Include all line items with Q3 2025 and Q3 2024 values.\n", + " Preserve the hierarchical structure (revenue, costs, operating income, etc.).\"\"\"\n", + " }\n", + " ]\n", + " }\n", + " ]\n", + ")\n", + "\n", + "print(response['output']['message']['content'][0]['text'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example 4: Multi-Page Context Understanding\n", + "\n", + "Answer questions requiring information from multiple pages and cross-referencing." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response = bedrock_runtime.converse(\n", + " modelId=model_id,\n", + " messages=[\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\n", + " \"document\": {\n", + " \"format\": \"pdf\",\n", + " \"name\": \"earnings_report\",\n", + " \"source\": {\"bytes\": earnings_doc}\n", + " }\n", + " },\n", + " {\n", + " \"text\": \"\"\"Provide a comprehensive analysis:\n", + " 1. What was Amazon's total revenue growth in Q3 2025?\n", + " 2. Which geographic segment contributed most to this growth?\n", + " 3. How did AWS performance compare to the overall company?\n", + " 4. What were the key drivers mentioned in the report?\n", + " 5. Compare operating margin improvements across segments.\"\"\"\n", + " }\n", + " ]\n", + " }\n", + " ]\n", + ")\n", + "\n", + "print(response['output']['message']['content'][0]['text'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example 5: Chart and Graph Understanding\n", + "\n", + "If the document contains charts or graphs, Nova Omni can interpret visual data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response = bedrock_runtime.converse(\n", + " modelId=model_id,\n", + " messages=[\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\n", + " \"document\": {\n", + " \"format\": \"pdf\",\n", + " \"name\": \"earnings_report\",\n", + " \"source\": {\"bytes\": earnings_doc}\n", + " }\n", + " },\n", + " {\n", + " \"text\": \"\"\"Identify any charts, graphs, or visual elements in this document.\n", + " For each visual element found:\n", + " 1. Describe what it shows\n", + " 2. Extract key data points\n", + " 3. Explain the trend or insight it conveys\"\"\"\n", + " }\n", + " ]\n", + " }\n", + " ]\n", + ")\n", + "\n", + "print(response['output']['message']['content'][0]['text'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example 6: Financial Ratio Calculation\n", + "\n", + "Use extracted data to calculate financial ratios and metrics." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response = bedrock_runtime.converse(\n", + " modelId=model_id,\n", + " messages=[\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\n", + " \"document\": {\n", + " \"format\": \"pdf\",\n", + " \"name\": \"earnings_report\",\n", + " \"source\": {\"bytes\": earnings_doc}\n", + " }\n", + " },\n", + " {\n", + " \"text\": \"\"\"Calculate the following financial metrics:\n", + " 1. Operating margin (%) for Q3 2025 vs Q3 2024\n", + " 2. Net profit margin (%)\n", + " 3. Revenue growth rate (%)\n", + " 4. AWS operating margin\n", + " 5. Free cash flow margin\n", + " \n", + " Show calculations and year-over-year changes.\"\"\"\n", + " }\n", + " ]\n", + " }\n", + " ]\n", + ")\n", + "\n", + "print(response['output']['message']['content'][0]['text'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example 7: Comparative Analysis with Structured Output\n", + "\n", + "Generate a structured comparison report in JSON format." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response = bedrock_runtime.converse(\n", + " modelId=model_id,\n", + " messages=[\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\n", + " \"document\": {\n", + " \"format\": \"pdf\",\n", + " \"name\": \"earnings_report\",\n", + " \"source\": {\"bytes\": earnings_doc}\n", + " }\n", + " },\n", + " {\n", + " \"text\": \"\"\"Create a JSON report with this structure:\n", + " {\n", + " \"company\": \"Amazon\",\n", + " \"period\": \"Q3 2025\",\n", + " \"financial_highlights\": {\n", + " \"revenue\": {\"current\": 0, \"prior\": 0, \"growth_pct\": 0},\n", + " \"operating_income\": {\"current\": 0, \"prior\": 0, \"growth_pct\": 0},\n", + " \"net_income\": {\"current\": 0, \"prior\": 0, \"growth_pct\": 0}\n", + " },\n", + " \"segments\": [\n", + " {\"name\": \"\", \"revenue\": 0, \"growth_pct\": 0, \"operating_income\": 0}\n", + " ],\n", + " \"key_metrics\": {\n", + " \"operating_margin_pct\": 0,\n", + " \"free_cash_flow\": 0\n", + " }\n", + " }\n", + " \n", + " Fill in all values from the earnings report.\"\"\"\n", + " }\n", + " ]\n", + " }\n", + " ]\n", + ")\n", + "\n", + "result = response['output']['message']['content'][0]['text']\n", + "print(result)\n", + "\n", + "# Try to parse as JSON\n", + "try:\n", + " # Extract JSON from markdown code blocks if present\n", + " if '```json' in result:\n", + " json_str = result.split('```json')[1].split('```')[0].strip()\n", + " elif '```' in result:\n", + " json_str = result.split('```')[1].split('```')[0].strip()\n", + " else:\n", + " json_str = result\n", + " \n", + " parsed = json.loads(json_str)\n", + " print(\"\\n✓ Successfully parsed as JSON\")\n", + " print(json.dumps(parsed, indent=2))\n", + "except:\n", + " print(\"\\nNote: Response contains structured data but may need formatting\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example 8: Executive Summary Generation\n", + "\n", + "Generate a concise executive summary from the full earnings report." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response = bedrock_runtime.converse(\n", + " modelId=model_id,\n", + " messages=[\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\n", + " \"document\": {\n", + " \"format\": \"pdf\",\n", + " \"name\": \"earnings_report\",\n", + " \"source\": {\"bytes\": earnings_doc}\n", + " }\n", + " },\n", + " {\n", + " \"text\": \"\"\"Create a concise executive summary (5-7 bullet points) highlighting:\n", + " - Overall financial performance\n", + " - Key growth drivers\n", + " - Segment performance highlights\n", + " - Notable year-over-year changes\n", + " - Any forward-looking statements or guidance\"\"\"\n", + " }\n", + " ]\n", + " }\n", + " ]\n", + ")\n", + "\n", + "print(response['output']['message']['content'][0]['text'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Key Capabilities Demonstrated\n", + "\n", + "This notebook showcases Nova Omni's ability to:\n", + "\n", + "1. **Multi-page document understanding** - Process and analyze 13-page financial reports\n", + "2. **Table extraction** - Extract complex financial tables with hierarchical structure\n", + "3. **Numerical reasoning** - Calculate growth rates, margins, and financial ratios\n", + "4. **Cross-referencing** - Connect information across multiple pages and sections\n", + "5. 
**Structured output** - Generate JSON-formatted data from unstructured documents\n", + "6. **Visual understanding** - Interpret charts, graphs, and visual elements (if present)\n", + "7. **Contextual analysis** - Provide insights beyond raw data extraction\n", + "8. **Summarization** - Distill key information into executive summaries\n", + "\n", + "These capabilities make Nova Omni ideal for:\n", + "- Financial document processing\n", + "- Automated earnings analysis\n", + "- Investment research\n", + "- Regulatory compliance\n", + "- Business intelligence" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_python3", + "language": "python", + "name": "conda_python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.19" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/nova-omni/getting-started/README.md b/nova-omni/getting-started/README.md new file mode 100644 index 00000000..1f6a29e8 --- /dev/null +++ b/nova-omni/getting-started/README.md @@ -0,0 +1,185 @@ +## Amazon Nova 2 Omni Model + +Welcome to the Amazon Nova 2 Omni Model Getting Started! + +### Model Overview + +Amazon Nova 2 Omni is a multimodal foundation model that can understand and generate content across text, images, and audio. This model excels at: + +- **Speech Understanding**: Transcribe, summarize, analyze, and answer questions about audio content +- **Image Generation**: Create high-quality images from text descriptions +- **Multimodal Reasoning**: Process and understand multiple input modalities simultaneously + +| Model Characteristics | Amazon Nova 2 Omni | +| --------------------- | ------------------- | +| Model ID | us.amazon.nova-2-omni-v1:0 | +| Input Modalities | Text, Audio, Image | +| Output Modalities | Text, Image | +| Context Window | 1M tokens | +| Max Output Tokens | 10k | +| Supported Audio Formats | mp3, opus, wav, aac, flac, m4a, ogg, mka | +| Regions | us-east-1 | +| Converse API | Yes | +| InvokeAPI | Yes | +| Streaming | Yes | +| Batch Inference | Yes | + +This is a collection of Jupyter Notebooks that will help you explore the capabilities and syntax of the Amazon Nova 2 Omni model. There are just a few setup steps you need to follow before using the sample code provided in these notebooks. + +## Prerequisites + +Ensure you have met the following requirements before continuing: + +- Python 3.12+ is installed +- [AWS CLI](https://aws.amazon.com/cli/) is installed +- AWS CLI is [configured with IAM credentials](https://docs.aws.amazon.com/cli/v1/userguide/cli-chap-configure.html) + +## Enable the Nova 2 Omni Model in the Amazon Bedrock Console + +Before you can make API requests to the Nova 2 Omni model, you need to enable the model in your account using the Amazon Bedrock console. Follow [the instructions here](https://docs.aws.amazon.com/bedrock/latest/userguide/model-access-modify.html) to enable the model called "Amazon > Nova 2 Omni". + +## Configure IAM Permissions + +Ensure the IAM role you are using has been given the following permissions: + +* bedrock:InvokeModel + +## Install Dependencies + +We recommend using a Python virtual environment when running this code. Follow these steps to create a virtual environment. + +1. 
Navigate to the folder: + +```bash +cd path/to/nova-omni/getting-started +``` + +2. Create the virtual environment: + +```bash +python -m venv .venv +``` + +3. Activate the virtual environment: + +- On Windows: + +```bash +.venv\Scripts\activate +``` + +- On macOS/Linux: + +```bash +source .venv/bin/activate +``` + +4. Install the dependencies. Note: this installs the specific boto3/botocore versions required to support the new Nova 2 Omni model capabilities. + +```bash +pip install -r requirements.txt +``` + +## Running the Notebooks + +Jupyter Notebooks can be run in a number of ways. Choose the method you prefer from the following options. + +### Microsoft VS Code + +[Microsoft VS Code](https://code.visualstudio.com/) has great support for Jupyter Notebooks with a very user-friendly UI. Just make sure the ["Jupyter" extension](https://marketplace.visualstudio.com/items?itemName=ms-toolsai.jupyter) is installed. After launching VS Code, choose **"Open Folder..."** and open this *"nova-omni/getting-started"* folder. + +### From the Command Line + +The setup steps above installed the command-line version of the Jupyter Notebook server. To use this option, run the following from the command line: + +```bash +cd path/to/nova-omni/getting-started +``` +```bash +source .venv/bin/activate +``` +```bash +jupyter notebook +``` + +This will automatically open a browser-based UI that you can use to run the notebooks. + +### Use Amazon SageMaker Notebooks + +[Amazon SageMaker Notebooks](https://aws.amazon.com/sagemaker/ai/notebooks/) offers a way to run Jupyter Notebooks in the cloud. If you choose this option, you will need to edit the permissions of the SageMaker IAM role to allow it access to Bedrock, as described in the [Configure IAM Permissions](#configure-iam-permissions) section.
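+ +For reference, a minimal identity-based policy statement granting this permission might look like the following. This is an illustrative example only; in practice, scope the "Resource" element to the specific model ARNs and Regions you use rather than leaving it as "*": + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": "bedrock:InvokeModel", + "Resource": "*" + } + ] +} +```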
+ +## Notebook Overview + +### 00 - Setup +Verify your environment setup and test connection to Amazon Bedrock: +- Check Python version and dependencies +- Validate AWS credentials and configuration +- Test Bedrock connection and model access + +### 01 - Speech Understanding Examples +Explore Nova 2 Omni's speech understanding capabilities including: +- Audio transcription with and without speaker diarization +- Audio summarization +- Call analytics with structured output +- Question answering about audio content + +### 02 - Image Generation Examples +Learn how to use Nova 2 Omni for image generation tasks including: +- Text-to-image generation +- Image editing and manipulation +- Style transfer and artistic effects + +### 03 - Multimodal Understanding Examples +Discover how to analyze images, videos, and audio: +- Image question answering and classification +- Video content analysis and classification +- Using system prompts vs user prompts +- Audio content understanding + +### 04 - LangChain Multimodal Reasoning +Integrate Nova 2 Omni with LangChain for structured reasoning: +- Tool use with Pydantic schemas +- Reasoning effort configuration (low, medium, high) +- MMMU-style evaluation with submit_answer pattern +- Multimodal input processing with structured outputs + +### 05 - LangGraph Multimodal Reasoning +Build stateful reasoning workflows with LangGraph: +- Stateful workflow management with checkpoints +- Multi-step reasoning chains +- Conditional routing based on tool calls +- MMMU evaluation graphs with reasoning traces + +### 06 - Strands Multimodal Reasoning +Orchestrate multi-agent systems for collaborative reasoning: +- Specialized agents for different modalities +- Multi-agent orchestration and coordination +- Collaborative reasoning with result synthesis +- MMMU-style multi-agent evaluation patterns + +### 07 - Document Understanding Examples +Process and analyze document content with Nova 2 Omni: +- OCR (Optical Character Recognition) from PDF documents +- Key Information Extraction (KIE) with structured JSON output +- Object detection and counting in images +- Document format support for PDF processing + +### 08 - Financial Document Analysis +Showcase advanced multimodal capabilities with complex financial documents: +- Multi-page document understanding (13-page earnings reports) +- Financial table extraction and conversion to JSON +- Segment-level revenue analysis and calculations +- Chart and graph interpretation +- Cross-referencing information across multiple pages +- Financial ratio calculations and comparative analysis +- Executive summary generation from full reports + +## Key Features Demonstrated + +- **Speech Transcription**: Convert audio to text with optional speaker identification +- **Audio Analysis**: Extract insights, sentiment, and structured data from audio +- **Image Generation**: Create images from text descriptions +- **Multimodal Understanding**: Process multiple input types simultaneously +- **Reasoning Capabilities**: Enable advanced reasoning for complex tasks +- **Document Processing**: Extract text and data from PDF documents +- **Financial Analysis**: Analyze complex financial reports with tables and charts diff --git a/nova-omni/getting-started/media/AMZN-Q3-2025-Earnings-Release.pdf b/nova-omni/getting-started/media/AMZN-Q3-2025-Earnings-Release.pdf new file mode 100644 index 00000000..8f64316b Binary files /dev/null and b/nova-omni/getting-started/media/AMZN-Q3-2025-Earnings-Release.pdf differ diff --git 
a/nova-omni/getting-started/media/Cheesecake.mp4 b/nova-omni/getting-started/media/Cheesecake.mp4 new file mode 100644 index 00000000..5f65fa78 Binary files /dev/null and b/nova-omni/getting-started/media/Cheesecake.mp4 differ diff --git a/nova-omni/getting-started/media/call_1763087723216.wav b/nova-omni/getting-started/media/call_1763087723216.wav new file mode 100644 index 00000000..95880eb7 Binary files /dev/null and b/nova-omni/getting-started/media/call_1763087723216.wav differ diff --git a/nova-omni/getting-started/media/man_crossing_street.png b/nova-omni/getting-started/media/man_crossing_street.png new file mode 100644 index 00000000..4a033d57 Binary files /dev/null and b/nova-omni/getting-started/media/man_crossing_street.png differ diff --git a/nova-omni/getting-started/media/man_in_tshirt.jpg b/nova-omni/getting-started/media/man_in_tshirt.jpg new file mode 100644 index 00000000..0d956d9a Binary files /dev/null and b/nova-omni/getting-started/media/man_in_tshirt.jpg differ diff --git a/nova-omni/getting-started/media/output-dog.png b/nova-omni/getting-started/media/output-dog.png new file mode 100644 index 00000000..be70b4d9 Binary files /dev/null and b/nova-omni/getting-started/media/output-dog.png differ diff --git a/nova-omni/getting-started/nova_notebook_utils.py b/nova-omni/getting-started/nova_notebook_utils.py new file mode 100644 index 00000000..dffdb6bb --- /dev/null +++ b/nova-omni/getting-started/nova_notebook_utils.py @@ -0,0 +1,46 @@ +import json + +from IPython.display import Image, Markdown, display + +import nova_utils + + +def _render_text_content(text_node, as_markdown=True): + text = text_node["text"] + if len(text) == 0: + text = f"WARNING: Found unexpected empty text content:\n{json.dumps(text_node, indent=2)}\n\n" + + if as_markdown: + display(Markdown(text)) + else: + print(text) + + +def _render_reasoning_content(reasoning_node): + reasoning_text = reasoning_node["reasoningContent"]["reasoningText"]["text"] + print(reasoning_text) + + +def _render_image_content(image_node): + image_bytes = image_node["image"]["source"]["bytes"] + display(Image(data=image_bytes)) + + +def render_content(content_list): + for node_index, node in enumerate(content_list): + if "text" in node: + _render_text_content(node) + elif "image" in node: + _render_image_content(node) + elif "reasoningContent" in node: + _render_reasoning_content(node) + else: + raise ValueError(f"Unknown node type: {node}") + + +def render_response(response): + request_id = nova_utils.extract_response_request_id(response) + print(f"Request ID: {request_id}") + + content_list = response["output"]["message"]["content"] + render_content(content_list) diff --git a/nova-omni/getting-started/nova_utils.py b/nova-omni/getting-started/nova_utils.py new file mode 100644 index 00000000..916f1ee6 --- /dev/null +++ b/nova-omni/getting-started/nova_utils.py @@ -0,0 +1,240 @@ +import json +import os +from pathlib import Path + + +def converse(bedrock_runtime, request, output_dir=None): + try: + response = bedrock_runtime.converse(**request) + + if output_dir: + save_request_and_response(request, response, output_folder=output_dir) + + return response + + except Exception as err: + if output_dir: + save_request_and_response( + request=request, exception=err, output_folder=output_dir + ) + raise err + + +def extract_response_text(response): + content_list = response["output"]["message"]["content"] + text_block = next((item for item in content_list if "text" in item), None) + + if text_block is None: + return None + + 
text = text_block["text"] + return text + + +def extract_response_request_id(response): + request_id = response["ResponseMetadata"]["RequestId"] + return request_id + + +def extract_response_reasoning(response): + content_list = response["output"]["message"]["content"] + reasoning_block = next( + (item for item in content_list if "reasoningContent" in item), None + ) + + if reasoning_block is None: + return None + + text = reasoning_block["reasoningContent"]["reasoningText"]["text"] + return text + + +def extract_response_image(response): + content_list = response["output"]["message"]["content"] + image_block = next((item for item in content_list if "image" in item), None) + + if image_block is None: + return None + + image_bytes = image_block["image"]["source"]["bytes"] + return image_bytes + + +def load_audio_as_bytes(audio_path): + """ + Load audio from disk as a byte array + """ + with open(audio_path, "rb") as f: + result = f.read() + + return result + + +def load_image_as_bytes(image_path): + """ + Load image from disk as a byte array + """ + with open(image_path, "rb") as f: + result = f.read(), get_image_format(image_path) + + return result + + +def get_image_format(image_path): + """ + Load image bytes from disk + """ + # Determine the image format based on the file extension + image_format = os.path.splitext(image_path)[1].lower().replace(".", "").lower() + if image_format == "jpg": + image_format = "jpeg" + + return image_format + + +def create_output_folder(parent_folder_path="output", folder_suffix=None): + """ + Creates an output folder whose name is in the format "YYYY-MM-DD_HH-MM-SS + """ + import datetime + import os + + # Get the current date and time + current_datetime = datetime.datetime.now() + + # Format the date and time as a string + formatted_datetime = current_datetime.strftime("%Y-%m-%d_%H-%M-%S") + + folder_name = ( + f"{formatted_datetime}" + if folder_suffix is None + else f"{formatted_datetime}-{folder_suffix}" + ) + + # Create the full path for the new folder + new_folder_path = os.path.join(parent_folder_path, folder_name) + + # Create the new folder + os.makedirs(new_folder_path, exist_ok=True) + + # Absolute path to output folder + abs_path = Path(new_folder_path).resolve() + + return abs_path + + +def save_dict_as_json(dict, folder_path, base_filename): + """ + Saves the dict to disk formatted as JSON + """ + import os + + # Create the full path for the request file + request_file_path = os.path.join(folder_path, f"{base_filename}.json") + + serializable_dict = make_serializable(dict) + + # Write the request to the file as JSON + with open(request_file_path, "w") as f: + json.dump(serializable_dict, f, indent=4) + + +def save_text(text, folder_path, base_filename, file_extension="txt"): + """ + Saves the text to disk + """ + import os + + # Create the full path for the request file + request_file_path = os.path.join(folder_path, f"{base_filename}.{file_extension}") + + # Write the request to the file as JSON + with open(request_file_path, "w") as f: + f.write(text) + + +def save_request_and_response( + request, response=None, exception=None, output_folder="output" +): + output_folder = create_output_folder(parent_folder_path=output_folder) + save_dict_as_json(request, output_folder, "request") + + if exception is not None: + if hasattr(exception, "response"): + save_dict_as_json(exception.response, output_folder, "exception") + else: + save_text(str(exception), output_folder, "exception", "txt") + return + + if response is None: + return + + 
save_dict_as_json(response, output_folder, "response") + + if reasoning_text := extract_response_reasoning(response): + save_text(reasoning_text, output_folder, "output-reasoning") + + if text := extract_response_text(response): + save_text(text, output_folder, "output-text", "md") + + # Save any images that were generated. + content_list = response["output"]["message"]["content"] + image_num = 0 + for node in content_list: + if "image" in node: + image_num += 1 + image_bytes = node["image"]["source"]["bytes"] + image_path = Path(output_folder) / f"output-image-{image_num}.png" + with open(image_path, "wb") as f: + f.write(image_bytes) + + print(f"""Outputs saved to {output_folder}""") + + return output_folder + + +def make_serializable(obj, omit_byte_data=False): + """ + Convert objects to JSON-serializable format. + + Args: + obj: Object to make serializable + + Returns: + JSON-serializable representation of the object + + Handles binary data, complex objects, and other non-serializable types + with appropriate fallback representations. + """ + if isinstance(obj, dict): + return { + key: make_serializable(value, omit_byte_data) for key, value in obj.items() + } + elif isinstance(obj, list): + return [make_serializable(item, omit_byte_data) for item in obj] + elif isinstance(obj, bytes): + # Convert bytes to base64 string with metadata + import base64 + + if not omit_byte_data: + return { + "bytes": "", + "_type": "bytes", + "_length": len(obj), + "_data_as_base64": base64.b64encode(obj).decode("utf-8"), + } + else: + return { + "bytes": "", + } + elif hasattr(obj, "__dict__"): + # Handle custom objects by converting to dict + return { + "_type": type(obj).__name__, + "_data": make_serializable(obj.__dict__, omit_byte_data), + } + elif not isinstance(obj, (str, int, float, bool, type(None))): + # Fallback for other non-serializable types + return {"_type": type(obj).__name__, "_repr": str(obj)} + else: + return obj diff --git a/nova-omni/getting-started/requirements.txt b/nova-omni/getting-started/requirements.txt new file mode 100644 index 00000000..29b3c617 --- /dev/null +++ b/nova-omni/getting-started/requirements.txt @@ -0,0 +1,10 @@ +boto3==1.42.4 +botocore==1.42.4 +requests +ipykernel +jupyter +langchain>=0.1.0 +langchain-aws>=0.1.0 +langgraph>=0.0.20 +strands-framework>=0.1.0 +pydantic>=2.0.0 \ No newline at end of file