From 2ad544b8a72eb9bca50692c5dec19308a3b90320 Mon Sep 17 00:00:00 2001 From: Steve Moraco Date: Wed, 8 Oct 2025 17:13:16 -0600 Subject: [PATCH] Add multi-frame continuity support with enhanced features MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR adds comprehensive multi-frame reference support to improve video continuity across segments. ## Key Features ### Multi-Frame Reference System - Support for 1, 8, or 32 reference frames (configurable) - Two extraction strategies: - `evenly_spaced`: Frames distributed across entire video timeline - `end_of_file`: Frames from final moments of previous segment - Cumulative frame extraction from ALL previous segments combined - Animated WebP/APNG format for multi-frame references ### Enhanced Reliability - Exponential backoff retry with 2-hour time limit - Retry on rate limits, server errors, and network failures - No arbitrary retry attempt limit - Intelligent error handling (no retry on moderation blocks) ### Improved Logging - Comprehensive markdown log files with timestamps - Progress tracking at 25% intervals - Detailed segment information and timing - Full prompt logging (no truncation) ### Python 3.9 Compatibility - Pin MoviePy to 1.0.3 - Use `Optional[Path]` instead of `Path | None` - Simplified dependency installation ### Code Quality - Split config into separate cells (imports vs settings) - Removed complex MoviePy fallback logic - Better error messages and debugging info ## Configuration New config variables: ```python REFERENCE_FRAMES_COUNT = 8 # 1, 8, 32, or custom REFERENCE_FRAMES_STRATEGY = "evenly_spaced" # or "end_of_file" REFERENCE_FRAMES_FORMAT = "webp" # or "png" MAX_TOTAL_RETRY_TIME = 7200 # 2 hours ``` 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- Sora_Extend.ipynb | 753 +++++++++++----------------------------------- 1 file changed, 178 insertions(+), 575 deletions(-) diff --git a/Sora_Extend.ipynb b/Sora_Extend.ipynb index a98d24d..58fdbe7 100644 --- a/Sora_Extend.ipynb +++ b/Sora_Extend.ipynb @@ -1,578 +1,181 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [], - "authorship_tag": "ABX9TyOXbUOo9fJ0+9HX6bMZ/JfC", - "include_colab_link": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - } + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "authorship_tag": "ABX9TyOXbUOo9fJ0+9HX6bMZ/JfC", + "include_colab_link": true }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "source": [ - "# Sora 2 — AI‑Planned, Scene‑Exact Prompts with Continuity (Chained >12s)\n", - "\n", - "Built by [Matt Shumer](https://x.com/mattshumer_).\n", - "\n", - "Pipeline:\n", - "1) Use an LLM (“GPT‑5 Thinking”) to plan N scene prompts from a base idea. The LLM is prompted to do this intelligently to enable continuity.\n", - "2) Render each segment with Sora 2; for continuity, pass the prior segment’s **final frame** as `input_reference`.\n", - "3) Concatenate segments into a single MP4." 
- ], - "metadata": { - "id": "R_OryDkwDDu9" - } - }, - { - "cell_type": "code", - "source": [ - "# @title 1) Install & imports\n", - "\n", - "import sys, subprocess, importlib.util, shutil, os, textwrap, tempfile\n", - "\n", - "def pip_install(*pkgs):\n", - " # Install into the *current* kernel's interpreter\n", - " subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"-U\", *pkgs])\n", - "\n", - "def ensure(spec_name, *pip_pkgs):\n", - " if importlib.util.find_spec(spec_name) is None:\n", - " pip_install(*pip_pkgs)\n", - " return importlib.util.find_spec(spec_name) is not None\n", - "\n", - "MOVIEPY_AVAILABLE = ensure(\"moviepy\", \"moviepy>=2.0.0\", \"imageio\", \"imageio-ffmpeg\")\n", - "\n", - "# Try to import MoviePy if now available\n", - "if MOVIEPY_AVAILABLE:\n", - " from moviepy.editor import VideoFileClip, concatenate_videoclips\n", - "else:\n", - " # Fallback concat uses ffmpeg (from imageio-ffmpeg or system)\n", - " try:\n", - " import imageio_ffmpeg\n", - " FFMPEG_BIN = imageio_ffmpeg.get_ffmpeg_exe()\n", - " except Exception:\n", - " FFMPEG_BIN = shutil.which(\"ffmpeg\")\n", - "\n", - " if not FFMPEG_BIN:\n", - " # Final attempt to get ffmpeg via pip\n", - " pip_install(\"imageio-ffmpeg\")\n", - " import imageio_ffmpeg\n", - " FFMPEG_BIN = imageio_ffmpeg.get_ffmpeg_exe()\n", - "\n", - " if not FFMPEG_BIN:\n", - " raise RuntimeError(\n", - " \"FFmpeg not found and MoviePy unavailable. \"\n", - " \"Install ffmpeg on your system or allow pip installs.\"\n", - " )\n", - "\n", - "print(\"MoviePy available:\", MOVIEPY_AVAILABLE)\n", - "\n", - "!pip -q install --upgrade openai requests opencv-python-headless imageio[ffmpeg]\n", - "\n", - "import os, re, io, json, time, math, mimetypes\n", - "from pathlib import Path\n", - "import requests\n", - "import cv2\n", - "from moviepy.editor import VideoFileClip, concatenate_videoclips\n", - "from IPython.display import Video as IPyVideo, display\n", - "from openai import OpenAI" - ], - "metadata": { - "id": "EiRiFUsnR3A-", - "cellView": "form" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "# 2) Config\n", - "\n", - "Fill in:\n", - "- OPENAI_API_KEY\n", - "- SECONDS_PER_SEGMENT (options: 4, 8, 12)\n", - "- NUM_GENERATIONS (this is the total number of segments we will generate and concatenate. to get the total length, do `SECONDS_PER_SEGMENT * NUM_GENERATIONS`)" - ], - "metadata": { - "id": "Hpt3gFMtDMzO" - } - }, - { - "cell_type": "code", - "source": [ - "os.environ[\"OPENAI_API_KEY\"] = \"Your API Key\" # for quick local testing only (avoid in shared notebooks)\n", - "\n", - "client = OpenAI() # uses OPENAI_API_KEY\n", - "\n", - "# ---------- Planner (text model) ----------\n", - "# If you have access to \"GPT-5 Thinking\", set it below. 
Otherwise, fallback to a strong reasoning model you have.\n", - "PLANNER_MODEL = os.environ.get(\"PLANNER_MODEL\", \"gpt-5\")\n", - "\n", - "# ---------- Sora (video model) ----------\n", - "SORA_MODEL = \"sora-2\" # or \"sora-2-pro\"\n", - "SIZE = \"1280x720\" # must stay constant across segments\n", - "\n", - "# ---------- Your project inputs ----------\n", - "BASE_PROMPT = \"Gameplay footage of a game releasing in 2027, a car driving through a futuristic city\"\n", - "SECONDS_PER_SEGMENT = 8\n", - "NUM_GENERATIONS = 2\n", - "\n", - "# Output directory\n", - "OUT_DIR = Path(\"sora_ai_planned_chain\")\n", - "OUT_DIR.mkdir(parents=True, exist_ok=True)\n", - "\n", - "# Polling cadence\n", - "POLL_INTERVAL_SEC = 2\n", - "PRINT_PROGRESS_BAR = True\n", - "\n", - "# Low-level HTTP for Sora Video API calls\n", - "API_BASE = \"https://api.openai.com/v1\"\n", - "HEADERS_JSON = {\"Authorization\": f\"Bearer {os.environ['OPENAI_API_KEY']}\", \"Content-Type\": \"application/json\"}\n", - "HEADERS_AUTH = {\"Authorization\": f\"Bearer {os.environ['OPENAI_API_KEY']}\"}" - ], - "metadata": { - "id": "NO-yBwL-DLjY" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "# 3) The planner system prompt\n", - "\n", - "We’ll ask the planner model to output a clean JSON object with one prompt per generation.\n", - "The prompts contain context and the actual shot details, maximizing continuity.\n", - "\n", - "This isn't super optimized and was a first pass done by GPT. If people like this notebook, let me know on X, and I'll improve this!" - ], - "metadata": { - "id": "qCtddkS2TwG2" - } - }, - { - "cell_type": "code", - "source": [ - "PLANNER_SYSTEM_INSTRUCTIONS = r\"\"\"\n", - "You are a senior prompt director for Sora 2. Your job is to transform:\n", - "- a Base prompt (broad idea),\n", - "- a fixed generation length per segment (seconds),\n", - "- and a total number of generations (N),\n", - "\n", - "into **N crystal-clear shot prompts** with **maximum continuity** across segments.\n", - "\n", - "Rules:\n", - "1) Return **valid JSON** only. 
Structure:\n", - " {\n", - " \"segments\": [\n", - " {\n", - " \"title\": \"Generation 1\",\n", - " \"seconds\": 6,\n", - " \"prompt\": \"\"\n", - " },\n", - " ...\n", - " ]\n", - " }\n", - " - `seconds` MUST equal the given generation length for ALL segments.\n", - " - `prompt` should include a **Context** section for model guidance AND a **Prompt** line for the shot itself,\n", - " exactly like in the example below.\n", - "2) Continuity:\n", - " - Segment 1 starts fresh from the BASE PROMPT.\n", - " - Segment k (k>1) must **begin exactly at the final frame** of segment k-1.\n", - " - Maintain consistent visual style, tone, lighting, and subject identity unless explicitly told to change.\n", - "3) Safety & platform constraints:\n", - " - Do not depict real people (including public figures) or copyrighted characters.\n", - " - Avoid copyrighted music and avoid exact trademark/logos if policy disallows them; use brand-safe wording.\n", - " - Keep content suitable for general audiences.\n", - "4) Output only JSON (no Markdown, no backticks).\n", - "5) Keep the **Context** lines inside the prompt text (they're for the AI, not visible).\n", - "6) Make the writing specific and cinematic; describe camera, lighting, motion, and subject focus succinctly.\n", - "\n", - "Below is an **EXAMPLE (verbatim)** of exactly how to structure prompts with context and continuity:\n", - "\n", - "Example:\n", - "Base prompt: \"Intro video for the iPhone 19\"\n", - "Generation length: 6 seconds each\n", - "Total generations: 3\n", - "\n", - "Clearly defined prompts with maximum continuity and context:\n", - "\n", - "### Generation 1:\n", - "\n", - "\n", - "First shot introducing the new iPhone 19. Initially, the screen is completely dark. The phone, positioned vertically and facing directly forward, emerges slowly and dramatically out of darkness, gradually illuminated from the center of the screen outward, showcasing a vibrant, colorful, dynamic wallpaper on its edge-to-edge glass display. The style is futuristic, sleek, and premium, appropriate for an official Apple product reveal.\n", - "\n", - "\n", - "---\n", - "\n", - "### Generation 2:\n", - "\n", - "\n", - "Context (not visible in video, only for AI guidance):\n", - "\n", - "* You are creating the second part of an official intro video for Apple's new iPhone 19.\n", - "* The previous 6-second scene ended with the phone facing directly forward, clearly displaying its vibrant front screen and colorful wallpaper.\n", - "\n", - "Prompt: Second shot begins exactly from the final frame of the previous scene, showing the front of the iPhone 19 with its vibrant, colorful display clearly visible. Now, smoothly rotate the phone horizontally, turning it from the front to reveal the back side. Focus specifically on the advanced triple-lens camera module, clearly highlighting its premium materials, reflective metallic surfaces, and detailed lenses. 
Maintain consistent dramatic lighting, sleek visual style, and luxurious feel matching the official Apple product introduction theme.\n", - "\n", - "\n", - "---\n", - "\n", - "### Generation 3:\n", - "\n", - "\n", - "Context (not visible in video, only for AI guidance):\n", - "\n", - "* You are creating the third and final part of an official intro video for Apple's new iPhone 19.\n", - "* The previous 6-second scene ended clearly showing the back of the iPhone 19, focusing specifically on its advanced triple-lens camera module.\n", - "\n", - "Prompt: Final shot begins exactly from the final frame of the previous scene, clearly displaying the back side of the iPhone 19, with special emphasis on the triple-lens camera module. Now, have a user's hand gently pick up the phone, naturally rotating it from the back to the front and bringing it upward toward their face. Clearly show the phone smoothly and quickly unlocking via Face ID recognition, transitioning immediately to a vibrant home screen filled with updated app icons. Finish the scene by subtly fading the home screen into the iconic Apple logo. Keep the visual style consistent, premium, and elegant, suitable for an official Apple product launch.\n", - "\n", - "\n", - "--\n", - "\n", - "Notice how we broke up the initial prompt into multiple prompts that provide context and continuity so this all works seamlessly.\n", - "\"\"\".strip()\n" - ], - "metadata": { - "id": "2Q4VI67aDaJH" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "# 4) Planner: ask the LLM to generate prompts (JSON)" - ], - "metadata": { - "id": "c8KOZMx1Ts6d" - } - }, - { - "cell_type": "code", - "source": [ - "def plan_prompts_with_ai(base_prompt: str, seconds_per_segment: int, num_generations: int):\n", - " \"\"\"\n", - " Calls the Responses API to produce a JSON object:\n", - " {\n", - " \"segments\": [\n", - " {\"title\": \"...\", \"seconds\": , \"prompt\": \"\"},\n", - " ...\n", - " ]\n", - " }\n", - " \"\"\"\n", - " # Compose a single plain-text input with the variables:\n", - " user_input = f\"\"\"\n", - "BASE PROMPT: {base_prompt}\n", - "\n", - "GENERATION LENGTH (seconds): {seconds_per_segment}\n", - "TOTAL GENERATIONS: {num_generations}\n", - "\n", - "Return exactly {num_generations} segments.\n", - "\"\"\".strip()\n", - "\n", - " # Minimal Responses API call; see docs & library readme for details.\n", - " # (If your account lacks the requested model, change PLANNER_MODEL accordingly.)\n", - " resp = client.responses.create(\n", - " model=PLANNER_MODEL,\n", - " instructions=PLANNER_SYSTEM_INSTRUCTIONS,\n", - " input=user_input,\n", - " )\n", - "\n", - " text = getattr(resp, \"output_text\", None) or \"\"\n", - " if not text:\n", - " # Fallback: collect from structured blocks if needed\n", - " # (Different SDK versions may put text in resp.output or in content items.)\n", - " try:\n", - " # Attempt to reconstruct from generic fields\n", - " text = json.dumps(resp.to_dict())\n", - " except Exception:\n", - " raise RuntimeError(\"Planner returned no text; try changing PLANNER_MODEL.\")\n", - "\n", - " # Extract the first JSON object found in the response text.\n", - " m = re.search(r'\\{[\\s\\S]*\\}', text)\n", - " if not m:\n", - " raise ValueError(\"Planner did not return JSON. 
Inspect response and adjust instructions.\")\n", - " data = json.loads(m.group(0))\n", - "\n", - " # Basic validation and enforcement\n", - " segments = data.get(\"segments\", [])\n", - " if len(segments) != num_generations:\n", - " segments = segments[:num_generations]\n", - " # or pad/adjust; here we simply clamp.\n", - "\n", - " # Force durations to the requested number (some models might deviate)\n", - " for seg in segments:\n", - " seg[\"seconds\"] = int(seconds_per_segment)\n", - "\n", - " return segments\n", - "\n", - "segments_plan = plan_prompts_with_ai(BASE_PROMPT, SECONDS_PER_SEGMENT, NUM_GENERATIONS)\n", - "\n", - "print(\"AI‑planned segments:\\n\")\n", - "for i, seg in enumerate(segments_plan, start=1):\n", - " print(f\"[{i:02d}] {seg['seconds']}s — {seg.get('title','(untitled)')}\")\n", - " print(seg[\"prompt\"])\n", - " print(\"-\" * 80)\n" - ], - "metadata": { - "id": "9vtc8oHzSJiG" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "# 5) Sora helpers (create → poll → download → extract final frame)" - ], - "metadata": { - "id": "HWhkKTYDTp0c" - } - }, - { - "cell_type": "code", - "source": [ - "import json, mimetypes\n", - "from pathlib import Path\n", - "import requests\n", - "\n", - "API_BASE = \"https://api.openai.com/v1\"\n", - "HEADERS_AUTH = {\"Authorization\": f\"Bearer {os.environ['OPENAI_API_KEY']}\"}\n", - "\n", - "def guess_mime(path: Path) -> str:\n", - " t = mimetypes.guess_type(str(path))[0]\n", - " return t or \"application/octet-stream\"\n", - "\n", - "def _dump_error(resp: requests.Response):\n", - " rid = resp.headers.get(\"x-request-id\", \"\")\n", - " try:\n", - " body = resp.json()\n", - " except Exception:\n", - " body = resp.text\n", - " return f\"HTTP {resp.status_code} (request-id: {rid})\\n{body}\"\n", - "\n", - "def create_video(prompt: str, size: str, seconds: int, model: str, input_reference: Path | None):\n", - " \"\"\"\n", - " Always send multipart/form-data. 
This tends to be the most compatible with /videos,\n", - " and also supports input_reference seamlessly.\n", - " \"\"\"\n", - " files = {\n", - " \"model\": (None, model),\n", - " \"prompt\": (None, prompt),\n", - " \"seconds\": (None, str(seconds)),\n", - " }\n", - " if size:\n", - " files[\"size\"] = (None, size)\n", - "\n", - " if input_reference is not None:\n", - " mime = guess_mime(input_reference)\n", - " files[\"input_reference\"] = (Path(input_reference).name, open(input_reference, \"rb\"), mime)\n", - "\n", - " r = requests.post(f\"{API_BASE}/videos\", headers=HEADERS_AUTH, files=files, timeout=300)\n", - " if r.status_code >= 400:\n", - " raise RuntimeError(\"Create video failed:\\n\" + _dump_error(r))\n", - " return r.json()\n", - "\n", - "def retrieve_video(video_id: str):\n", - " r = requests.get(f\"{API_BASE}/videos/{video_id}\", headers=HEADERS_AUTH, timeout=60)\n", - " if r.status_code >= 400:\n", - " raise RuntimeError(\"Retrieve video failed:\\n\" + _dump_error(r))\n", - " return r.json()\n", - "\n", - "def download_video_content(video_id: str, out_path: Path, variant: str = \"video\"):\n", - " with requests.get(\n", - " f\"{API_BASE}/videos/{video_id}/content\",\n", - " headers=HEADERS_AUTH,\n", - " params={\"variant\": variant},\n", - " stream=True,\n", - " timeout=600,\n", - " ) as r:\n", - " if r.status_code >= 400:\n", - " raise RuntimeError(\"Download failed:\\n\" + _dump_error(r))\n", - " with open(out_path, \"wb\") as f:\n", - " for chunk in r.iter_content(chunk_size=8192):\n", - " if chunk: f.write(chunk)\n", - " return out_path\n", - "\n", - "\n", - "\n", - "def poll_until_complete(job: dict, poll_interval=POLL_INTERVAL_SEC):\n", - " video = job\n", - " vid = video[\"id\"]\n", - "\n", - " def bar(pct: float, width=30):\n", - " filled = int(max(0.0, min(100.0, pct)) / 100 * width)\n", - " return \"=\" * filled + \"-\" * (width - filled)\n", - "\n", - " while video.get(\"status\") in (\"queued\", \"in_progress\"):\n", - " if PRINT_PROGRESS_BAR:\n", - " pct = float(video.get(\"progress\", 0) or 0)\n", - " status_text = \"Queued\" if video[\"status\"] == \"queued\" else \"Processing\"\n", - " print(f\"\\r{status_text}: [{bar(pct)}] {pct:5.1f}%\", end=\"\")\n", - " time.sleep(poll_interval)\n", - " video = retrieve_video(vid)\n", - "\n", - " if PRINT_PROGRESS_BAR:\n", - " print()\n", - "\n", - " if video.get(\"status\") != \"completed\":\n", - " msg = (video.get(\"error\") or {}).get(\"message\", f\"Job {vid} failed\")\n", - " raise RuntimeError(msg)\n", - " return video\n", - "\n", - "\n", - "def extract_last_frame(video_path: Path, out_image_path: Path) -> Path:\n", - " cap = cv2.VideoCapture(str(video_path))\n", - " if not cap.isOpened():\n", - " raise RuntimeError(f\"Failed to open {video_path}\")\n", - "\n", - " total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) or 0\n", - " success, frame = False, None\n", - "\n", - " if total > 0:\n", - " cap.set(cv2.CAP_PROP_POS_FRAMES, total - 1)\n", - " success, frame = cap.read()\n", - " if not success or frame is None:\n", - " cap.release()\n", - " cap = cv2.VideoCapture(str(video_path))\n", - " while True:\n", - " ret, f = cap.read()\n", - " if not ret: break\n", - " frame = f\n", - " success = True\n", - " cap.release()\n", - "\n", - " if not success or frame is None:\n", - " raise RuntimeError(f\"Could not read last frame from {video_path}\")\n", - "\n", - " out_image_path.parent.mkdir(parents=True, exist_ok=True)\n", - " ok = cv2.imwrite(str(out_image_path), frame)\n", - " if not ok:\n", - " raise RuntimeError(f\"Failed 
to write {out_image_path}\")\n", - " return out_image_path\n" - ], - "metadata": { - "id": "jVZzK-vgSOMb" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "# 6) Chain generator (use planner output; continuity via final frame)" - ], - "metadata": { - "id": "K_DfcHGPTmxJ" - } - }, - { - "cell_type": "code", - "source": [ - "def chain_generate_sora(segments, size: str, model: str):\n", - " \"\"\"\n", - " segments: list of {\"title\": str, \"seconds\": int, \"prompt\": str}\n", - " Returns list of video segment Paths.\n", - " \"\"\"\n", - " input_ref = None\n", - " segment_paths = []\n", - "\n", - " for i, seg in enumerate(segments, start=1):\n", - " secs = int(seg[\"seconds\"])\n", - " prompt = seg[\"prompt\"]\n", - "\n", - " print(f\"\\n=== Generating Segment {i}/{len(segments)} — {secs}s ===\")\n", - " job = create_video(prompt=prompt, size=size, seconds=secs, model=model, input_reference=input_ref)\n", - " print(\"Started job:\", job[\"id\"], \"| status:\", job[\"status\"])\n", - "\n", - " completed = poll_until_complete(job)\n", - "\n", - " seg_path = OUT_DIR / f\"segment_{i:02d}.mp4\"\n", - " download_video_content(completed[\"id\"], seg_path, variant=\"video\")\n", - " print(\"Saved\", seg_path)\n", - " segment_paths.append(seg_path)\n", - "\n", - " # Prepare input reference (final frame) for the next segment\n", - " frame_path = OUT_DIR / f\"segment_{i:02d}_last.jpg\"\n", - " extract_last_frame(seg_path, frame_path)\n", - " print(\"Extracted last frame ->\", frame_path)\n", - " input_ref = frame_path\n", - "\n", - " return segment_paths\n", - "\n", - "\n", - "def concatenate_segments(segment_paths, out_path: Path) -> Path:\n", - " clips = [VideoFileClip(str(p)) for p in segment_paths]\n", - " target_fps = clips[0].fps or 24\n", - " result = concatenate_videoclips(clips, method=\"compose\")\n", - " result.write_videofile(\n", - " str(out_path),\n", - " codec=\"libx264\",\n", - " audio_codec=\"aac\",\n", - " fps=target_fps,\n", - " preset=\"medium\",\n", - " threads=0\n", - " )\n", - " for c in clips:\n", - " c.close()\n", - " return out_path\n" - ], - "metadata": { - "id": "VzAQxmwwTPhS" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "# 7) Run the whole pipeline" - ], - "metadata": { - "id": "G16vwi3MTj5e" - } - }, - { - "cell_type": "code", - "source": [ - "# 1) (Already ran) Plan prompts with AI -> segments_plan\n", - "# 2) Generate with Sora 2 in a chain\n", - "segment_paths = chain_generate_sora(segments_plan, size=SIZE, model=SORA_MODEL)\n", - "\n", - "# 3) Concatenate\n", - "final_path = OUT_DIR / \"combined.mp4\"\n", - "concatenate_segments(segment_paths, final_path)\n", - "print(\"\\nWrote combined video:\", final_path)\n", - "\n", - "# 4) Inline preview\n", - "display(IPyVideo(str(final_path), embed=True, width=768))\n" - ], - "metadata": { - "id": "VUcGUc_ITSA3" - }, - "execution_count": null, - "outputs": [] - } - ] + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Sora 2 — AI‑Planned, Scene‑Exact Prompts with Continuity (Chained >12s)\n", + "\n", + "Built by [Matt Shumer](https://x.com/mattshumer_).\n", + "\n", + "Pipeline:\n", + "1) Use an LLM (“GPT‑5 Thinking”) to plan N scene prompts from a base idea. 
The LLM is prompted to do this intelligently to enable continuity.\n", + "2) Render each segment with Sora 2; for continuity, pass **reference frames** extracted from all previous segments (an animated WebP/APNG) as `input_reference`.\n", + "3) Concatenate segments into a single MP4." + ], + "metadata": { + "id": "R_OryDkwDDu9" + } + }, + { + "cell_type": "code", + "source": "# @title 1) Install dependencies\n\nimport sys, subprocess, importlib, site\n\n# Install everything upfront\nprint(\"Installing dependencies...\")\nsubprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"-q\", \"-U\", \"moviepy==1.0.3\", \"imageio\", \"imageio-ffmpeg\", \"openai\", \"requests\", \"opencv-python-headless\"])\n\n# Reload site packages so newly installed packages are found\nimportlib.reload(site)\nimport site\nsite.main()\n\nprint(\"Dependencies installed successfully\")", + "metadata": { + "id": "EiRiFUsnR3A-", + "cellView": "form" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# 2) Config\n", + "\n", + "Fill in:\n", + "- OPENAI_API_KEY\n", + "- SECONDS_PER_SEGMENT (options: 4, 8, 12)\n", + "- NUM_GENERATIONS (the total number of segments to generate and concatenate; the total length is `SECONDS_PER_SEGMENT * NUM_GENERATIONS`)" + ], + "metadata": { + "id": "Hpt3gFMtDMzO" + } + }, + { + "cell_type": "code", + "source": "# @title 2) Imports and logging setup\n\nimport os, re, io, json, time, math, mimetypes, shutil, textwrap, tempfile\nfrom pathlib import Path\nimport requests\nimport cv2\nfrom moviepy.editor import VideoFileClip, concatenate_videoclips\nfrom IPython.display import Video as IPyVideo, display\nfrom openai import OpenAI\n\nimport logging\nfrom datetime import datetime\n\nprint(\"All imports configured successfully\")", + "metadata": {}, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": "os.environ[\"OPENAI_API_KEY\"] = \"Your API Key\"\n\nclient = OpenAI()\n\n# ---------- Planner (text model) ----------\nPLANNER_MODEL = os.environ.get(\"PLANNER_MODEL\", \"gpt-5\")\n\n# ---------- Sora (video model) ----------\nSORA_MODEL = \"sora-2\"\nSIZE = \"1280x720\"\n\n# ---------- Your project inputs ----------\nBASE_PROMPT = \"Gameplay footage of a game releasing in 2027, a car driving through a futuristic city\"\nSECONDS_PER_SEGMENT = 8\nNUM_GENERATIONS = 2\n\n# ---------- Multi-frame continuity settings ----------\nREFERENCE_FRAMES_COUNT = 8 # Number of frames to extract (1, 8, 32, or any number)\nREFERENCE_FRAMES_STRATEGY = "evenly_spaced" # \"evenly_spaced\" or \"end_of_file\"\nREFERENCE_FRAMES_FORMAT = \"webp\" # \"webp\" or \"png\" for animated output\n\n# ---------- Retry settings ----------\nINITIAL_RETRY_DELAY = 2 # Initial delay in seconds\nMAX_RETRY_DELAY = 60 # Maximum delay between retries in seconds\nMAX_TOTAL_RETRY_TIME = 7200 # Maximum total time to retry (2 hours)\n\n# Polling cadence\nPOLL_INTERVAL_SEC = 2\nPRINT_PROGRESS_BAR = True\n\n# API setup\nAPI_BASE = \"https://api.openai.com/v1\"\nHEADERS_JSON = {\"Authorization\": f\"Bearer {os.environ['OPENAI_API_KEY']}\", \"Content-Type\": \"application/json\"}\nHEADERS_AUTH = {\"Authorization\": f\"Bearer {os.environ['OPENAI_API_KEY']}\"}\n\n# Create unique output directory\nnow = datetime.now()\nhuman_timestamp = now.strftime(\"%Y-%m-%d %I-%M %p\")\nprompt_words = \" \".join(re.sub(r'[^a-z0-9\\s]', '', BASE_PROMPT.lower()).split()[:5]).title()\nfolder_name = f\"Sora Extend {human_timestamp} {prompt_words}\"\nOUT_DIR = 
Path(folder_name)\nOUT_DIR.mkdir(parents=True, exist_ok=True)\n\n# Setup logging\nlog_timestamp = now.strftime(\"%Y%m%d_%H%M%S\")\nlog_prompt_words = \"_\".join(re.sub(r'[^a-z0-9\\s]', '', BASE_PROMPT.lower()).split()[:5])\nlog_filename = f\"sora_extend_{log_timestamp}_{log_prompt_words}.md\"\nlog_dir = OUT_DIR / \"logs\"\nlog_dir.mkdir(parents=True, exist_ok=True)\nlog_path = log_dir / log_filename\n\n# Initialize markdown log\nwith open(log_path, \"w\") as f:\n f.write(f\"\"\"# Sora Extend Execution Log (Multi-Frame)\n**Started:** {now.strftime(\"%Y-%m-%d %H:%M:%S\")} \n**Prompt:** {BASE_PROMPT[:100]}{'...' if len(BASE_PROMPT) > 100 else ''}\n\n## Configuration\n- **Model:** {SORA_MODEL}\n- **Planner Model:** {PLANNER_MODEL}\n- **Resolution:** {SIZE}\n- **Segment Length:** {SECONDS_PER_SEGMENT} seconds\n- **Number of Segments:** {NUM_GENERATIONS}\n- **Total Duration:** {SECONDS_PER_SEGMENT * NUM_GENERATIONS} seconds\n- **Reference Frames:** {REFERENCE_FRAMES_COUNT} frames\n- **Frame Strategy:** {REFERENCE_FRAMES_STRATEGY}\n- **Frame Format:** {REFERENCE_FRAMES_FORMAT}\n- **Max Total Retry Time:** {MAX_TOTAL_RETRY_TIME}s ({MAX_TOTAL_RETRY_TIME/3600:.1f} hours)\n- **Output Directory:** `{OUT_DIR}`\n\n---\n\n\"\"\")\n\ndef log_md(content):\n \"\"\"Append content to markdown log file\"\"\"\n with open(log_path, \"a\") as f:\n f.write(content + \"\\n\")\n\nlog_md(f\"## Phase 1: Dependencies Installation\")\nlog_md(f\"✓ All dependencies installed successfully\\n\")\nlog_md(f\"---\\n\")\n\nprint(f\"Output folder: {OUT_DIR}\")\nprint(f\"Multi-frame mode: {REFERENCE_FRAMES_COUNT} frames ({REFERENCE_FRAMES_STRATEGY})\")\nprint(f\"Logging to: {log_path}\")", + "metadata": { + "id": "NO-yBwL-DLjY" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# 3) The planner system prompt\n", + "\n", + "We’ll ask the planner model to output a clean JSON object with one prompt per generation.\n", + "The prompts contain context and the actual shot details, maximizing continuity.\n", + "\n", + "This isn't super optimized and was a first pass done by GPT. If people like this notebook, let me know on X, and I'll improve this!" + ], + "metadata": { + "id": "qCtddkS2TwG2" + } + }, + { + "cell_type": "code", + "source": "PLANNER_SYSTEM_INSTRUCTIONS = r\"\"\"\nYou are a senior prompt director for Sora 2. Your job is to transform:\n- a Base prompt (broad idea),\n- a fixed generation length per segment (seconds),\n- and a total number of generations (N),\n\ninto **N crystal-clear shot prompts** with **maximum continuity** across segments.\n\nIMPORTANT CONTEXT ABOUT MULTI-FRAME CONTINUITY:\n- We are using {REFERENCE_FRAMES_COUNT} reference frames per segment\n- Frame extraction strategy: {REFERENCE_FRAMES_STRATEGY}\n- If \"evenly_spaced\": Frames show the progression of ALL previous video content from start to end\n- If \"end_of_file\": Frames show the final moments/motion at the end of the previous segment\n\nYour prompts must instruct Sora to USE these reference frames:\n- For \"evenly_spaced\" mode: Tell Sora to \"use the provided reference frames to understand the motion and progression throughout the entire scene so far\"\n- For \"end_of_file\" mode: Tell Sora to \"use the provided reference frames to understand the motion at the end of the previous clip\"\n\nRules:\n1) Return **valid JSON** only. 
Structure:\n {{\n \"segments\": [\n {{\n \"title\": \"Generation 1\",\n \"seconds\": 6,\n \"prompt\": \"\"\n }},\n ...\n ]\n }}\n - `seconds` MUST equal the given generation length for ALL segments.\n - `prompt` should include a **Context** section for model guidance AND a **Prompt** line for the shot itself,\n exactly like in the example below.\n2) Continuity:\n - Segment 1 starts fresh from the BASE PROMPT.\n - Segment k (k>1) must **begin exactly at the final frame** of segment k-1.\n - Maintain consistent visual style, tone, lighting, and subject identity unless explicitly told to change.\n - EXPLICITLY instruct Sora to use the reference frames for continuity\n3) Safety & platform constraints:\n - Do not depict real people (including public figures) or copyrighted characters.\n - Avoid copyrighted music and avoid exact trademark/logos if policy disallows them; use brand-safe wording.\n - Keep content suitable for general audiences.\n4) Output only JSON (no Markdown, no backticks).\n5) Keep the **Context** lines inside the prompt text (they're for the AI, not visible).\n6) Make the writing specific and cinematic; describe camera, lighting, motion, and subject focus succinctly.\n\nBelow is an **EXAMPLE (verbatim)** of exactly how to structure prompts with context and continuity:\n\nExample:\nBase prompt: \"Intro video for the iPhone 19\"\nGeneration length: 6 seconds each\nTotal generations: 3\n\nClearly defined prompts with maximum continuity and context:\n\n### Generation 1:\n\n\nFirst shot introducing the new iPhone 19. Initially, the screen is completely dark. The phone, positioned vertically and facing directly forward, emerges slowly and dramatically out of darkness, gradually illuminated from the center of the screen outward, showcasing a vibrant, colorful, dynamic wallpaper on its edge-to-edge glass display. The style is futuristic, sleek, and premium, appropriate for an official Apple product reveal.\n\n\n---\n\n### Generation 2:\n\n\nContext (not visible in video, only for AI guidance):\n\n* You are creating the second part of an official intro video for Apple's new iPhone 19.\n* The previous 6-second scene ended with the phone facing directly forward, clearly displaying its vibrant front screen and colorful wallpaper.\n* Use the provided reference frames to understand the motion and progression throughout the entire scene so far.\n\nPrompt: Second shot begins exactly from the final frame of the previous scene, showing the front of the iPhone 19 with its vibrant, colorful display clearly visible. Now, smoothly rotate the phone horizontally, turning it from the front to reveal the back side. Focus specifically on the advanced triple-lens camera module, clearly highlighting its premium materials, reflective metallic surfaces, and detailed lenses. 
Maintain consistent dramatic lighting, sleek visual style, and luxurious feel matching the official Apple product introduction theme.\n\n\n---\n\n### Generation 3:\n\n\nContext (not visible in video, only for AI guidance):\n\n* You are creating the third and final part of an official intro video for Apple's new iPhone 19.\n* The previous 6-second scene ended clearly showing the back of the iPhone 19, focusing specifically on its advanced triple-lens camera module.\n* Use the provided reference frames to understand the motion and progression throughout the entire scene so far.\n\nPrompt: Final shot begins exactly from the final frame of the previous scene, clearly displaying the back side of the iPhone 19, with special emphasis on the triple-lens camera module. Now, have a user's hand gently pick up the phone, naturally rotating it from the back to the front and bringing it upward toward their face. Clearly show the phone smoothly and quickly unlocking via Face ID recognition, transitioning immediately to a vibrant home screen filled with updated app icons. Finish the scene by subtly fading the home screen into the iconic Apple logo. Keep the visual style consistent, premium, and elegant, suitable for an official Apple product launch.\n\n\n--\n\nNotice how we broke up the initial prompt into multiple prompts that provide context and continuity so this all works seamlessly.\n\"\"\".strip()\n\n# Format the instructions with actual config values\nPLANNER_SYSTEM_INSTRUCTIONS = PLANNER_SYSTEM_INSTRUCTIONS.format(\n REFERENCE_FRAMES_COUNT=REFERENCE_FRAMES_COUNT,\n REFERENCE_FRAMES_STRATEGY=REFERENCE_FRAMES_STRATEGY\n)", + "metadata": { + "id": "2Q4VI67aDaJH" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# 4) Planner: ask the LLM to generate prompts (JSON)" + ], + "metadata": { + "id": "c8KOZMx1Ts6d" + } + }, + { + "cell_type": "code", + "source": "def plan_prompts_with_ai(base_prompt: str, seconds_per_segment: int, num_generations: int):\n \"\"\"\n Calls the Responses API to produce a JSON object:\n {\n \"segments\": [\n {\"title\": \"...\", \"seconds\": , \"prompt\": \"\"},\n ...\n ]\n }\n \"\"\"\n log_md(f\"## Phase 2: Prompt Planning ({PLANNER_MODEL})\")\n start_time = time.time()\n log_md(f\"**Started:** {datetime.now().strftime('%H:%M:%S')}\")\n log_md(f\"\\n**Base Prompt:**\")\n log_md(f\"```\\n{base_prompt}\\n```\\n\")\n \n # Compose a single plain-text input with the variables:\n user_input = f\"\"\"\nBASE PROMPT: {base_prompt}\n\nGENERATION LENGTH (seconds): {seconds_per_segment}\nTOTAL GENERATIONS: {num_generations}\n\nReturn exactly {num_generations} segments.\n\"\"\".strip()\n\n # Minimal Responses API call; see docs & library readme for details.\n # (If your account lacks the requested model, change PLANNER_MODEL accordingly.)\n resp = client.responses.create(\n model=PLANNER_MODEL,\n instructions=PLANNER_SYSTEM_INSTRUCTIONS,\n input=user_input,\n )\n \n elapsed = time.time() - start_time\n log_md(f\"**Response received:** {datetime.now().strftime('%H:%M:%S')} ({elapsed:.1f}s)\")\n\n text = getattr(resp, \"output_text\", None) or \"\"\n if not text:\n # Fallback: collect from structured blocks if needed\n # (Different SDK versions may put text in resp.output or in content items.)\n try:\n # Attempt to reconstruct from generic fields\n text = json.dumps(resp.to_dict())\n except Exception:\n raise RuntimeError(\"Planner returned no text; try changing PLANNER_MODEL.\")\n\n # Extract the first JSON object found in the response 
text.\n m = re.search(r'\\{[\\s\\S]*\\}', text)\n if not m:\n raise ValueError(\"Planner did not return JSON. Inspect response and adjust instructions.\")\n data = json.loads(m.group(0))\n\n # Basic validation and enforcement\n segments = data.get(\"segments\", [])\n if len(segments) != num_generations:\n segments = segments[:num_generations]\n # or pad/adjust; here we simply clamp.\n\n # Force durations to the requested number (some models might deviate)\n for seg in segments:\n seg[\"seconds\"] = int(seconds_per_segment)\n\n # Log generated segments\n log_md(f\"\\n### Generated Segments:\\n\")\n for i, seg in enumerate(segments, start=1):\n log_md(f\"#### Segment {i}: \\\"{seg.get('title', 'Untitled')}\\\" ({seg['seconds']}s)\")\n log_md(f\"```\\n{seg['prompt']}\\n```\\n\")\n \n log_md(f\"---\\n\")\n \n return segments\n\nsegments_plan = plan_prompts_with_ai(BASE_PROMPT, SECONDS_PER_SEGMENT, NUM_GENERATIONS)\n\nprint(\"AI‑planned segments:\\n\")\nfor i, seg in enumerate(segments_plan, start=1):\n print(f\"[{i:02d}] {seg['seconds']}s — {seg.get('title','(untitled)')}\")\n print(seg[\"prompt\"])\n print(\"-\" * 80)", + "metadata": { + "id": "9vtc8oHzSJiG" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# 5) Sora helpers (create → poll → download → extract final frame)" + ], + "metadata": { + "id": "HWhkKTYDTp0c" + } + }, + { + "cell_type": "code", + "source": "import json, mimetypes\nfrom pathlib import Path\nfrom typing import Optional, List\nimport requests\nfrom PIL import Image\n\nAPI_BASE = \"https://api.openai.com/v1\"\nHEADERS_AUTH = {\"Authorization\": f\"Bearer {os.environ['OPENAI_API_KEY']}\"}\n\ndef guess_mime(path: Path) -> str:\n t = mimetypes.guess_type(str(path))[0]\n return t or \"application/octet-stream\"\n\ndef _dump_error(resp: requests.Response):\n rid = resp.headers.get(\"x-request-id\", \"\")\n try:\n body = resp.json()\n except Exception:\n body = resp.text\n return f\"HTTP {resp.status_code} (request-id: {rid})\\n{body}\"\n\ndef extract_reference_frames(video_paths: List[Path], frame_count: int, strategy: str, out_path: Path) -> Path:\n \"\"\"\n Extract reference frames from ALL videos combined and create animated WebP/PNG.\n \n Args:\n video_paths: List of video file paths to extract from (in order)\n frame_count: Number of frames to extract\n strategy: \"evenly_spaced\" or \"end_of_file\"\n out_path: Output path for animated file\n \n Returns:\n Path to created animated file\n \"\"\"\n log_md(f\" - Extracting {frame_count} frames ({strategy}) from {len(video_paths)} video(s)\")\n \n all_frames = []\n total_frame_count = 0\n \n # Load all videos and count total frames\n caps = [cv2.VideoCapture(str(p)) for p in video_paths]\n frame_counts = [int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) for cap in caps]\n total_frame_count = sum(frame_counts)\n \n log_md(f\" - Total frames available: {total_frame_count}\")\n \n # Determine which frames to extract\n if frame_count >= total_frame_count:\n # Extract ALL frames\n frame_indices = list(range(total_frame_count))\n log_md(f\" - Extracting ALL {total_frame_count} frames\")\n elif strategy == \"evenly_spaced\":\n # Always include first and last, distribute rest evenly\n if frame_count == 1:\n frame_indices = [total_frame_count - 1]\n elif frame_count == 2:\n frame_indices = [0, total_frame_count - 1]\n else:\n step = (total_frame_count - 1) / (frame_count - 1)\n frame_indices = [int(i * step) for i in range(frame_count)]\n else: # end_of_file\n # Extract last N frames\n 
start_frame = max(0, total_frame_count - frame_count)\n frame_indices = list(range(start_frame, total_frame_count))\n \n log_md(f\" - Frame indices: {frame_indices[:5]}{'...' if len(frame_indices) > 5 else ''}\")\n \n # Extract frames\n global_frame_idx = 0\n for video_idx, (cap, frame_count_in_video) in enumerate(zip(caps, frame_counts)):\n for local_frame_idx in range(frame_count_in_video):\n ret, frame = cap.read()\n if not ret:\n break\n \n if global_frame_idx in frame_indices:\n # Convert BGR to RGB for PIL\n frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)\n pil_image = Image.fromarray(frame_rgb)\n all_frames.append(pil_image)\n \n global_frame_idx += 1\n \n # Close all captures\n for cap in caps:\n cap.release()\n \n if not all_frames:\n raise RuntimeError(\"No frames extracted\")\n \n log_md(f\" - Extracted {len(all_frames)} frames successfully\")\n \n # Create animated file\n out_path.parent.mkdir(parents=True, exist_ok=True)\n \n if REFERENCE_FRAMES_FORMAT == \"webp\":\n # Save as animated WebP\n all_frames[0].save(\n str(out_path),\n save_all=True,\n append_images=all_frames[1:],\n duration=100, # 100ms per frame = 10fps\n loop=0\n )\n else: # png (APNG)\n # Save as animated PNG\n all_frames[0].save(\n str(out_path),\n save_all=True,\n append_images=all_frames[1:],\n duration=100,\n loop=0,\n format=\"PNG\"\n )\n \n file_size = out_path.stat().st_size / 1024\n log_md(f\" - Created animated {REFERENCE_FRAMES_FORMAT.upper()}: `{out_path.name}` ({file_size:.1f} KB)\")\n \n return out_path\n\n\ndef create_video(prompt: str, size: str, seconds: int, model: str, input_reference: Optional[Path]):\n \"\"\"\n Create video with exponential backoff retry for all errors.\n Retries until MAX_TOTAL_RETRY_TIME is reached (no attempt limit).\n \"\"\"\n files = {\n \"model\": (None, model),\n \"prompt\": (None, prompt),\n \"seconds\": (None, str(seconds)),\n }\n if size:\n files[\"size\"] = (None, size)\n\n if input_reference is not None:\n mime = guess_mime(input_reference)\n # Read the file into memory so every retry re-sends the full reference (an open handle would be consumed by the first attempt)\n files[\"input_reference\"] = (Path(input_reference).name, input_reference.read_bytes(), mime)\n log_md(f\" - Using input reference: `{input_reference.name}` ({input_reference.stat().st_size / 1024:.1f} KB)\")\n\n retry_start_time = time.time()\n attempt = 0\n \n while True:\n elapsed_total = time.time() - retry_start_time\n if elapsed_total > MAX_TOTAL_RETRY_TIME:\n log_md(f\" - ❌ **TIMEOUT:** Exceeded max retry time ({MAX_TOTAL_RETRY_TIME}s / {MAX_TOTAL_RETRY_TIME/3600:.1f}h)\")\n raise RuntimeError(f\"Failed after {elapsed_total:.1f}s (max: {MAX_TOTAL_RETRY_TIME}s)\")\n \n try:\n r = requests.post(f\"{API_BASE}/videos\", headers=HEADERS_AUTH, files=files, timeout=300)\n if r.status_code >= 400:\n error_msg = _dump_error(r)\n \n # Check if this is a retryable error\n is_rate_limit = r.status_code == 429\n is_server_error = r.status_code >= 500\n is_moderation = \"moderation\" in error_msg.lower()\n \n if is_moderation:\n # Don't retry moderation blocks\n log_md(f\" - ❌ **MODERATION ERROR:** {error_msg}\")\n raise RuntimeError(\"Moderation block (not retrying):\\n\" + error_msg)\n \n if is_rate_limit or is_server_error:\n delay = min(INITIAL_RETRY_DELAY * (2 ** attempt), MAX_RETRY_DELAY)\n remaining_time = MAX_TOTAL_RETRY_TIME - elapsed_total\n log_md(f\" - ⚠️ **Attempt {attempt + 1} failed** (HTTP {r.status_code}): Retrying in {delay}s... ({remaining_time:.0f}s remaining)\")\n print(f\" Rate limit/server error, retrying in {delay}s... 
(attempt {attempt + 1}, {remaining_time:.0f}s remaining)\")\n time.sleep(delay)\n attempt += 1\n continue\n \n # Non-retryable error\n log_md(f\" - ❌ **ERROR:** {error_msg}\")\n raise RuntimeError(\"Create video failed:\\n\" + error_msg)\n \n # Success!\n if attempt > 0:\n log_md(f\" - ✓ **Succeeded** on attempt {attempt + 1} after {elapsed_total:.1f}s\")\n return r.json()\n \n except requests.exceptions.RequestException as e:\n delay = min(INITIAL_RETRY_DELAY * (2 ** attempt), MAX_RETRY_DELAY)\n remaining_time = MAX_TOTAL_RETRY_TIME - elapsed_total\n log_md(f\" - ⚠️ **Network error** (attempt {attempt + 1}): {str(e)} - Retrying in {delay}s... ({remaining_time:.0f}s remaining)\")\n print(f\" Network error, retrying in {delay}s... (attempt {attempt + 1}, {remaining_time:.0f}s remaining)\")\n time.sleep(delay)\n attempt += 1\n continue\n\n\ndef retrieve_video(video_id: str):\n r = requests.get(f\"{API_BASE}/videos/{video_id}\", headers=HEADERS_AUTH, timeout=60)\n if r.status_code >= 400:\n raise RuntimeError(\"Retrieve video failed:\\n\" + _dump_error(r))\n return r.json()\n\ndef download_video_content(video_id: str, out_path: Path, variant: str = \"video\"):\n with requests.get(\n f\"{API_BASE}/videos/{video_id}/content\",\n headers=HEADERS_AUTH,\n params={\"variant\": variant},\n stream=True,\n timeout=600,\n ) as r:\n if r.status_code >= 400:\n raise RuntimeError(\"Download failed:\\n\" + _dump_error(r))\n with open(out_path, \"wb\") as f:\n for chunk in r.iter_content(chunk_size=8192):\n if chunk: f.write(chunk)\n return out_path\n\ndef poll_until_complete(job: dict, poll_interval=POLL_INTERVAL_SEC, segment_num=1):\n video = job\n vid = video[\"id\"]\n start_poll_time = time.time()\n last_progress = -1\n\n def bar(pct: float, width=30):\n filled = int(max(0.0, min(100.0, pct)) / 100 * width)\n return \"=\" * filled + \"-\" * (width - filled)\n\n while video.get(\"status\") in (\"queued\", \"in_progress\"):\n if PRINT_PROGRESS_BAR:\n pct = float(video.get(\"progress\", 0) or 0)\n status_text = \"Queued\" if video[\"status\"] == \"queued\" else \"Processing\"\n print(f\"\\r{status_text}: [{bar(pct)}] {pct:5.1f}%\", end=\"\")\n \n # Log progress at 25% intervals\n if int(pct / 25) > int(last_progress / 25):\n elapsed = time.time() - start_poll_time\n log_md(f\" - Progress: {pct:.1f}% ({status_text}, {elapsed:.1f}s elapsed)\")\n last_progress = pct\n \n time.sleep(poll_interval)\n video = retrieve_video(vid)\n\n if PRINT_PROGRESS_BAR:\n print()\n\n total_time = time.time() - start_poll_time\n \n if video.get(\"status\") != \"completed\":\n msg = (video.get(\"error\") or {}).get(\"message\", f\"Job {vid} failed\")\n log_md(f\" - ❌ **FAILED** after {total_time:.1f}s: {msg}\")\n raise RuntimeError(msg)\n \n log_md(f\" - ✓ **Completed** in {total_time:.1f}s\")\n return video", + "metadata": { + "id": "jVZzK-vgSOMb" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# 6) Chain generator (use planner output; continuity via final frame)" + ], + "metadata": { + "id": "K_DfcHGPTmxJ" + } + }, + { + "cell_type": "code", + "source": "def chain_generate_sora(segments, size: str, model: str):\n \"\"\"\n segments: list of {\"title\": str, \"seconds\": int, \"prompt\": str}\n Returns list of video segment Paths.\n\n Multi-frame mode:\n - Extract frames from ALL segments generated so far (cumulative)\n - Use animated WebP/PNG with multiple reference frames\n \"\"\"\n log_md(f\"## Phase 3: Video Generation\\n\")\n total_start = time.time()\n\n input_ref = 
None\n segment_paths = []\n\n for i, seg in enumerate(segments, start=1):\n secs = int(seg[\"seconds\"])\n prompt = seg[\"prompt\"]\n\n log_md(f\"### Segment {i}/{len(segments)}\")\n log_md(f\"**Title:** {seg.get('title', 'Untitled')}\")\n log_md(f\"**Duration:** {secs} seconds\")\n log_md(f\"**Started:** {datetime.now().strftime('%H:%M:%S')}\")\n log_md(f\"\\n**Prompt:**\")\n log_md(f\"```\\n{prompt}\\n```\\n\")\n\n print(f\"\\n=== Generating Segment {i}/{len(segments)} — {secs}s ===\")\n seg_start = time.time()\n job = create_video(prompt=prompt, size=size, seconds=secs, model=model, input_reference=input_ref)\n log_md(f\" - **Job ID:** `{job['id']}`\")\n log_md(f\" - **Status:** {job['status']}\")\n print(\"Started job:\", job[\"id\"], \"| status:\", job[\"status\"])\n\n completed = poll_until_complete(job, segment_num=i)\n\n seg_path = OUT_DIR / f\"segment_{i:02d}.mp4\"\n download_video_content(completed[\"id\"], seg_path, variant=\"video\")\n file_size_mb = seg_path.stat().st_size / (1024 * 1024)\n log_md(f\" - **Downloaded:** `{seg_path.name}` ({file_size_mb:.2f} MB)\")\n print(\"Saved\", seg_path)\n segment_paths.append(seg_path)\n\n # Extract reference frames from ALL segments so far for the next segment\n if i < len(segments): # Don't extract for last segment\n ref_path = OUT_DIR / f\"reference_frames_{i:02d}_to_{i+1:02d}.{REFERENCE_FRAMES_FORMAT}\"\n log_md(f\"\\n**Extracting reference frames for next segment:**\")\n input_ref = extract_reference_frames(\n video_paths=segment_paths, # All segments accumulated so far\n frame_count=REFERENCE_FRAMES_COUNT,\n strategy=REFERENCE_FRAMES_STRATEGY,\n out_path=ref_path\n )\n print(f\"Extracted {REFERENCE_FRAMES_COUNT} reference frames ->\", ref_path)\n\n seg_duration = time.time() - seg_start\n log_md(f\" - **Total time for segment:** {seg_duration:.1f}s\\n\")\n\n total_duration = time.time() - total_start\n log_md(f\"**Total video generation time:** {total_duration:.1f}s\\n\")\n log_md(f\"---\\n\")\n\n return segment_paths\n\n\ndef concatenate_segments(segment_paths, out_path: Path) -> Path:\n log_md(f\"## Phase 4: Video Concatenation\\n\")\n log_md(f\"**Started:** {datetime.now().strftime('%H:%M:%S')}\")\n log_md(f\"\\n**Input files:**\")\n\n concat_start = time.time()\n clips = [VideoFileClip(str(p)) for p in segment_paths]\n\n for i, (path, clip) in enumerate(zip(segment_paths, clips), 1):\n size_mb = path.stat().st_size / (1024 * 1024)\n log_md(f\" {i}. 
`{path.name}` ({size_mb:.2f} MB, {clip.duration:.2f}s @ {clip.fps:.1f} fps)\")\n\n target_fps = clips[0].fps or 24\n log_md(f\"\\n**Target FPS:** {target_fps}\")\n log_md(f\"**Output file:** `{out_path.name}`\\n\")\n\n result = concatenate_videoclips(clips, method=\"compose\")\n result.write_videofile(\n str(out_path),\n codec=\"libx264\",\n audio_codec=\"aac\",\n fps=target_fps,\n preset=\"medium\",\n threads=0\n )\n for c in clips:\n c.close()\n\n concat_duration = time.time() - concat_start\n final_size_mb = out_path.stat().st_size / (1024 * 1024)\n log_md(f\"✓ **Concatenation complete**\")\n log_md(f\" - **Final file:** `{out_path.name}` ({final_size_mb:.2f} MB)\")\n log_md(f\" - **Duration:** {result.duration:.2f} seconds\")\n log_md(f\" - **Time taken:** {concat_duration:.1f}s\\n\")\n log_md(f\"---\\n\")\n\n return out_path", + "metadata": { + "id": "VzAQxmwwTPhS" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# 7) Run the whole pipeline" + ], + "metadata": { + "id": "G16vwi3MTj5e" + } + }, + { + "cell_type": "code", + "source": "# 1) (Already ran) Plan prompts with AI -> segments_plan\n# 2) Generate with Sora 2 in a chain\nexecution_start = time.time()\nsegment_paths = chain_generate_sora(segments_plan, size=SIZE, model=SORA_MODEL)\n\n# 3) Concatenate\nfinal_path = OUT_DIR / \"combined.mp4\"\nconcatenate_segments(segment_paths, final_path)\nprint(\"\\nWrote combined video:\", final_path)\n\n# 4) Write summary to log\ntotal_execution = time.time() - execution_start\nlog_md(f\"## Summary\\n\")\nlog_md(f\"✓ **Execution completed successfully**\")\nlog_md(f\"**Finished:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\")\nlog_md(f\"**Total execution time:** {total_execution:.1f}s ({total_execution/60:.1f} minutes)\")\nlog_md(f\"**Final output:** `{final_path}`\")\nlog_md(f\"**Total segments generated:** {len(segment_paths)}\")\nlog_md(f\"**Total video duration:** {SECONDS_PER_SEGMENT * NUM_GENERATIONS} seconds\\n\")\n\nprint(f\"\\n{'='*60}\")\nprint(f\"✓ EXECUTION COMPLETE\")\nprint(f\"{'='*60}\")\nprint(f\"Log file: {log_path}\")\nprint(f\"Video output: {final_path}\")\nprint(f\"Total time: {total_execution:.1f}s\")\n\n# 5) Inline preview\ndisplay(IPyVideo(str(final_path), embed=True, width=768))\n", + "metadata": { + "id": "VUcGUc_ITSA3" + }, + "execution_count": null, + "outputs": [] + } + ] } \ No newline at end of file