diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md
new file mode 100644
index 0000000..d9fd84c
--- /dev/null
+++ b/.claude/CLAUDE.md
@@ -0,0 +1,25 @@
+# Voicebox Project Notes
+
+## CLI — voicebox-cli vs cli.py
+
+**`voicebox/voicebox-cli`** is the real CLI. It is stdlib-only (no pip deps), self-contained, and is what users actually run. It has all commands: `server`, `voices`, `import`, `generate`/`say`, `health`, `config`, `transcribe`, `create-voice`. Config persists to `~/.config/voicebox/config.json`.
+
+**`voicebox/backend/cli.py`** is dead code. It predates `voicebox-cli` and was superseded. Its only live reference is the launcher line in `setup-linux.sh`, which is intentionally left as-is. **Do not modify cli.py.**
+
+When the user asks for CLI changes, always work on `voicebox-cli`.
+
+## Key Architecture
+
+- **Backend**: FastAPI (`backend/main.py`) served by uvicorn on port 17493
+- **Entry points**: `server.py` (PyInstaller binary), `backend/main.py __main__` (dev)
+- **Dev script**: `scripts/dev-backend-watch.sh` — loads `.env` from `voicebox/` and `../` then runs uvicorn with `--reload`
+- **MLX backend**: `backend/backends/mlx_backend.py` — Apple Silicon only, uses mlx-audio. Models: `mlx-community/Qwen3-TTS-12Hz-{1.7B,0.6B}-Base-4bit`. Uses `Base` variants (not `CustomVoice` — those require a named speaker, not ref_audio).
+- **PyTorch backend**: `backend/backends/pytorch_backend.py` — CUDA/CPU, uses qwen-tts
+- **Logging**: stdlib `logging`. Set `LOG_LEVEL=DEBUG` env var for verbose output.
+
+## MLX Gotchas
+
+- `transformers` verbosity is suppressed at module-level import in `mlx_backend.py` — do not restore or move this
+- Concurrent MLX loads crash Metal (`commit an already committed command buffer`) — serialized via `_MLX_LOAD_LOCK` threading lock in `load_model_async`
+- `CustomVoice` model variants require a named speaker arg; `Base` variants support arbitrary voice cloning via `ref_audio`/`ref_text`
+- On 16GB unified memory, bf16 models cause swap pressure — use 4-bit quantized variants
diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..b3042dc
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,27 @@
+data/
+backend/venv/
+node_modules/
+__pycache__/
+*.pyc
+*.egg-info/
+.claude/
+.git/
+.github/
+.vscode/
+*.md
+docs/
+mlx-test/
+scripts/
+tauri/
+web/
+landing/
+.DS_Store
+*.log
+*.cache
+dist/
+build/
+.env
+.env.*
+*.swp
+*.swo
+*~
diff --git a/.githooks/pre-commit b/.githooks/pre-commit
new file mode 100755
index 0000000..5dca125
--- /dev/null
+++ b/.githooks/pre-commit
@@ -0,0 +1,58 @@
+#!/usr/bin/env bash
+# Pre-commit hook: lint staged Python (ruff) and JS/TS/JSON (biome) files.
+# Install: git config core.hooksPath .githooks
+#
+# A linter that is not installed is skipped with a notice rather than blocking the
+# commit. The exit status is non-zero when any linter that did run reported issues.
+set -euo pipefail
+
+REPO_ROOT="$(git rev-parse --show-toplevel)"
+# Staged paths are relative to the repository root, so run the linters from there.
+cd "$REPO_ROOT"
+
+# core.quotePath=false keeps non-ASCII file names verbatim instead of emitting them
+# as quoted octal escapes, which the linters could not resolve to real files.
+STAGED=$(git -c core.quotePath=false diff --cached --name-only --diff-filter=ACMR)
+
+if [[ -z "$STAGED" ]]; then
+    exit 0
+fi
+
+FAILED=0
+
+# ── Python (ruff) ─────────────────────────────────────────────────────────────
+PY_FILES=$(echo "$STAGED" | grep '\.py$' || true)
+if [[ -n "$PY_FILES" ]]; then
+    RUFF="$REPO_ROOT/backend/venv/bin/ruff"
+    if [[ -x "$RUFF" ]]; then
+        echo "→ ruff: checking Python files..."
+        # NUL-delimit the list so paths containing spaces reach ruff as single arguments.
+        if ! printf '%s\n' "$PY_FILES" | tr '\n' '\0' | xargs -0 "$RUFF" check --quiet; then
+            echo "  ruff found issues. Run: backend/venv/bin/ruff check --fix <file>"
+            FAILED=1
+        fi
+    else
+        echo "  (ruff not found in backend/venv — skipping Python lint)"
+    fi
+fi
+
+# ── JS/TS (biome) ─────────────────────────────────────────────────────────────
+JS_FILES=$(echo "$STAGED" | grep -E '\.(js|jsx|ts|tsx|json)$' | grep -v 'node_modules' || true)
+if [[ -n "$JS_FILES" ]]; then
+    # Prefer a biome on PATH, then fall back to the repo's local node_modules installs.
+    BIOME=$(command -v biome 2>/dev/null \
+        || ls "$REPO_ROOT"/node_modules/.bin/biome 2>/dev/null \
+        || ls "$REPO_ROOT"/app/node_modules/.bin/biome 2>/dev/null \
+        || true)
+    if [[ -x "$BIOME" ]]; then
+        echo "→ biome: checking JS/TS files..."
+        if ! printf '%s\n' "$JS_FILES" | tr '\n' '\0' | xargs -0 "$BIOME" check --no-errors-on-unmatched; then
+            echo "  biome found issues. Run: biome check --write <file>"
+            FAILED=1
+        fi
+    else
+        echo "  (biome not found — skipping JS/TS lint)"
+    fi
+fi
+
+exit $FAILED
diff --git a/.gitignore b/.gitignore
index 05f7ef0..4fbee19 100644
--- a/.gitignore
+++ b/.gitignore
@@ -39,6 +39,8 @@ data/profiles/*
 data/generations/*
 data/projects/*
 data/voicebox.db
+data/huggingface
+data/model_prefs.json
 !data/.gitkeep
 
 # Logs
@@ -57,3 +59,7 @@ tauri/src-tauri/binaries/*
 tmp/
 temp/
 *.tmp
+output*.m4a
+package-lock.json
+.claude
+tauri/src-tauri/gen/Assets.car
diff --git a/.python-version b/.python-version
new file mode 100644
index 0000000..763b626
--- /dev/null
+++ b/.python-version
@@ -0,0 +1 @@
+3.12.12
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..f3d7c9a
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,93 @@
+# syntax=docker/dockerfile:1.4
+
+# Voicebox TTS Server
+# CUDA 12.9 + Python 3.12 on Ubuntu 24.04
+#
+# Build:
+#   DOCKER_BUILDKIT=1 docker build -t voicebox .
+#   DOCKER_BUILDKIT=1 docker build --build-arg CUDA=0 -t voicebox-cpu .
+#   DOCKER_BUILDKIT=1 docker build --build-arg SERVERLESS=1 -t voicebox-serverless .
+#
+# Run:
+#   docker compose up -d
+#
+# The syntax directive above must stay on the first line: BuildKit stops looking for
+# parser directives at the first comment, blank line, or instruction.
+
+# Global build args: declared before the first FROM so they can select stages below.
+ARG CUDA=1
+ARG SERVERLESS=0
+
+# --- Base stage ---
+FROM nvidia/cuda:12.9.1-runtime-ubuntu24.04 AS base-cuda
+FROM ubuntu:24.04 AS base-cpu
+
+# --- Pick base based on CUDA arg ---
+FROM base-cuda AS base-1
+FROM base-cpu AS base-0
+FROM base-${CUDA} AS base
+
+ENV DEBIAN_FRONTEND=noninteractive
+ENV PYTHONUNBUFFERED=1
+
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
+    --mount=type=cache,target=/var/lib/apt,sharing=locked \
+    apt-get update && apt-get install -y --no-install-recommends \
+    python3 \
+    python3-venv \
+    python3-dev \
+    python3-pip \
+    libsndfile1 \
+    ffmpeg \
+    curl \
+    sox \
+    && rm -rf /var/lib/apt/lists/*
+
+# --- Dependencies stage (cached layer) ---
+FROM base AS deps
+
+# Re-declared so the global CUDA value is visible to the RUN instruction in this stage.
+ARG CUDA
+WORKDIR /app
+
+# Create virtual environment outside /app to survive volume mount
+RUN python3 -m venv /opt/venv
+ENV PATH="/opt/venv/bin:$PATH"
+
+COPY backend/requirements.txt ./requirements.txt
+
+# Install torch from the matching wheel index first, then the remaining requirements.
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install --upgrade pip && \
+    if [ "$CUDA" = "1" ]; then \
+        pip install torch torchaudio torchvision --index-url https://download.pytorch.org/whl/cu124 && \
+        pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu124; \
+    else \
+        pip install torch torchaudio torchvision --index-url https://download.pytorch.org/whl/cpu && \
+        pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cpu; \
+    fi
+
+# Source is volume-mounted at runtime (local dev) or COPYed below (serverless)
+ENV HF_HOME=/app/data/huggingface
+
+# Copy source into image for non-volume-mount deployments (e.g. RunPod)
+COPY backend/ /app/backend/
+
+# --- Normal mode: FastAPI server on port 17493 ---
+FROM deps AS final-0
+EXPOSE 17493
+HEALTHCHECK --interval=60s --timeout=5s --start-period=30s --retries=3 \
+    CMD curl -f http://localhost:17493/health || exit 1
+ENTRYPOINT ["/opt/venv/bin/python3", "-m", "backend.main"]
+CMD ["--host", "0.0.0.0", "--port", "17493", "--data-dir", "/app/data"]
+
+# --- Serverless mode: RunPod handler ---
+FROM deps AS final-1
+ENV SERVERLESS=1
+HEALTHCHECK NONE
+ENTRYPOINT ["/opt/venv/bin/python3", "-u", "-m", "backend.serverless_handler"]
+CMD []
+
+# --- Pick final stage based on SERVERLESS arg ---
+# The global SERVERLESS arg declared at the top is what this FROM line expands.
+FROM final-${SERVERLESS} AS final
diff --git a/Makefile b/Makefile
index 620f6c8..bd082ad 100644
--- a/Makefile
+++ b/Makefile
@@ -41,19 +41,29 @@ setup: setup-js setup-python ## Full project setup (all dependencies)
 	@echo -e "  Run $(YELLOW)make dev$(NC) to start development servers"
 
 setup-js: ## Install JavaScript dependencies (bun)
+	@command -v bun >/dev/null 2>&1 || { \
+		echo -e "$(YELLOW)bun not found — installing...$(NC)"; \
+		curl -fsSL https://bun.sh/install | bash; \
+	}
 	@echo -e "$(BLUE)Installing JavaScript dependencies...$(NC)"
 	bun install
 
 setup-python: $(VENV)/bin/activate ## Set up Python virtual environment and dependencies
 	@echo -e "$(BLUE)Installing Python dependencies...$(NC)"
 	$(PIP) install --upgrade pip
-	$(PIP) install -r $(BACKEND_DIR)/requirements.txt
 	@if [ "$$(uname -m)" = "arm64" ] && [ "$$(uname)" = "Darwin" ]; then \
-		echo -e "$(BLUE)Detected Apple Silicon - installing MLX dependencies...$(NC)"; \
+		echo -e "$(BLUE)Detected Apple Silicon - using MLX-compatible dependency resolution...$(NC)"; \
 		$(PIP) install -r $(BACKEND_DIR)/requirements-mlx.txt; \
+		grep -v -E "^transformers" $(BACKEND_DIR)/requirements.txt > /tmp/voicebox-requirements-filtered.txt; \
+		$(PIP) install -r /tmp/voicebox-requirements-filtered.txt; \
+		rm /tmp/voicebox-requirements-filtered.txt; \
+		$(PIP) install --no-deps git+https://github.com/QwenLM/Qwen3-TTS.git; \
 		echo -e "$(GREEN)✓ MLX backend enabled (native Metal acceleration)$(NC)"; \
+		echo -e "$(YELLOW)Note: Using transformers 5.0.0rc3 (required by MLX)$(NC)"; \
+	else \
+		$(PIP) install -r $(BACKEND_DIR)/requirements.txt; \
+		$(PIP) install git+https://github.com/QwenLM/Qwen3-TTS.git; \
 	fi
-	$(PIP) install git+https://github.com/QwenLM/Qwen3-TTS.git
 	@echo -e "$(GREEN)✓ Python environment ready$(NC)"
 
 $(VENV)/bin/activate:
@@ -72,7 +82,7 @@ setup-rust: ## Install Rust toolchain (if not present)
 # DEVELOPMENT
 # =============================================================================
 
-.PHONY: dev dev-backend dev-frontend dev-web kill-dev
+.PHONY: dev dev-backend dev-backend-watch dev-frontend dev-web kill-dev
 
 dev: ## Start backend + desktop app (parallel)
 	@echo -e "$(BLUE)Starting development servers...$(NC)"
@@ -82,9 +92,11 @@ dev: ## Start backend + desktop app (parallel)
 		sleep 2 && $(MAKE) dev-frontend & \
 		wait
 
-dev-backend: ## Start FastAPI backend server
+dev-backend: dev-backend-watch ## Start FastAPI backend server (venv-verified, auto-reload)
+
+dev-backend-watch: ## Start backend with venv verification + Python file watching
 	@echo -e "$(BLUE)Starting backend server on http://localhost:17493$(NC)"
-	$(VENV_BIN)/uvicorn backend.main:app --reload --port 17493
+	./scripts/dev-backend-watch.sh
 
 dev-frontend: ## Start Tauri desktop app
 	@echo -e "$(BLUE)Starting Tauri desktop app...$(NC)"
diff --git a/SERVERLESS.md b/SERVERLESS.md
new file mode 100644
index 0000000..cb003c1
--- /dev/null
+++ b/SERVERLESS.md
@@ -0,0 +1,161 @@
+# RunPod Serverless Deployment
+
+Voicebox can run as a [RunPod Serverless](https://docs.runpod.io/serverless/quickstart) worker. Workers spin up on demand, process requests, and shut down automatically — you only pay while they're running.
+
+## How it works
+
+The serverless image starts a RunPod handler (`serverless_handler.py`) which:
+
+1. Launches the existing FastAPI server in a background thread
+2. Waits for `/health` to respond (up to 5 min on cold start for model downloads)
+3. Proxies each RunPod job as an HTTP request to the local server
+4. Returns the response — JSON as-is, audio files as base64
+
+Model idle-unloading is disabled in serverless mode (`SERVERLESS=1`). The model stays loaded for the worker's lifetime. RunPod shuts down the entire worker after the configured idle timeout.
+
+## Build the image
+
+```bash
+# From the voicebox/ directory
+./scripts/serverless-build.sh
+
+# Build + push to Docker Hub
+./scripts/serverless-build.sh --push --tag youruser/voicebox-serverless:latest
+
+# Build + push to GHCR
+./scripts/serverless-build.sh --push --tag ghcr.io/youruser/voicebox-serverless:latest
+```
+
+Or manually:
+
+```bash
+DOCKER_BUILDKIT=1 docker build \
+  --build-arg CUDA=1 \
+  --build-arg SERVERLESS=1 \
+  -t voicebox-serverless \
+  .
+```
+
+## RunPod endpoint settings
+
+When creating your endpoint on [runpod.io](https://runpod.io):
+
+| Setting           | Recommended                      |
+| ----------------- | -------------------------------- |
+| Container image   | your pushed image tag            |
+| GPU               | RTX 4090 or similar (16GB+ VRAM) |
+| Idle timeout      | **60 seconds**                   |
+| Execution timeout | 600 seconds                      |
+| Active workers    | 0 (pure on-demand)               |
+| Max workers       | 1 (increase for production)      |
+| FlashBoot         | enabled                          |
+
+**On idle timeout:** The GPU stays allocated (and billed) for the full idle window regardless of VRAM usage. Keeping the model hot and using a short idle timeout (60s) is more cost-effective than unloading the model and using a long idle timeout.
+
+## Authentication
+
+Add your RunPod API key to the root `.env`:
+
+```
+RUNPOD_API_KEY=your_runpod_api_key_here
+```
+
+All requests to the RunPod endpoint require this key as a bearer token:
+
+```
+Authorization: Bearer $RUNPOD_API_KEY
+```
+
+## Sending requests
+
+RunPod wraps requests in a job envelope. The handler accepts:
+
+| Field     | Type   | Description                            |
+| --------- | ------ | -------------------------------------- |
+| `method`  | string | HTTP method (default: `"POST"`)        |
+| `path`    | string | Required. API path, e.g. `"/generate"` |
+| `body`    | object | JSON body for POST/PUT requests        |
+| `params`  | object | Query string parameters                |
+| `headers` | object | Additional HTTP headers                |
+
+### Health check
+
+```bash
+curl -X POST https://api.runpod.ai/v2/$ENDPOINT_ID/runsync \
+  -H "Authorization: Bearer $RUNPOD_API_KEY" \
+  -H "Content-Type: application/json" \
+  -d '{"input": {"method": "GET", "path": "/health"}}'
+```
+
+### Generate speech
+
+```bash
+curl -X POST https://api.runpod.ai/v2/$ENDPOINT_ID/runsync \
+  -H "Authorization: Bearer $RUNPOD_API_KEY" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "input": {
+      "method": "POST",
+      "path": "/generate",
+      "body": {
+        "profile_id": "your-profile-id",
+        "text": "Hello from RunPod."
+      }
+    }
+  }'
+```
+
+### Download audio
+
+Audio endpoints return base64-encoded content with `"is_binary": true`:
+
+```json
+{
+  "output": {
+    "status_code": 200,
+    "is_binary": true,
+    "body_base64": "UklGRi..."
+  }
+}
+```
+
+Decode it:
+
+```bash
+echo "$BODY_BASE64" | base64 -d > output.wav
+```
+
+### Async jobs (long generations)
+
+For long texts, use `/run` instead of `/runsync` to avoid the 90s sync timeout:
+
+```bash
+# Submit
+JOB=$(curl -s -X POST https://api.runpod.ai/v2/$ENDPOINT_ID/run \
+  -H "Authorization: Bearer $RUNPOD_API_KEY" \
+  -H "Content-Type: application/json" \
+  -d '{"input": {"path": "/generate", "body": {"profile_id": "...", "text": "..."}}}')
+
+JOB_ID=$(echo "$JOB" | jq -r '.id')
+
+# Poll
+curl https://api.runpod.ai/v2/$ENDPOINT_ID/status/$JOB_ID \
+  -H "Authorization: Bearer $RUNPOD_API_KEY"
+```
+
+## Local testing
+
+The RunPod SDK includes a local test server:
+
+```bash
+cd voicebox/
+SERVERLESS=1 python3 -m backend.serverless_handler --rp_serve_api
+```
+
+This starts a local HTTP server that simulates the RunPod job protocol at `http://localhost:8000`.
+
+## Limitations
+
+- **SSE streaming not supported** — RunPod jobs return a single response. Generation still works, just without real-time progress events. Use `/generate` (non-streaming) via the job body.
+- **Ephemeral storage** — The SQLite database (profiles, history) is lost when the worker shuts down. Re-import voice profiles on each cold start, or use a RunPod network volume for persistence.
+- **Cold start time** — First start downloads model weights (~3–5 GB from HuggingFace). Subsequent starts with FlashBoot are much faster.
diff --git a/app/src/App.tsx b/app/src/App.tsx
index fbe2911..f9a1bb6 100644
--- a/app/src/App.tsx
+++ b/app/src/App.tsx
@@ -82,7 +82,6 @@ function App() {
       console.log('Dev mode: Skipping auto-start of server (run it separately)');
       setServerReady(true); // Mark as ready so UI doesn't show loading screen
       // Mark that server was not started by app (so we don't try to stop it on close)
-      // @ts-expect-error - adding property to window
       window.__voiceboxServerStartedByApp = false;
       return;
     }
@@ -93,25 +92,48 @@ function App() {
     }
 
     serverStartingRef.current = true;
-    console.log('Production mode: Starting bundled server...');
-
-    platform.lifecycle
-      .startServer(false)
-      .then((serverUrl) => {
-        console.log('Server is ready at:', serverUrl);
-        // Update the server URL in the store with the dynamically assigned port
-        useServerStore.getState().setServerUrl(serverUrl);
-        setServerReady(true);
-        // Mark that we started the server (so we know to stop it on close)
-        // @ts-expect-error - adding property to window
-        window.__voiceboxServerStartedByApp = true;
-      })
-      .catch((error) => {
-        console.error('Failed to auto-start server:', error);
-        serverStartingRef.current = false;
-        // @ts-expect-error - adding property to window
-        window.__voiceboxServerStartedByApp = false;
-      });
+
+    const SERVER_URL = 'http://127.0.0.1:17493';
+
+    // Reuse a server that is already listening (dev server or previous instance) if /health is OK.
+    const tryExistingServer = async (): Promise<boolean> => {
+      const controller = new AbortController(); // AbortSignal.timeout() is absent in older WebViews
+      const timer = setTimeout(() => controller.abort(), 1500); // 1.5 s probe budget
+      try {
+        const res = await fetch(`${SERVER_URL}/health`, { signal: controller.signal });
+        if (!res.ok) return false;
+        console.log('Production mode: Found server already running, reusing it.');
+        useServerStore.getState().setServerUrl(SERVER_URL);
+        setServerReady(true);
+        window.__voiceboxServerStartedByApp = false;
+        return true;
+      } catch {
+        return false; // Not running (or probe aborted) — caller falls through to sidecar startup
+      } finally {
+        clearTimeout(timer);
+      }
+    };
+
+    tryExistingServer().then((alreadyRunning) => {
+      if (alreadyRunning) return;
+
+      console.log('Production mode: Starting bundled server...');
+      platform.lifecycle
+        .startServer(false)
+        .then((serverUrl) => {
+          console.log('Server is ready at:', serverUrl);
+          // Update the server URL in the store with the dynamically assigned port
+          useServerStore.getState().setServerUrl(serverUrl);
+          setServerReady(true);
+          // Mark that we started the server (so we know to stop it on close)
+          window.__voiceboxServerStartedByApp = true;
+        })
+        .catch((error) => {
+          console.error('Failed to auto-start server:', error);
+          serverStartingRef.current = false;
+          window.__voiceboxServerStartedByApp = false;
+        });
+    });
 
     // Cleanup: stop server on actual unmount (not StrictMode remount)
     // Note: Window close is handled separately in Tauri Rust code
diff --git a/app/src/components/AudioTab/AudioTab.tsx b/app/src/components/AudioTab/AudioTab.tsx
index f76e99d..e55374b 100644
--- a/app/src/components/AudioTab/AudioTab.tsx
+++ b/app/src/components/AudioTab/AudioTab.tsx
@@ -1,6 +1,6 @@
 import { useMutation, useQuery, useQueryClient } from '@tanstack/react-query';
-import { Check, CheckCircle2, Edit, Plus, Speaker, Trash2 } from 'lucide-react';
-import { useState } from 'react';
+import { Check, CheckCircle2, Edit, Mic, Plus, RefreshCw, Speaker, Trash2 } from 'lucide-react';
+import { useEffect, useRef, useState } from 'react';
 import { Badge } from '@/components/ui/badge';
 import { Button } from '@/components/ui/button';
 import {
@@ -25,6 +25,7 @@ import { BOTTOM_SAFE_AREA_PADDING } from '@/lib/constants/ui';
 import { cn } from '@/lib/utils/cn';
 import { usePlayerStore } from '@/stores/playerStore';
 import { usePlatform } from '@/platform/PlatformContext';
+import { useToast } from '@/components/ui/use-toast';
 
 interface AudioDevice {
   id: string;
@@ -37,16 +38,19 @@ export function AudioTab() {
   const [createDialogOpen, setCreateDialogOpen] = useState(false);
   const [editingChannel, setEditingChannel] = useState<string | null>(null);
   const [selectedChannelId, setSelectedChannelId] = useState<string | null>(null);
+  const [isRefreshing, setIsRefreshing] = useState(false);
   const queryClient = useQueryClient();
   const audioUrl = usePlayerStore((state) => state.audioUrl);
   const isPlayerVisible = !!audioUrl;
+  const { toast } = useToast();
+  const prevDefaultInputRef = useRef<string | null>(null);
 
   const { data: channels, isLoading: channelsLoading } = useQuery({
     queryKey: ['channels'],
     queryFn: () => apiClient.listChannels(),
   });
 
-  const { data: devices, isLoading: devicesLoading } = useQuery({
+  const { data: devices, isLoading: devicesLoading, refetch: refetchDevices } = useQuery({
     queryKey: ['audio-devices'],
     queryFn: async () => {
       if (!platform.metadata.isTauri) {
@@ -60,8 +64,44 @@ export function AudioTab() {
       }
     },
     enabled: platform.metadata.isTauri,
+    refetchOnWindowFocus: true,
+    staleTime: 5000,
   });
 
+  const { data: inputDevices } = useQuery({
+    queryKey: ['audio-input-devices'],
+    queryFn: async () => {
+      if (!platform.metadata.isTauri) return [];
+      try {
+        return await platform.audio.listInputDevices();
+      } catch (error) {
+        console.error('Failed to list input devices:', error);
+        return [];
+      }
+    },
+    enabled: platform.metadata.isTauri,
+    refetchOnWindowFocus: true,
+    // Poll every 3 seconds so default input changes are noticed automatically
+    refetchInterval: 3000,
+    staleTime: 2000,
+  });
+
+  const defaultInputDevice = inputDevices?.find((d) => d.is_default);
+
+  // Toast when default input device changes
+  useEffect(() => {
+    if (!defaultInputDevice) return;
+    const prev = prevDefaultInputRef.current;
+    if (prev !== null && prev !== defaultInputDevice.name) {
+      toast({
+        title: 'Default input changed',
+        description: defaultInputDevice.name,
+        duration: 3000,
+      });
+    }
+    prevDefaultInputRef.current = defaultInputDevice.name;
+  }, [defaultInputDevice?.name, toast]);
+
   const { data: profiles } = useQuery({
     queryKey: ['profiles'],
     queryFn: () => apiClient.listProfiles(),
@@ -83,6 +123,27 @@ export function AudioTab() {
       channelId: string;
       data: { name?: string; device_ids?: string[] };
     }) => apiClient.updateChannel(channelId, data),
+    onMutate: async ({ channelId, data }) => {
+      // Optimistically update channels cache so checkmarks appear immediately
+      await queryClient.cancelQueries({ queryKey: ['channels'] });
+      const previous = queryClient.getQueryData(['channels']);
+      if (data.device_ids !== undefined) {
+        queryClient.setQueryData(
+          ['channels'],
+          (old: Array<{ id: string; device_ids: string[]; [key: string]: unknown }> | undefined) =>
+            old?.map((ch) =>
+              ch.id === channelId ? { ...ch, device_ids: data.device_ids! } : ch,
+            ),
+        );
+      }
+      return { previous };
+    },
+    onError: (_err, _vars, context) => {
+      // Roll back on error
+      if (context?.previous) {
+        queryClient.setQueryData(['channels'], context.previous);
+      }
+    },
     onSuccess: () => {
       queryClient.invalidateQueries({ queryKey: ['channels'] });
       queryClient.invalidateQueries({ queryKey: ['profile-channels'] });
@@ -110,8 +171,20 @@ export function AudioTab() {
   const setChannelVoices = useMutation({
     mutationFn: ({ channelId, profileIds }: { channelId: string; profileIds: string[] }) =>
       apiClient.setChannelVoices(channelId, profileIds),
-    onSuccess: () => {
-      queryClient.invalidateQueries({ queryKey: ['channel-voices'] });
+    onMutate: async ({ channelId, profileIds }) => {
+      // Optimistically update channel-voices cache so the list updates immediately
+      await queryClient.cancelQueries({ queryKey: ['channel-voices', channelId] });
+      const previous = queryClient.getQueryData(['channel-voices', channelId]);
+      queryClient.setQueryData(['channel-voices', channelId], { profile_ids: profileIds });
+      return { previous, channelId };
+    },
+    onError: (_err, _vars, context) => {
+      if (context?.previous !== undefined) {
+        queryClient.setQueryData(['channel-voices', context.channelId], context.previous);
+      }
+    },
+    onSuccess: (_data, { channelId }) => {
+      queryClient.invalidateQueries({ queryKey: ['channel-voices', channelId] });
       queryClient.invalidateQueries({ queryKey: ['profile-channels'] });
     },
   });
@@ -132,13 +205,17 @@ export function AudioTab() {
 
   return (
     <div className="h-full flex flex-col">
-      <div className="flex items-center justify-between mb-6 shrink-0">
+      <div className="flex items-center justify-between mb-2 shrink-0">
         <h2 className="text-2xl font-bold">Audio Channels</h2>
         <Button onClick={() => setCreateDialogOpen(true)}>
           <Plus className="h-4 w-4 mr-2" />
           New Channel
         </Button>
       </div>
+      <p className="text-sm text-muted-foreground mb-6 shrink-0">
+        Route different voices to dedicated speakers — ideal for story mode, museum displays, or
+        events where each character plays through a separate device.
+      </p>
 
       <div className="grid grid-cols-1 lg:grid-cols-2 gap-6 h-full min-h-0">
         {/* Left Column - Channels */}
@@ -268,7 +345,23 @@ export function AudioTab() {
           )}
         >
           <div className="shrink-0 mb-4">
-            <h3 className="text-lg font-semibold">Available Devices</h3>
+            <div className="flex items-center justify-between">
+              <h3 className="text-lg font-semibold">Available Devices</h3>
+              <Button
+                variant="ghost"
+                size="icon"
+                className="h-7 w-7"
+                onClick={async () => {
+                  setIsRefreshing(true);
+                  await refetchDevices();
+                  setIsRefreshing(false);
+                  toast({ title: 'Devices refreshed', duration: 2000 });
+                }}
+                title="Refresh output devices"
+              >
+                <RefreshCw className={cn('h-4 w-4', isRefreshing && 'animate-spin')} />
+              </Button>
+            </div>
             <p className="text-sm text-muted-foreground mt-1">
               {selectedChannelId
                 ? selectedChannel?.is_default
@@ -276,6 +369,12 @@ export function AudioTab() {
                   : 'Click devices to add or remove them from the selected channel'
                 : 'Select a channel to assign devices'}
             </p>
+            {defaultInputDevice && (
+              <div className="flex items-center gap-1.5 mt-2 text-xs text-muted-foreground">
+                <Mic className="h-3 w-3 shrink-0" />
+                <span>Default input: <span className="font-medium text-foreground">{defaultInputDevice.name}</span></span>
+              </div>
+            )}
           </div>
           {allDevices.length > 0 ? (
             <div className="space-y-2">
diff --git a/app/src/components/Generation/FloatingGenerateBox.tsx b/app/src/components/Generation/FloatingGenerateBox.tsx
index a8d556a..f61526f 100644
--- a/app/src/components/Generation/FloatingGenerateBox.tsx
+++ b/app/src/components/Generation/FloatingGenerateBox.tsx
@@ -34,7 +34,7 @@ export function FloatingGenerateBox({
   const setSelectedProfileId = useUIStore((state) => state.setSelectedProfileId);
   const { data: selectedProfile } = useProfile(selectedProfileId || '');
   const { data: profiles } = useProfiles();
-  const [isExpanded, setIsExpanded] = useState(false);
+  const [isExpanded, setIsExpanded] = useState(true);
   const [isInstructMode, setIsInstructMode] = useState(false);
   const containerRef = useRef<HTMLDivElement>(null);
   const textareaRef = useRef<HTMLTextAreaElement | null>(null);
@@ -49,9 +49,9 @@ export function FloatingGenerateBox({
   // Calculate if track editor is visible (on stories route with items)
   const hasTrackEditor = isStoriesRoute && currentStory && currentStory.items.length > 0;
 
-  const { form, handleSubmit, isPending } = useGenerationForm({
+  const { form, handleSubmit, isPending, isQueueLimitReached } = useGenerationForm({
     onSuccess: async (generationId) => {
-      setIsExpanded(false);
+      setIsInstructMode(false);
       // If on stories route and a story is selected, add generation to story
       if (isStoriesRoute && selectedStoryId && generationId) {
         try {
@@ -75,36 +75,6 @@ export function FloatingGenerateBox({
     },
   });
 
-  // Click away handler to collapse the box
-  useEffect(() => {
-    function handleClickOutside(event: MouseEvent) {
-      const target = event.target as HTMLElement;
-
-      // Don't collapse if clicking inside the container
-      if (containerRef.current?.contains(target)) {
-        return;
-      }
-
-      // Don't collapse if clicking on a Select dropdown (which renders in a portal)
-      if (
-        target.closest('[role="listbox"]') ||
-        target.closest('[data-radix-popper-content-wrapper]')
-      ) {
-        return;
-      }
-
-      setIsExpanded(false);
-    }
-
-    if (isExpanded) {
-      document.addEventListener('mousedown', handleClickOutside);
-    }
-
-    return () => {
-      document.removeEventListener('mousedown', handleClickOutside);
-    };
-  }, [isExpanded]);
-
   // Set first voice as default if none selected
   useEffect(() => {
     if (!selectedProfileId && profiles && profiles.length > 0) {
@@ -116,10 +86,10 @@ export function FloatingGenerateBox({
   useEffect(() => {
     if (!isExpanded) {
       // Reset textarea height after collapse animation completes
-      const timeoutId = setTimeout(() => {
-        const textarea = textareaRef.current;
-        if (textarea) {
-          textarea.style.height = '32px';
+        const timeoutId = setTimeout(() => {
+          const textarea = textareaRef.current;
+          if (textarea) {
+          textarea.style.height = '37px';
           textarea.style.overflowY = 'hidden';
         }
       }, 200); // Wait for animation to complete
@@ -132,7 +102,7 @@ export function FloatingGenerateBox({
     const adjustHeight = () => {
       textarea.style.height = 'auto';
       const scrollHeight = textarea.scrollHeight;
-      const minHeight = 100; // Expanded minimum
+      const minHeight = 115; // Expanded minimum (+15%)
       const maxHeight = 300; // Max height in pixels
       const targetHeight = Math.max(minHeight, Math.min(scrollHeight, maxHeight));
       textarea.style.height = `${targetHeight}px`;
@@ -207,7 +177,7 @@ export function FloatingGenerateBox({
                         <FormControl>
                           <motion.div
                             animate={{
-                              height: isExpanded ? 'auto' : '32px',
+                              height: isExpanded ? 'auto' : '37px',
                             }}
                             transition={{ duration: 0.15, ease: 'easeOut' }}
                             style={{ overflow: 'hidden' }}
@@ -233,10 +203,10 @@ export function FloatingGenerateBox({
                               }
                               className="resize-none bg-transparent border-none focus-visible:ring-0 focus-visible:ring-offset-0 focus:outline-none focus:ring-0 outline-none ring-0 rounded-2xl text-sm placeholder:text-muted-foreground/60 w-full"
                               style={{
-                                minHeight: isExpanded ? '100px' : '32px',
+                                minHeight: isExpanded ? '115px' : '37px',
                                 maxHeight: '300px',
                               }}
-                              disabled={!selectedProfileId}
+                              disabled={!selectedProfileId || isQueueLimitReached}
                               onClick={() => setIsExpanded(true)}
                               onFocus={() => setIsExpanded(true)}
                             />
@@ -257,7 +227,7 @@ export function FloatingGenerateBox({
                         <FormControl>
                           <motion.div
                             animate={{
-                              height: isExpanded ? 'auto' : '32px',
+                              height: isExpanded ? 'auto' : '37px',
                             }}
                             transition={{ duration: 0.15, ease: 'easeOut' }}
                             style={{ overflow: 'hidden' }}
@@ -277,10 +247,10 @@ export function FloatingGenerateBox({
                               placeholder="e.g. very happy and excited"
                               className="resize-none bg-transparent border-none focus-visible:ring-0 focus-visible:ring-offset-0 focus:outline-none focus:ring-0 outline-none ring-0 rounded-2xl text-sm placeholder:text-muted-foreground/60 w-full"
                               style={{
-                                minHeight: isExpanded ? '100px' : '32px',
+                                minHeight: isExpanded ? '115px' : '37px',
                                 maxHeight: '300px',
                               }}
-                              disabled={!selectedProfileId}
+                              disabled={!selectedProfileId || isQueueLimitReached}
                               onClick={() => setIsExpanded(true)}
                               onFocus={() => setIsExpanded(true)}
                             />
@@ -297,7 +267,7 @@ export function FloatingGenerateBox({
                 <div className="group relative">
                   <Button
                     type="submit"
-                    disabled={isPending || !selectedProfileId}
+                    disabled={isPending || !selectedProfileId || isQueueLimitReached}
                     className="h-10 w-10 rounded-full bg-accent hover:bg-accent/90 hover:scale-105 text-accent-foreground shadow-lg hover:shadow-accent/50 transition-all duration-200"
                     size="icon"
                   >
@@ -310,6 +280,8 @@ export function FloatingGenerateBox({
                   <span className="pointer-events-none absolute bottom-full left-1/2 -translate-x-1/2 mb-2 whitespace-nowrap rounded-md bg-popover px-3 py-1.5 text-xs text-popover-foreground border border-border opacity-0 transition-opacity group-hover:opacity-100 z-[9999]">
                     {isPending
                       ? 'Generating...'
+                      : isQueueLimitReached
+                        ? 'Queue full (3 max)'
                       : !selectedProfileId
                         ? 'Select a voice profile first'
                         : 'Generate speech'}
@@ -350,6 +322,16 @@ export function FloatingGenerateBox({
             </div>
 
             <AnimatePresence>
+              {isQueueLimitReached && (
+                <motion.div
+                  initial={{ opacity: 0, y: -4 }}
+                  animate={{ opacity: 1, y: 0 }}
+                  exit={{ opacity: 0, y: -4 }}
+                  className="mt-2 px-2 text-xs text-amber-500"
+                >
+                  Queue full: 3 active jobs max for this user.
+                </motion.div>
+              )}
               <motion.div
                 initial={{ height: 0, opacity: 0 }}
                 animate={{ height: 'auto', opacity: 1 }}
diff --git a/app/src/components/Generation/GenerationForm.tsx b/app/src/components/Generation/GenerationForm.tsx
index 31b100f..f0b77ee 100644
--- a/app/src/components/Generation/GenerationForm.tsx
+++ b/app/src/components/Generation/GenerationForm.tsx
@@ -105,7 +105,7 @@ export function GenerationForm() {
                 render={({ field }) => (
                   <FormItem>
                     <FormLabel>Language</FormLabel>
-                    <Select onValueChange={field.onChange} defaultValue={field.value}>
+                    <Select onValueChange={field.onChange} value={field.value}>
                       <FormControl>
                         <SelectTrigger>
                           <SelectValue />
@@ -130,7 +130,7 @@ export function GenerationForm() {
                 render={({ field }) => (
                   <FormItem>
                     <FormLabel>Model Size</FormLabel>
-                    <Select onValueChange={field.onChange} defaultValue={field.value}>
+                    <Select onValueChange={field.onChange} value={field.value}>
                       <FormControl>
                         <SelectTrigger>
                           <SelectValue />
diff --git a/app/src/components/History/HistoryTable.tsx b/app/src/components/History/HistoryTable.tsx
index e572f69..abecb80 100644
--- a/app/src/components/History/HistoryTable.tsx
+++ b/app/src/components/History/HistoryTable.tsx
@@ -1,5 +1,6 @@
 import {
   AudioWaveform,
+  Copy,
   Download,
   FileArchive,
   Loader2,
@@ -251,8 +252,11 @@ export function HistoryTable() {
             {history.map((gen) => {
               const isCurrentlyPlaying = currentAudioId === gen.id && isPlaying;
               return (
+                // biome-ignore lint/a11y/useSemanticElements: Complex flex layout requires div wrapper
                 <div
                   key={gen.id}
+                  role="button"
+                  tabIndex={0}
                   className={cn(
                     'flex items-stretch gap-4 h-26 border rounded-md p-3 bg-card hover:bg-muted/70 transition-colors text-left w-full',
                     isCurrentlyPlaying && 'bg-muted/70',
@@ -265,6 +269,16 @@ export function HistoryTable() {
                     }
                     handlePlay(gen.id, gen.text, gen.profile_id);
                   }}
+                  onKeyDown={(e) => {
+                    // Keys pressed on nested controls (textarea, copy button, actions menu) bubble up
+                    // here; leave them alone so those controls keep their own Enter/Space behaviour.
+                    if (e.target !== e.currentTarget) return;
+                    if (e.key !== 'Enter' && e.key !== ' ') return;
+                    // Card itself is focused: act like a button and stop Space from scrolling the list.
+                    e.preventDefault();
+                    if (window.getSelection()?.toString()) return; // don't interrupt an active text selection
+                    handlePlay(gen.id, gen.text, gen.profile_id);
+                  }}
                 >
                   {/* Waveform icon */}
                   <div className="flex items-center shrink-0">
@@ -288,17 +302,35 @@ export function HistoryTable() {
                   </div>
 
                   {/* Right side - Transcript textarea */}
-                  <div className="flex-1 min-w-0 flex">
+                  <div className="flex-1 min-w-0 flex relative group">
                     <Textarea
                       value={gen.text}
-                      className="flex-1 resize-none text-sm text-muted-foreground select-text"
+                      className="flex-1 resize-none text-sm text-muted-foreground select-text pr-10"
                       readOnly
                     />
+                    <Button
+                      variant="ghost"
+                      size="icon"
+                      className="absolute top-1 right-1 h-7 w-7 opacity-0 group-hover:opacity-100 transition-opacity bg-background/80 hover:bg-background"
+                      onClick={(e) => {
+                        e.stopPropagation(); // keep the click from also triggering playback on the card
+                        // writeText is async and may reject; report the outcome that actually happened.
+                        navigator.clipboard.writeText(gen.text).then(
+                          () => toast({ title: 'Copied to clipboard', description: 'Text copied successfully' }),
+                          () => toast({ title: 'Copy failed', description: 'Could not write to the clipboard' }),
+                        );
+                      }}
+                      aria-label="Copy text"
+                    >
+                      <Copy className="h-3.5 w-3.5" />
+                    </Button>
                   </div>
 
                   {/* Far right - Ellipsis actions */}
+                  {/* biome-ignore lint/a11y/noStaticElementInteractions: Event handlers prevent propagation to parent card */}
                   <div
                     className="w-10 shrink-0 flex justify-end"
+                    role="presentation"
                     onMouseDown={(e) => e.stopPropagation()}
                     onClick={(e) => e.stopPropagation()}
                   >
diff --git a/app/src/components/MainEditor/MainEditor.tsx b/app/src/components/MainEditor/MainEditor.tsx
index 9d597b1..d5a3439 100644
--- a/app/src/components/MainEditor/MainEditor.tsx
+++ b/app/src/components/MainEditor/MainEditor.tsx
@@ -26,11 +26,17 @@ export function MainEditor() {
   const setDialogOpen = useUIStore((state) => state.setProfileDialogOpen);
   const importProfile = useImportProfile();
   const fileInputRef = useRef<HTMLInputElement>(null);
+  const [importWarningOpen, setImportWarningOpen] = useState(false);
   const [importDialogOpen, setImportDialogOpen] = useState(false);
   const [selectedFile, setSelectedFile] = useState<File | null>(null);
   const { toast } = useToast();
 
   const handleImportClick = () => {
+    setImportWarningOpen(true);
+  };
+
+  const handleImportProceed = () => {
+    setImportWarningOpen(false);
     fileInputRef.current?.click();
   };
 
@@ -131,7 +137,29 @@ export function MainEditor() {
       {/* Floating Generate Box */}
       <FloatingGenerateBox isPlayerOpen={!!audioUrl} />
 
-      {/* Import Dialog */}
+      {/* Import Warning Dialog */}
+      <Dialog open={importWarningOpen} onOpenChange={setImportWarningOpen}>
+        <DialogContent>
+          <DialogHeader>
+            <DialogTitle>Import Voice Profile</DialogTitle>
+            <DialogDescription>
+              You can only import voice profiles that were exported from another instance of Voicebox.
+              The file must be a <code className="text-xs bg-muted px-1 py-0.5 rounded">.voicebox.zip</code> file.
+            </DialogDescription>
+          </DialogHeader>
+          <DialogFooter>
+            <Button variant="outline" onClick={() => setImportWarningOpen(false)}>
+              Cancel
+            </Button>
+            <Button onClick={handleImportProceed}>
+              <Upload className="mr-2 h-4 w-4" />
+              Import
+            </Button>
+          </DialogFooter>
+        </DialogContent>
+      </Dialog>
+
+      {/* Import Confirmation Dialog */}
       <Dialog open={importDialogOpen} onOpenChange={setImportDialogOpen}>
         <DialogContent>
           <DialogHeader>
diff --git a/app/src/components/ServerSettings/ModelManagement.tsx b/app/src/components/ServerSettings/ModelManagement.tsx
index 4a5fd43..31b98bf 100644
--- a/app/src/components/ServerSettings/ModelManagement.tsx
+++ b/app/src/components/ServerSettings/ModelManagement.tsx
@@ -1,6 +1,6 @@
 import { useMutation, useQuery, useQueryClient } from '@tanstack/react-query';
-import { Download, Loader2, Trash2 } from 'lucide-react';
-import { useCallback, useState } from 'react';
+import { Download, Loader2, Trash2, X } from 'lucide-react';
+import { useEffect, useState } from 'react';
 import {
   AlertDialog,
   AlertDialogAction,
@@ -16,47 +16,37 @@ import { Button } from '@/components/ui/button';
 import { Card, CardContent, CardDescription, CardHeader, CardTitle } from '@/components/ui/card';
 import { useToast } from '@/components/ui/use-toast';
 import { apiClient } from '@/lib/api/client';
-import { useModelDownloadToast } from '@/lib/hooks/useModelDownloadToast';
 
 export function ModelManagement() {
   const { toast } = useToast();
   const queryClient = useQueryClient();
   const [downloadingModel, setDownloadingModel] = useState<string | null>(null);
-  const [downloadingDisplayName, setDownloadingDisplayName] = useState<string | null>(null);
 
   const { data: modelStatus, isLoading } = useQuery({
     queryKey: ['modelStatus'],
-    queryFn: async () => {
-      console.log('[Query] Fetching model status');
-      const result = await apiClient.getModelStatus();
-      console.log('[Query] Model status fetched:', result);
-      return result;
-    },
-    refetchInterval: 5000, // Refresh every 5 seconds
+    queryFn: () => apiClient.getModelStatus(),
+    refetchInterval: 5000,
   });
 
-  // Callbacks for download completion
-  const handleDownloadComplete = useCallback(() => {
-    console.log('[ModelManagement] Download complete, clearing state');
-    setDownloadingModel(null);
-    setDownloadingDisplayName(null);
-    queryClient.invalidateQueries({ queryKey: ['modelStatus'] });
-  }, [queryClient]);
+  // Clear local downloading state when server reports the model is no longer downloading
+  useEffect(() => {
+    if (downloadingModel && modelStatus) {
+      const model = modelStatus.models.find((m) => m.model_name === downloadingModel);
+      if (model && !model.downloading) {
+        setDownloadingModel(null);
+      }
+    }
+  }, [downloadingModel, modelStatus]);
 
-  const handleDownloadError = useCallback(() => {
-    console.log('[ModelManagement] Download error, clearing state');
+  const handleCancel = async (modelName: string) => {
+    try {
+      await apiClient.cancelModelDownload(modelName);
+    } catch {
+      // Ignore errors — the download may have already finished
+    }
     setDownloadingModel(null);
-    setDownloadingDisplayName(null);
-  }, []);
-
-  // Use progress toast hook for the downloading model
-  useModelDownloadToast({
-    modelName: downloadingModel || '',
-    displayName: downloadingDisplayName || '',
-    enabled: !!downloadingModel && !!downloadingDisplayName,
-    onComplete: handleDownloadComplete,
-    onError: handleDownloadError,
-  });
+    queryClient.invalidateQueries({ queryKey: ['modelStatus'] });
+  };
 
   const [deleteDialogOpen, setDeleteDialogOpen] = useState(false);
   const [modelToDelete, setModelToDelete] = useState<{
@@ -66,31 +56,12 @@ export function ModelManagement() {
   } | null>(null);
 
   const handleDownload = async (modelName: string) => {
-    console.log('[Download] Button clicked for:', modelName, 'at', new Date().toISOString());
-    
-    // Find display name
-    const model = modelStatus?.models.find((m) => m.model_name === modelName);
-    const displayName = model?.display_name || modelName;
-    
     try {
-      // IMPORTANT: Call the API FIRST before setting state
-      // Setting state enables the SSE EventSource in useModelDownloadToast,
-      // which can block/delay the download fetch due to HTTP/1.1 connection limits
-      console.log('[Download] Calling download API for:', modelName);
-      const result = await apiClient.triggerModelDownload(modelName);
-      console.log('[Download] Download API responded:', result);
-      
-      // NOW set state to enable SSE tracking (after download has started on backend)
+      await apiClient.triggerModelDownload(modelName);
       setDownloadingModel(modelName);
-      setDownloadingDisplayName(displayName);
-      
-      // Download initiated successfully - state will be cleared when SSE reports completion
-      // or by the polling interval detecting the model is downloaded
       queryClient.invalidateQueries({ queryKey: ['modelStatus'] });
     } catch (error) {
-      console.error('[Download] Download failed:', error);
       setDownloadingModel(null);
-      setDownloadingDisplayName(null);
       toast({
         title: 'Download failed',
         description: error instanceof Error ? error.message : 'Unknown error',
@@ -179,6 +150,7 @@ export function ModelManagement() {
                         setDeleteDialogOpen(true);
                       }}
                       isDownloading={downloadingModel === model.model_name}
+                      onCancel={() => handleCancel(model.model_name)}
                       formatSize={formatSize}
                     />
                   ))}
@@ -207,6 +179,7 @@ export function ModelManagement() {
                         setDeleteDialogOpen(true);
                       }}
                       isDownloading={downloadingModel === model.model_name}
+                      onCancel={() => handleCancel(model.model_name)}
                       formatSize={formatSize}
                     />
                   ))}
@@ -271,11 +244,12 @@ interface ModelItemProps {
   };
   onDownload: () => void;
   onDelete: () => void;
+  onCancel: () => void;
   isDownloading: boolean;  // Local state - true if user just clicked download
   formatSize: (sizeMb?: number) => string;
 }
 
-function ModelItem({ model, onDownload, onDelete, isDownloading, formatSize }: ModelItemProps) {
+function ModelItem({ model, onDownload, onDelete, onCancel, isDownloading, formatSize }: ModelItemProps) {
   // Use server's downloading state OR local state (for immediate feedback before server updates)
   const showDownloading = model.downloading || isDownloading;
   
@@ -319,10 +293,15 @@ function ModelItem({ model, onDownload, onDelete, isDownloading, formatSize }: M
             </Button>
           </div>
         ) : showDownloading ? (
-          <Button size="sm" variant="outline" disabled>
-            <Loader2 className="h-4 w-4 mr-2 animate-spin" />
-            Downloading...
-          </Button>
+          <div className="flex items-center gap-2">
+            <div className="flex items-center gap-1 text-sm text-muted-foreground">
+              <Loader2 className="h-4 w-4 animate-spin" />
+              <span>Downloading...</span>
+            </div>
+            <Button size="sm" variant="outline" onClick={onCancel} title="Cancel download">
+              <X className="h-4 w-4" />
+            </Button>
+          </div>
         ) : (
           <Button size="sm" onClick={onDownload} variant="outline">
             <Download className="h-4 w-4 mr-2" />
diff --git a/app/src/components/ServerSettings/UpdateStatus.tsx b/app/src/components/ServerSettings/UpdateStatus.tsx
index a3d832a..8e2cad4 100644
--- a/app/src/components/ServerSettings/UpdateStatus.tsx
+++ b/app/src/components/ServerSettings/UpdateStatus.tsx
@@ -11,6 +11,7 @@ export function UpdateStatus() {
   const platform = usePlatform();
   const { status, checkForUpdates, downloadAndInstall, restartAndInstall } = useAutoUpdater(false);
   const [currentVersion, setCurrentVersion] = useState<string>('');
+  const buildInfo = platform.metadata.getBuildInfo();
 
   useEffect(() => {
     platform.metadata
@@ -28,7 +29,12 @@ export function UpdateStatus() {
         <div className="flex items-center justify-between">
           <div className="space-y-1">
             <div className="text-sm font-medium">Current Version</div>
-            <div className="text-sm text-muted-foreground">v{currentVersion}</div>
+            <div className="text-sm text-muted-foreground">
+              v{currentVersion}
+              {buildInfo && (
+                <span className="ml-2 text-xs font-mono opacity-60">{buildInfo}</span>
+              )}
+            </div>
           </div>
           <Button
             onClick={checkForUpdates}
diff --git a/app/src/components/Sidebar.tsx b/app/src/components/Sidebar.tsx
index a849344..bc5ea01 100644
--- a/app/src/components/Sidebar.tsx
+++ b/app/src/components/Sidebar.tsx
@@ -1,7 +1,8 @@
-import { Link, useMatchRoute } from '@tanstack/react-router';
+import { Link, useRouterState } from '@tanstack/react-router';
 import { Box, BookOpen, Loader2, Mic, Server, Speaker, Volume2 } from 'lucide-react';
 import voiceboxLogo from '@/assets/voicebox-logo.png';
 import { cn } from '@/lib/utils/cn';
+import { usePlatform } from '@/platform/PlatformContext';
 import { useGenerationStore } from '@/stores/generationStore';
 import { usePlayerStore } from '@/stores/playerStore';
 
@@ -9,20 +10,23 @@ interface SidebarProps {
   isMacOS?: boolean;
 }
 
-const tabs = [
-  { id: 'main', path: '/', icon: Volume2, label: 'Generate' },
-  { id: 'stories', path: '/stories', icon: BookOpen, label: 'Stories' },
-  { id: 'voices', path: '/voices', icon: Mic, label: 'Voices' },
-  { id: 'audio', path: '/audio', icon: Speaker, label: 'Audio' },
-  { id: 'models', path: '/models', icon: Box, label: 'Models' },
-  { id: 'server', path: '/server', icon: Server, label: 'Server' },
+const allTabs = [
+  { id: 'main', path: '/', icon: Volume2, label: 'Generate', tauriOnly: false },
+  { id: 'stories', path: '/stories', icon: BookOpen, label: 'Stories', tauriOnly: false },
+  { id: 'voices', path: '/voices', icon: Mic, label: 'Voices', tauriOnly: false },
+  { id: 'audio', path: '/audio', icon: Speaker, label: 'Audio', tauriOnly: true },
+  { id: 'models', path: '/models', icon: Box, label: 'Models', tauriOnly: false },
+  { id: 'server', path: '/server', icon: Server, label: 'Server', tauriOnly: false },
 ];
 
 export function Sidebar({ isMacOS }: SidebarProps) {
+  const { metadata } = usePlatform();
   const isGenerating = useGenerationStore((state) => state.isGenerating);
   const audioUrl = usePlayerStore((state) => state.audioUrl);
   const isPlayerVisible = !!audioUrl;
-  const matchRoute = useMatchRoute();
+  const pathname = useRouterState({ select: (state) => state.location.pathname });
+
+  const tabs = allTabs.filter((tab) => !tab.tauriOnly || metadata.isTauri);
 
   return (
     <div
@@ -41,10 +45,9 @@ export function Sidebar({ isMacOS }: SidebarProps) {
         {tabs.map((tab) => {
           const Icon = tab.icon;
           // For index route, use exact match; for others, use default matching
-          const isActive =
-            tab.path === '/'
-              ? matchRoute({ to: '/', exact: true })
-              : matchRoute({ to: tab.path });
+          const isActive = tab.path === '/'
+            ? pathname === '/'
+            : pathname === tab.path || pathname.startsWith(`${tab.path}/`); // match whole path segments only
 
           return (
             <Link
diff --git a/app/src/components/StoriesTab/StoryTrackEditor.tsx b/app/src/components/StoriesTab/StoryTrackEditor.tsx
index 74dbde2..bc2e040 100644
--- a/app/src/components/StoriesTab/StoryTrackEditor.tsx
+++ b/app/src/components/StoriesTab/StoryTrackEditor.tsx
@@ -313,7 +313,7 @@ export function StoryTrackEditor({ storyId, items }: StoryTrackEditorProps) {
     }
   }, [isResizing, handleResizeMove, handleResizeEnd]);
 
-  const handleTimelineClick = (e: React.MouseEvent<HTMLDivElement>) => {
+  const handleTimelineClick = (e: React.MouseEvent<HTMLElement>) => {
     if (!tracksRef.current || draggingItem || trimmingItem) return;
     const rect = tracksRef.current.getBoundingClientRect();
     const x = e.clientX - rect.left + tracksRef.current.scrollLeft;
diff --git a/app/src/components/VoiceProfiles/AudioSampleRecording.tsx b/app/src/components/VoiceProfiles/AudioSampleRecording.tsx
index 4f2db4e..06c0bab 100644
--- a/app/src/components/VoiceProfiles/AudioSampleRecording.tsx
+++ b/app/src/components/VoiceProfiles/AudioSampleRecording.tsx
@@ -3,6 +3,8 @@ import { memo, useEffect, useState } from 'react';
 import { Visualizer } from 'react-sound-visualizer';
 import { Button } from '@/components/ui/button';
 import { FormControl, FormItem, FormMessage } from '@/components/ui/form';
+import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from '@/components/ui/select';
+import { type AudioInputDevice } from '@/lib/hooks/useAudioDevices';
 import { formatAudioDuration } from '@/lib/utils/audio';
 
 const MemoizedWaveform = memo(function MemoizedWaveform({
@@ -37,7 +39,12 @@ interface AudioSampleRecordingProps {
   onPlayPause: () => void;
   isPlaying: boolean;
   isTranscribing?: boolean;
+  transcribeLabel?: string;
   showWaveform?: boolean;
+  // Device selection
+  audioDevices?: AudioInputDevice[];
+  selectedDeviceId?: string | null;
+  onDeviceChange?: (deviceId: string | null) => void;
 }
 
 export function AudioSampleRecording({
@@ -51,19 +58,33 @@ export function AudioSampleRecording({
   onPlayPause,
   isPlaying,
   isTranscribing = false,
+  transcribeLabel,
   showWaveform = true,
+  audioDevices,
+  selectedDeviceId,
+  onDeviceChange,
 }: AudioSampleRecordingProps) {
   const [audioStream, setAudioStream] = useState<MediaStream | null>(null);
 
-  // Request microphone access when component mounts
+  // Request microphone access for waveform preview — re-acquire when selected device changes
   useEffect(() => {
     if (!showWaveform) return;
 
     let stream: MediaStream | null = null;
+    let cancelled = false;
+
+    const audioConstraints: MediaTrackConstraints = {};
+    if (selectedDeviceId) {
+      audioConstraints.deviceId = { exact: selectedDeviceId };
+    }
 
     navigator.mediaDevices
-      .getUserMedia({ audio: true, video: false })
+      .getUserMedia({ audio: audioConstraints, video: false })
       .then((s) => {
+        if (cancelled) {
+          s.getTracks().forEach((t) => t.stop());
+          return;
+        }
         stream = s;
         setAudioStream(s);
       })
@@ -72,18 +93,45 @@ export function AudioSampleRecording({
       });
 
     return () => {
+      cancelled = true;
       if (stream) {
         stream.getTracks().forEach((track) => {
           track.stop();
         });
       }
+      setAudioStream(null);
     };
-  }, [showWaveform]);
+  }, [showWaveform, selectedDeviceId]);
+
+  const showDevicePicker =
+    onDeviceChange && audioDevices && audioDevices.length > 1 && !isRecording && !file;
 
   return (
     <FormItem>
       <FormControl>
         <div className="space-y-4">
+          {showDevicePicker && (
+            <div className="flex items-center gap-2">
+              <Mic className="h-4 w-4 text-muted-foreground shrink-0" />
+              <Select
+                value={selectedDeviceId ?? 'default'}
+                onValueChange={(val) => onDeviceChange(val === 'default' ? null : val)}
+              >
+                <SelectTrigger className="h-8 text-xs">
+                  <SelectValue placeholder="System default microphone" />
+                </SelectTrigger>
+                <SelectContent>
+                  <SelectItem value="default" className="text-xs">System default</SelectItem>
+                  {audioDevices.map((d) => (
+                    <SelectItem key={d.deviceId} value={d.deviceId} className="text-xs">
+                      {d.label}
+                    </SelectItem>
+                  ))}
+                </SelectContent>
+              </Select>
+            </div>
+          )}
+
           {!isRecording && !file && (
             <div className="relative flex flex-col items-center justify-center gap-4 p-4 border-2 border-dashed rounded-lg min-h-[180px] overflow-hidden">
               {showWaveform && audioStream && (
@@ -150,7 +198,7 @@ export function AudioSampleRecording({
                   className="flex items-center gap-2"
                 >
                   <Mic className="h-4 w-4" />
-                  {isTranscribing ? 'Transcribing...' : 'Transcribe'}
+                  {transcribeLabel ?? (isTranscribing ? 'Transcribing...' : 'Transcribe')}
                 </Button>
                 <Button
                   type="button"
diff --git a/app/src/components/VoiceProfiles/AudioSampleSystem.tsx b/app/src/components/VoiceProfiles/AudioSampleSystem.tsx
index 0cc892b..d957625 100644
--- a/app/src/components/VoiceProfiles/AudioSampleSystem.tsx
+++ b/app/src/components/VoiceProfiles/AudioSampleSystem.tsx
@@ -14,6 +14,7 @@ interface AudioSampleSystemProps {
   onPlayPause: () => void;
   isPlaying: boolean;
   isTranscribing?: boolean;
+  transcribeLabel?: string;
 }
 
 export function AudioSampleSystem({
@@ -27,6 +28,7 @@ export function AudioSampleSystem({
   onPlayPause,
   isPlaying,
   isTranscribing = false,
+  transcribeLabel,
 }: AudioSampleSystemProps) {
   return (
     <FormItem>
@@ -88,7 +90,7 @@ export function AudioSampleSystem({
                   className="flex items-center gap-2"
                 >
                   <Mic className="h-4 w-4" />
-                  {isTranscribing ? 'Transcribing...' : 'Transcribe'}
+                  {transcribeLabel ?? (isTranscribing ? 'Transcribing...' : 'Transcribe')}
                 </Button>
                 <Button
                   type="button"
diff --git a/app/src/components/VoiceProfiles/AudioSampleUpload.tsx b/app/src/components/VoiceProfiles/AudioSampleUpload.tsx
index c66ded1..5c48b91 100644
--- a/app/src/components/VoiceProfiles/AudioSampleUpload.tsx
+++ b/app/src/components/VoiceProfiles/AudioSampleUpload.tsx
@@ -12,6 +12,7 @@ interface AudioSampleUploadProps {
   isValidating?: boolean;
   isTranscribing?: boolean;
   isDisabled?: boolean;
+  transcribeLabel?: string;
   fieldName: string;
 }
 
@@ -24,6 +25,7 @@ export function AudioSampleUpload({
   isValidating = false,
   isTranscribing = false,
   isDisabled = false,
+  transcribeLabel,
   fieldName,
 }: AudioSampleUploadProps) {
   const [isDragging, setIsDragging] = useState(false);
@@ -48,8 +50,8 @@ export function AudioSampleUpload({
             }}
             className="hidden"
           />
-          <div
-            role="button"
+          <button
+            type="button"
             tabIndex={0}
             onDragOver={(e) => {
               e.preventDefault();
@@ -121,7 +123,7 @@ export function AudioSampleUpload({
                     className="flex items-center gap-2"
                   >
                     <Mic className="h-4 w-4" />
-                    {isTranscribing ? 'Transcribing...' : 'Transcribe'}
+                    {transcribeLabel ?? (isTranscribing ? 'Transcribing...' : 'Transcribe')}
                   </Button>
                   <Button
                     type="button"
@@ -138,7 +140,7 @@ export function AudioSampleUpload({
                 </div>
               </>
             )}
-          </div>
+          </button>
         </div>
       </FormControl>
       <FormMessage />
diff --git a/app/src/components/VoiceProfiles/ProfileForm.tsx b/app/src/components/VoiceProfiles/ProfileForm.tsx
index f4fc571..d8ba70a 100644
--- a/app/src/components/VoiceProfiles/ProfileForm.tsx
+++ b/app/src/components/VoiceProfiles/ProfileForm.tsx
@@ -31,6 +31,7 @@ import { Tabs, TabsContent, TabsList, TabsTrigger } from '@/components/ui/tabs';
 import { Textarea } from '@/components/ui/textarea';
 import { useToast } from '@/components/ui/use-toast';
 import { LANGUAGE_CODES, LANGUAGE_OPTIONS, type LanguageCode } from '@/lib/constants/languages';
+import { useAudioDevices } from '@/lib/hooks/useAudioDevices';
 import { useAudioPlayer } from '@/lib/hooks/useAudioPlayer';
 import { useAudioRecording } from '@/lib/hooks/useAudioRecording';
 import {
@@ -43,6 +44,9 @@ import {
 } from '@/lib/hooks/useProfiles';
 import { useSystemAudioCapture } from '@/lib/hooks/useSystemAudioCapture';
 import { useTranscription } from '@/lib/hooks/useTranscription';
+import { useWhisperModelReady } from '@/lib/hooks/useWhisperModelReady';
+import { useModelDownloadToast } from '@/lib/hooks/useModelDownloadToast';
+import { ModelDownloadingError } from '@/lib/api/client';
 import { formatAudioDuration, getAudioDuration } from '@/lib/utils/audio';
 import { usePlatform } from '@/platform/PlatformContext';
 import { useServerStore } from '@/stores/serverStore';
@@ -116,8 +120,25 @@ export function ProfileForm() {
   const uploadAvatar = useUploadAvatar();
   const deleteAvatar = useDeleteAvatar();
   const transcribe = useTranscription();
+  const { ready: whisperReady, downloading: whisperModelDownloading } = useWhisperModelReady({ enabled: open });
   const { toast } = useToast();
-  const [sampleMode, setSampleMode] = useState<'upload' | 'record' | 'system'>('record');
+  const [whisperDownloading, setWhisperDownloading] = useState<string | null>(null);
+  useModelDownloadToast({
+    modelName: whisperDownloading || '',
+    displayName: 'Downloading Whisper model',
+    enabled: !!whisperDownloading,
+    onComplete: () => setWhisperDownloading(null),
+    onError: () => setWhisperDownloading(null),
+  });
+  // Compute the transcribe button label based on model availability
+  const transcribeLabel = transcribe.isPending
+    ? 'Transcribing...'
+    : !!whisperDownloading || whisperModelDownloading
+      ? 'Downloading model...'
+      : !whisperReady
+        ? 'No model available'
+        : undefined; // undefined = use default "Transcribe"
+  const [sampleMode, setSampleMode] = useState<'upload' | 'record' | 'system'>('upload');
   const [audioDuration, setAudioDuration] = useState<number | null>(null);
   const [isValidatingAudio, setIsValidatingAudio] = useState(false);
   const [avatarPreview, setAvatarPreview] = useState<string | null>(null);
@@ -183,6 +204,8 @@ export function ProfileForm() {
     }
   }, [selectedFile, form]);
 
+  const { devices: audioDevices, selectedDeviceId, setSelectedDeviceId } = useAudioDevices();
+
   const {
     isRecording,
     duration,
@@ -192,6 +215,7 @@ export function ProfileForm() {
     cancelRecording,
   } = useAudioRecording({
     maxDurationSeconds: 29,
+    deviceId: selectedDeviceId ?? undefined,
     onRecordingComplete: (blob, recordedDuration) => {
       const file = new File([blob], `recording-${Date.now()}.webm`, {
         type: blob.type || 'audio/webm',
@@ -314,7 +338,7 @@ export function ProfileForm() {
         referenceText: undefined,
         avatarFile: undefined,
       });
-      setSampleMode('record');
+      setSampleMode('upload');
       setAvatarPreview(null);
     }
   }, [editingProfile, profileFormDraft, open, form]);
@@ -336,6 +360,14 @@ export function ProfileForm() {
 
       form.setValue('referenceText', result.text, { shouldValidate: true });
     } catch (error) {
+      if (error instanceof ModelDownloadingError) {
+        setWhisperDownloading(error.modelName);
+        toast({
+          title: 'Whisper model downloading',
+          description: 'The transcription model is being downloaded. You can try again once it finishes.',
+        });
+        return;
+      }
       toast({
         title: 'Transcription failed',
         description: error instanceof Error ? error.message : 'Failed to transcribe audio',
@@ -632,7 +664,7 @@ export function ProfileForm() {
                       sampleFile: undefined,
                       referenceText: '',
                     });
-                    setSampleMode('record');
+                    setSampleMode('upload');
                   }}
                 >
                   <X className="h-3 w-3 mr-1" />
@@ -695,11 +727,12 @@ export function ProfileForm() {
                                 onPlayPause={handlePlayPause}
                                 isPlaying={isPlaying}
                                 isValidating={isValidatingAudio}
-                                isTranscribing={transcribe.isPending}
+                                isTranscribing={transcribe.isPending || !!whisperDownloading || !whisperReady}
                                 isDisabled={
                                   audioDuration !== null &&
                                   audioDuration > MAX_AUDIO_DURATION_SECONDS
                                 }
+                                transcribeLabel={transcribeLabel}
                                 fieldName={name}
                               />
                             )}
@@ -721,7 +754,11 @@ export function ProfileForm() {
                                 onTranscribe={handleTranscribe}
                                 onPlayPause={handlePlayPause}
                                 isPlaying={isPlaying}
-                                isTranscribing={transcribe.isPending}
+                                isTranscribing={transcribe.isPending || !!whisperDownloading || !whisperReady}
+                                transcribeLabel={transcribeLabel}
+                                audioDevices={audioDevices}
+                                selectedDeviceId={selectedDeviceId}
+                                onDeviceChange={setSelectedDeviceId}
                               />
                             )}
                           />
@@ -743,7 +780,8 @@ export function ProfileForm() {
                                   onTranscribe={handleTranscribe}
                                   onPlayPause={handlePlayPause}
                                   isPlaying={isPlaying}
-                                  isTranscribing={transcribe.isPending}
+                                  isTranscribing={transcribe.isPending || !!whisperDownloading || !whisperReady}
+                                  transcribeLabel={transcribeLabel}
                                 />
                               )}
                             />
diff --git a/app/src/components/VoiceProfiles/SampleUpload.tsx b/app/src/components/VoiceProfiles/SampleUpload.tsx
index 3c53b7d..a150f6a 100644
--- a/app/src/components/VoiceProfiles/SampleUpload.tsx
+++ b/app/src/components/VoiceProfiles/SampleUpload.tsx
@@ -22,6 +22,7 @@ import {
 import { Tabs, TabsContent, TabsList, TabsTrigger } from '@/components/ui/tabs';
 import { Textarea } from '@/components/ui/textarea';
 import { useToast } from '@/components/ui/use-toast';
+import { useAudioDevices } from '@/lib/hooks/useAudioDevices';
 import { useAudioPlayer } from '@/lib/hooks/useAudioPlayer';
 import { useAudioRecording } from '@/lib/hooks/useAudioRecording';
 import { useAddSample, useProfile } from '@/lib/hooks/useProfiles';
@@ -66,6 +67,8 @@ export function SampleUpload({ profileId, open, onOpenChange }: SampleUploadProp
 
   const selectedFile = form.watch('file');
 
+  const { devices: audioDevices, selectedDeviceId, setSelectedDeviceId } = useAudioDevices();
+
   const {
     isRecording,
     duration,
@@ -75,6 +78,7 @@ export function SampleUpload({ profileId, open, onOpenChange }: SampleUploadProp
     cancelRecording,
   } = useAudioRecording({
     maxDurationSeconds: 29,
+    deviceId: selectedDeviceId ?? undefined,
     onRecordingComplete: (blob, recordedDuration) => {
       // Convert blob to File object
       const file = new File([blob], `recording-${Date.now()}.webm`, {
@@ -285,6 +289,9 @@ export function SampleUpload({ profileId, open, onOpenChange }: SampleUploadProp
                       onPlayPause={handlePlayPause}
                       isPlaying={isPlaying}
                       isTranscribing={transcribe.isPending}
+                      audioDevices={audioDevices}
+                      selectedDeviceId={selectedDeviceId}
+                      onDeviceChange={setSelectedDeviceId}
                     />
                   )}
                 />
diff --git a/app/src/components/VoicesTab/VoicesTab.tsx b/app/src/components/VoicesTab/VoicesTab.tsx
index 12fedef..49a29bb 100644
--- a/app/src/components/VoicesTab/VoicesTab.tsx
+++ b/app/src/components/VoicesTab/VoicesTab.tsx
@@ -29,7 +29,7 @@ import { useUIStore } from '@/stores/uiStore';
 
 export function VoicesTab() {
   const { data: profiles, isLoading } = useProfiles();
-  const { data: historyData } = useHistory({ limit: 1000 });
+  const { data: historyData } = useHistory({ limit: 100 });
   const queryClient = useQueryClient();
   const setDialogOpen = useUIStore((state) => state.setProfileDialogOpen);
   const setEditingProfileId = useUIStore((state) => state.setEditingProfileId);
diff --git a/app/src/components/ui/checkbox.tsx b/app/src/components/ui/checkbox.tsx
index f423fef..e6d8b84 100644
--- a/app/src/components/ui/checkbox.tsx
+++ b/app/src/components/ui/checkbox.tsx
@@ -13,6 +13,7 @@ export interface CheckboxProps {
 const Checkbox = React.forwardRef<HTMLButtonElement, CheckboxProps>(
   ({ checked = false, onCheckedChange, disabled = false, className, id, ...props }, ref) => {
     return (
+      // biome-ignore lint/a11y/useSemanticElements: Custom checkbox component requires button for styling
       <button
         type="button"
         ref={ref}
diff --git a/app/src/global.d.ts b/app/src/global.d.ts
index d405eef..96eef23 100644
--- a/app/src/global.d.ts
+++ b/app/src/global.d.ts
@@ -1,3 +1,6 @@
 interface Window {
   __voiceboxServerStartedByApp?: boolean;
 }
+
+declare const __GIT_HASH__: string;
+declare const __GIT_COMMIT_COUNT__: number;
diff --git a/app/src/hooks/useAutoUpdater.ts b/app/src/hooks/useAutoUpdater.ts
deleted file mode 100644
index 7a9f169..0000000
--- a/app/src/hooks/useAutoUpdater.ts
+++ /dev/null
@@ -1,52 +0,0 @@
-import { useCallback, useEffect, useRef, useState } from 'react';
-import { usePlatform } from '@/platform/PlatformContext';
-import type { UpdateStatus } from '@/platform/types';
-
-// Re-export UpdateStatus for backwards compatibility
-export type { UpdateStatus };
-
-export function useAutoUpdater(checkOnMount = false) {
-  const platform = usePlatform();
-  const [status, setStatus] = useState<UpdateStatus>(platform.updater.getStatus());
-  const hasCheckedRef = useRef(false);
-
-  // Subscribe to updater status changes
-  useEffect(() => {
-    const unsubscribe = platform.updater.subscribe((newStatus) => {
-      setStatus(newStatus);
-    });
-    return unsubscribe;
-    // Empty dependency array - platform is stable from context
-    // eslint-disable-next-line react-hooks/exhaustive-deps
-  }, [platform.updater.subscribe]);
-
-  const checkForUpdates = useCallback(async () => {
-    await platform.updater.checkForUpdates();
-    // eslint-disable-next-line react-hooks/exhaustive-deps
-  }, [platform.updater.checkForUpdates]);
-
-  const downloadAndInstall = useCallback(async () => {
-    await platform.updater.downloadAndInstall();
-    // eslint-disable-next-line react-hooks/exhaustive-deps
-  }, [platform.updater.downloadAndInstall]);
-
-  const restartAndInstall = useCallback(async () => {
-    await platform.updater.restartAndInstall();
-    // eslint-disable-next-line react-hooks/exhaustive-deps
-  }, [platform.updater.restartAndInstall]);
-
-  useEffect(() => {
-    if (checkOnMount && platform.metadata.isTauri && !hasCheckedRef.current) {
-      hasCheckedRef.current = true;
-      checkForUpdates();
-    }
-    // eslint-disable-next-line react-hooks/exhaustive-deps
-  }, [platform.metadata.isTauricheckOnMountcheckForUpdates]);
-
-  return {
-    status,
-    checkForUpdates,
-    downloadAndInstall,
-    restartAndInstall,
-  };
-}
diff --git a/app/src/hooks/useAutoUpdater.tsx b/app/src/hooks/useAutoUpdater.tsx
index 8a6351f..27683e9 100644
--- a/app/src/hooks/useAutoUpdater.tsx
+++ b/app/src/hooks/useAutoUpdater.tsx
@@ -73,7 +73,7 @@ export function useAutoUpdater(options: boolean | UseAutoUpdaterOptions = false)
     }
     // Empty dependency array - only run once on mount
     // eslint-disable-next-line react-hooks/exhaustive-deps
-  }, [platform.metadata.isTauricheckOnMountcheckForUpdates]);
+  }, [platform.metadata.isTauri, checkOnMount, checkForUpdates]);
 
   // Show toast when update is available
   useEffect(() => {
diff --git a/app/src/index.css b/app/src/index.css
index 0638171..5ab338d 100644
--- a/app/src/index.css
+++ b/app/src/index.css
@@ -1,5 +1,10 @@
 @import "tailwindcss" source(".");
 
+@keyframes indeterminate {
+  0% { transform: translateX(-100%); }
+  100% { transform: translateX(400%); }
+}
+
 @theme {
   --radius-sm: calc(var(--radius) - 4px);
   --radius-md: calc(var(--radius) - 2px);
diff --git a/app/src/lib/api/client.ts b/app/src/lib/api/client.ts
index c5b079b..06ef7b8 100644
--- a/app/src/lib/api/client.ts
+++ b/app/src/lib/api/client.ts
@@ -6,6 +6,7 @@ import type {
   ProfileSampleResponse,
   GenerationRequest,
   GenerationResponse,
+  GenerationJobResponse,
   HistoryQuery,
   HistoryListResponse,
   HistoryResponse,
@@ -26,20 +27,81 @@ import type {
   StoryItemSplit,
 } from './types';
 
+export class ModelDownloadingError extends Error {
+  modelName: string;
+  constructor(message: string, modelName: string) {
+    super(message);
+    this.name = 'ModelDownloadingError';
+    this.modelName = modelName;
+  }
+}
+
+/** Lightweight health snapshot extracted from response headers. */
+export interface PiggybackHealth {
+  model_loaded: boolean;
+  model_size: string | null;
+  gpu_type: string | null;
+  backend: string | null;
+  timestamp: number;
+}
+
+/** Last health data piggybacked from any API response. */
+let _piggybackHealth: PiggybackHealth | null = null;
+let _lastApiEmissionTimestamp = 0;
+
+export function markApiEmission(): void {
+  _lastApiEmissionTimestamp = Date.now();
+}
+
+export function getLastApiEmissionTimestamp(): number {
+  return _lastApiEmissionTimestamp;
+}
+
+/** Read the latest piggybacked health (or null if stale / unavailable). */
+export function getPiggybackHealth(maxAgeMs = 60_000): PiggybackHealth | null {
+  if (!_piggybackHealth) return null;
+  if (Date.now() - _piggybackHealth.timestamp > maxAgeMs) return null;
+  return _piggybackHealth;
+}
+
+function extractHealthHeaders(response: Response): void {
+  const loaded = response.headers.get('X-Health-Model-Loaded');
+  if (loaded === null) return; // no health headers
+  _piggybackHealth = {
+    model_loaded: loaded === '1',
+    model_size: response.headers.get('X-Health-Model-Size'),
+    gpu_type: response.headers.get('X-Health-GPU-Type'),
+    backend: response.headers.get('X-Health-Backend'),
+    timestamp: Date.now(),
+  };
+}
+
 class ApiClient {
   private getBaseUrl(): string {
     const serverUrl = useServerStore.getState().serverUrl;
     return serverUrl;
   }
 
+  private async fetchWithTracking(url: string, options?: RequestInit): Promise<Response> {
+    const response = await fetch(url, options);
+    // Reached only when fetch resolved (the server answered, even with a non-2xx status); a network failure throws above
+    markApiEmission();
+    extractHealthHeaders(response);
+    return response;
+  }
+
   private async request<T>(endpoint: string, options?: RequestInit): Promise<T> {
     const url = `${this.getBaseUrl()}${endpoint}`;
-    const response = await fetch(url, {
+    const headers = new Headers(options?.headers);
+    const hasBody = options?.body !== undefined && options?.body !== null;
+    const isFormBody = typeof FormData !== 'undefined' && options?.body instanceof FormData;
+    if (hasBody && !isFormBody && !headers.has('Content-Type')) {
+      headers.set('Content-Type', 'application/json');
+    }
+
+    const response = await this.fetchWithTracking(url, {
       ...options,
-      headers: {
-        'Content-Type': 'application/json',
-        ...options?.headers,
-      },
+      headers,
     });
 
     if (!response.ok) {
@@ -96,7 +158,7 @@ class ApiClient {
     formData.append('file', file);
     formData.append('reference_text', referenceText);
 
-    const response = await fetch(url, {
+    const response = await this.fetchWithTracking(url, {
       method: 'POST',
       body: formData,
     });
@@ -133,7 +195,7 @@ class ApiClient {
 
   async exportProfile(profileId: string): Promise<Blob> {
     const url = `${this.getBaseUrl()}/profiles/${profileId}/export`;
-    const response = await fetch(url);
+    const response = await this.fetchWithTracking(url);
 
     if (!response.ok) {
       const error = await response.json().catch(() => ({
@@ -150,7 +212,7 @@ class ApiClient {
     const formData = new FormData();
     formData.append('file', file);
 
-    const response = await fetch(url, {
+    const response = await this.fetchWithTracking(url, {
       method: 'POST',
       body: formData,
     });
@@ -170,7 +232,7 @@ class ApiClient {
     const formData = new FormData();
     formData.append('file', file);
 
-    const response = await fetch(url, {
+    const response = await this.fetchWithTracking(url, {
       method: 'POST',
       body: formData,
     });
@@ -191,6 +253,36 @@ class ApiClient {
     });
   }
 
+  // Jobs
+  async listPendingJobs(): Promise<GenerationJobResponse[]> {
+    return this.request<GenerationJobResponse[]>('/jobs/pending');
+  }
+
+  async listJobs(query?: {
+    limit?: number;
+    offset?: number;
+    status?: string;
+  }): Promise<GenerationJobResponse[]> {
+    const params = new URLSearchParams();
+    if (query?.limit !== undefined) params.append('limit', String(query.limit));
+    if (query?.offset !== undefined) params.append('offset', String(query.offset));
+    if (query?.status) params.append('status', query.status);
+    const qs = params.toString();
+    return this.request<GenerationJobResponse[]>(qs ? `/jobs?${qs}` : '/jobs');
+  }
+
+  async cancelJob(jobId: string): Promise<{ status: string }> {
+    return this.request<{ status: string }>(`/jobs/${jobId}/cancel`, {
+      method: 'POST',
+    });
+  }
+
+  async forceCancelJob(jobId: string): Promise<{ status: string }> {
+    return this.request<{ status: string }>(`/jobs/${jobId}/cancel/force`, {
+      method: 'POST',
+    });
+  }
+
   // Generation
   async generateSpeech(data: GenerationRequest): Promise<GenerationResponse> {
     return this.request<GenerationResponse>('/generate', {
@@ -225,7 +317,7 @@ class ApiClient {
 
   async exportGeneration(generationId: string): Promise<Blob> {
     const url = `${this.getBaseUrl()}/history/${generationId}/export`;
-    const response = await fetch(url);
+    const response = await this.fetchWithTracking(url);
 
     if (!response.ok) {
       const error = await response.json().catch(() => ({
@@ -239,7 +331,7 @@ class ApiClient {
 
   async exportGenerationAudio(generationId: string): Promise<Blob> {
     const url = `${this.getBaseUrl()}/history/${generationId}/export-audio`;
-    const response = await fetch(url);
+    const response = await this.fetchWithTracking(url);
 
     if (!response.ok) {
       const error = await response.json().catch(() => ({
@@ -256,7 +348,7 @@ class ApiClient {
     const formData = new FormData();
     formData.append('file', file);
 
-    const response = await fetch(url, {
+    const response = await this.fetchWithTracking(url, {
       method: 'POST',
       body: formData,
     });
@@ -289,11 +381,21 @@ class ApiClient {
     }
 
     const url = `${this.getBaseUrl()}/transcribe`;
-    const response = await fetch(url, {
+    const response = await this.fetchWithTracking(url, {
       method: 'POST',
       body: formData,
     });
 
+    // 202 means the whisper model is being downloaded
+    if (response.status === 202) {
+      const body = await response.json().catch(() => ({ detail: {} }));
+      const detail = body.detail || {};
+      throw new ModelDownloadingError(
+        detail.message || 'Whisper model is being downloaded',
+        detail.model_name || 'whisper-base',
+      );
+    }
+
     if (!response.ok) {
       const error = await response.json().catch(() => ({
         detail: response.statusText,
@@ -319,6 +421,12 @@ class ApiClient {
     return result;
   }
 
+  async cancelModelDownload(modelName: string): Promise<{ message: string }> {
+    return this.request<{ message: string }>(`/models/cancel/${modelName}`, {
+      method: 'POST',
+    });
+  }
+
   async deleteModel(modelName: string): Promise<{ message: string }> {
     return this.request<{ message: string }>(`/models/${modelName}`, {
       method: 'DELETE',
@@ -497,7 +605,7 @@ class ApiClient {
 
   async exportStoryAudio(storyId: string): Promise<Blob> {
     const url = `${this.getBaseUrl()}/stories/${storyId}/export-audio`;
-    const response = await fetch(url);
+    const response = await this.fetchWithTracking(url);
 
     if (!response.ok) {
       const error = await response.json().catch(() => ({
diff --git a/app/src/lib/api/core/CancelablePromise.ts b/app/src/lib/api/core/CancelablePromise.ts
index d94a263..39ea84e 100644
--- a/app/src/lib/api/core/CancelablePromise.ts
+++ b/app/src/lib/api/core/CancelablePromise.ts
@@ -88,6 +88,7 @@ export class CancelablePromise<T> implements Promise<T> {
     return 'Cancellable Promise';
   }
 
+  // biome-ignore lint/suspicious/noThenProperty: Required for Promise-like interface
   public then<TResult1 = T, TResult2 = never>(
     onFulfilled?: ((value: T) => TResult1 | PromiseLike<TResult1>) | null,
     onRejected?: ((reason: any) => TResult2 | PromiseLike<TResult2>) | null,
diff --git a/app/src/lib/api/core/request.ts b/app/src/lib/api/core/request.ts
index ac97e19..de0cbd0 100644
--- a/app/src/lib/api/core/request.ts
+++ b/app/src/lib/api/core/request.ts
@@ -114,15 +114,15 @@ export const getFormData = (options: ApiRequestOptions): FormData | undefined =>
       }
     };
 
-    Object.entries(options.formData)
-      .filter(([_, value]) => isDefined(value))
-      .forEach(([key, value]) => {
-        if (Array.isArray(value)) {
-          value.forEach((v) => process(key, v));
-        } else {
-          process(key, value);
+    for (const [key, value] of Object.entries(options.formData).filter(([_, value]) => isDefined(value))) {
+      if (Array.isArray(value)) {
+        for (const v of value) {
+          process(key, v);
         }
-      });
+      } else {
+        process(key, value);
+      }
+    }
 
     return formData;
   }
diff --git a/app/src/lib/api/types.ts b/app/src/lib/api/types.ts
index 131c1be..bc34b44 100644
--- a/app/src/lib/api/types.ts
+++ b/app/src/lib/api/types.ts
@@ -34,6 +34,9 @@ export interface GenerationRequest {
   language: LanguageCode;
   seed?: number;
   model_size?: '1.7B' | '0.6B';
+  instruct?: string;
+  request_user_id?: string;
+  request_user_first_name?: string;
 }
 
 export interface GenerationResponse {
@@ -43,8 +46,38 @@ export interface GenerationResponse {
   language: string;
   audio_path: string;
   duration: number;
+  generation_time_seconds?: number;
   seed?: number;
+  instruct?: string;
+  model_size?: string;
+  backend_type?: string;
+  request_user_id?: string;
+  request_user_first_name?: string;
+  request_ip?: string;
+  created_at: string;
+}
+
+export interface GenerationJobResponse {
+  id: string;
+  profile_id: string;
+  profile_name: string;
+  text: string;
+  language: string;
+  model_size?: string;
+  backend_type?: string;
+  request_user_id?: string;
+  request_user_first_name?: string;
+  request_ip?: string;
+  status: 'queued' | 'generating' | 'cancelling' | 'complete' | 'cancelled' | 'error' | 'timeout';
+  progress: number;
+  generation_id?: string;
+  audio_path?: string;
+  duration?: number;
+  generation_time_seconds?: number;
+  instruct?: string;
   created_at: string;
+  started_at?: string;
+  completed_at?: string;
 }
 
 export interface HistoryQuery {
@@ -56,6 +89,7 @@ export interface HistoryQuery {
 
 export interface HistoryResponse extends GenerationResponse {
   profile_name: string;
+  job_id?: string;
 }
 
 export interface HistoryListResponse {
@@ -78,7 +112,9 @@ export interface HealthResponse {
   model_downloaded?: boolean;
   model_size?: string;
   gpu_available: boolean;
+  gpu_type?: string;
   vram_used_mb?: number;
+  backend_type?: string;
 }
 
 export interface ModelProgress {
@@ -87,7 +123,7 @@ export interface ModelProgress {
   total: number;
   progress: number;
   filename?: string;
-  status: 'downloading' | 'extracting' | 'complete' | 'error';
+  status: 'downloading' | 'extracting' | 'loading' | 'complete' | 'error';
   timestamp: string;
   error?: string;
 }
diff --git a/app/src/lib/hooks/useAudioDevices.ts b/app/src/lib/hooks/useAudioDevices.ts
new file mode 100644
index 0000000..e02864d
--- /dev/null
+++ b/app/src/lib/hooks/useAudioDevices.ts
@@ -0,0 +1,108 @@
+import { useCallback, useEffect, useState } from 'react';
+
+export interface AudioInputDevice {
+  deviceId: string;
+  label: string;
+}
+
+/**
+ * Enumerates audio input devices and refreshes on devicechange events.
+ * Returns the list of devices and the selected deviceId (persisted to localStorage).
+ *
+ * NOTE: browsers only expose device labels after getUserMedia has been granted.
+ * If no enumerated input has a label, we open and immediately stop a getUserMedia
+ * stream (the browser may show a permission prompt) to unlock them, then re-enumerate.
+ */
+const STORAGE_KEY = 'voicebox:audioInputDeviceId';
+
+function loadStoredDeviceId(): string | null {
+  try {
+    return localStorage.getItem(STORAGE_KEY);
+  } catch {
+    return null;
+  }
+}
+
+function saveDeviceId(deviceId: string | null) {
+  try {
+    if (deviceId) {
+      localStorage.setItem(STORAGE_KEY, deviceId);
+    } else {
+      localStorage.removeItem(STORAGE_KEY);
+    }
+  } catch {
+    // ignore
+  }
+}
+
+export function useAudioDevices() {
+  const [devices, setDevices] = useState<AudioInputDevice[]>([]);
+  // Lazy initializer: localStorage is read once on mount, not on every render
+  const [selectedDeviceId, setSelectedDeviceIdState] = useState<string | null>(loadStoredDeviceId);
+
+  const enumerateDevices = useCallback(async () => {
+    if (!navigator.mediaDevices?.enumerateDevices) return;
+
+    try {
+      const all = await navigator.mediaDevices.enumerateDevices();
+      const inputs = all.filter((d) => d.kind === 'audioinput');
+
+      // If all labels are empty we don't have permission yet — request it (the browser may prompt)
+      if (inputs.length > 0 && inputs.every((d) => !d.label)) {
+        try {
+          const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+          for (const t of stream.getTracks()) t.stop();
+          // Re-enumerate now that we have permission
+          const all2 = await navigator.mediaDevices.enumerateDevices();
+          const inputs2 = all2.filter((d) => d.kind === 'audioinput');
+          setDevices(inputs2.map((d) => ({ deviceId: d.deviceId, label: d.label || d.deviceId || 'Unknown device' })));
+          return;
+        } catch {
+          // Permission denied — show unlabeled entries
+        }
+      }
+
+      setDevices(inputs.map((d) => ({ deviceId: d.deviceId, label: d.label || d.deviceId || 'Unknown device' })));
+    } catch (err) {
+      console.warn('[useAudioDevices] enumerateDevices failed:', err);
+    }
+  }, []);
+
+  // Initial enumeration
+  useEffect(() => {
+    enumerateDevices();
+  }, [enumerateDevices]);
+
+  // Refresh list on device plug/unplug
+  useEffect(() => {
+    if (!navigator.mediaDevices?.addEventListener) return;
+    navigator.mediaDevices.addEventListener('devicechange', enumerateDevices);
+    return () => {
+      navigator.mediaDevices.removeEventListener('devicechange', enumerateDevices);
+    };
+  }, [enumerateDevices]);
+
+  // If the stored device is no longer available, fall back to default.
+  // Skip while no entry exposes a deviceId (e.g. before mic permission is granted) so the saved choice is not wiped.
+  useEffect(() => {
+    if (!devices.some((d) => d.deviceId)) return;
+    if (selectedDeviceId && !devices.some((d) => d.deviceId === selectedDeviceId)) {
+      console.log('[useAudioDevices] Stored device no longer available, resetting to default');
+      setSelectedDeviceIdState(null);
+      saveDeviceId(null);
+    }
+  }, [devices, selectedDeviceId]);
+
+  const setSelectedDeviceId = useCallback((deviceId: string | null) => {
+    setSelectedDeviceIdState(deviceId);
+    saveDeviceId(deviceId);
+  }, []);
+
+  return {
+    devices,
+    selectedDeviceId,
+    setSelectedDeviceId,
+    /** The effective deviceId to pass to getUserMedia — undefined means "system default" */
+    effectiveDeviceId: selectedDeviceId ?? undefined,
+  };
+}
diff --git a/app/src/lib/hooks/useAudioRecording.ts b/app/src/lib/hooks/useAudioRecording.ts
index 2916937..f06aa20 100644
--- a/app/src/lib/hooks/useAudioRecording.ts
+++ b/app/src/lib/hooks/useAudioRecording.ts
@@ -5,11 +5,13 @@ import { convertToWav } from '@/lib/utils/audio';
 interface UseAudioRecordingOptions {
   maxDurationSeconds?: number;
   onRecordingComplete?: (blob: Blob, duration?: number) => void;
+  deviceId?: string;
 }
 
 export function useAudioRecording({
   maxDurationSeconds = 29,
   onRecordingComplete,
+  deviceId,
 }: UseAudioRecordingOptions = {}) {
   const platform = usePlatform();
   const [isRecording, setIsRecording] = useState(false);
@@ -56,14 +58,16 @@ export function useAudioRecording({
         }
       }
 
-      // Request microphone access
-      const stream = await navigator.mediaDevices.getUserMedia({
-        audio: {
-          echoCancellation: true,
-          noiseSuppression: true,
-          autoGainControl: true,
-        },
-      });
+      // Request microphone access — use specific deviceId if provided
+      const audioConstraints: MediaTrackConstraints = {
+        echoCancellation: true,
+        noiseSuppression: true,
+        autoGainControl: true,
+      };
+      if (deviceId) {
+        audioConstraints.deviceId = { exact: deviceId };
+      }
+      const stream = await navigator.mediaDevices.getUserMedia({ audio: audioConstraints });
 
       streamRef.current = stream;
 
diff --git a/app/src/lib/hooks/useGenerationForm.ts b/app/src/lib/hooks/useGenerationForm.ts
index c6fdba5..5e3d13f 100644
--- a/app/src/lib/hooks/useGenerationForm.ts
+++ b/app/src/lib/hooks/useGenerationForm.ts
@@ -1,15 +1,15 @@
 import { zodResolver } from '@hookform/resolvers/zod';
-import { useState } from 'react';
 import { useForm } from 'react-hook-form';
 import * as z from 'zod';
 import { useToast } from '@/components/ui/use-toast';
 import { apiClient } from '@/lib/api/client';
 import { LANGUAGE_CODES, type LanguageCode } from '@/lib/constants/languages';
 import { useGeneration } from '@/lib/hooks/useGeneration';
-import { useModelDownloadToast } from '@/lib/hooks/useModelDownloadToast';
 import { useGenerationStore } from '@/stores/generationStore';
 import { usePlayerStore } from '@/stores/playerStore';
 
+const usePlayerReset = () => usePlayerStore((state) => state.reset);
+
 const generationSchema = z.object({
   text: z.string().min(1, 'Text is required').max(5000),
   language: z.enum(LANGUAGE_CODES as [LanguageCode, ...LanguageCode[]]),
@@ -29,15 +29,8 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) {
   const { toast } = useToast();
   const generation = useGeneration();
   const setAudioWithAutoPlay = usePlayerStore((state) => state.setAudioWithAutoPlay);
+  const resetPlayer = usePlayerReset();
   const setIsGenerating = useGenerationStore((state) => state.setIsGenerating);
-  const [downloadingModelName, setDownloadingModelName] = useState<string | null>(null);
-  const [downloadingDisplayName, setDownloadingDisplayName] = useState<string | null>(null);
-
-  useModelDownloadToast({
-    modelName: downloadingModelName || '',
-    displayName: downloadingDisplayName || '',
-    enabled: !!downloadingModelName,
-  });
 
   const form = useForm<GenerationFormValues>({
     resolver: zodResolver(generationSchema),
@@ -66,21 +59,7 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) {
 
     try {
       setIsGenerating(true);
-
-      const modelName = `qwen-tts-${data.modelSize}`;
-      const displayName = data.modelSize === '1.7B' ? 'Qwen TTS 1.7B' : 'Qwen TTS 0.6B';
-
-      try {
-        const modelStatus = await apiClient.getModelStatus();
-        const model = modelStatus.models.find((m) => m.model_name === modelName);
-
-        if (model && !model.downloaded) {
-          setDownloadingModelName(modelName);
-          setDownloadingDisplayName(displayName);
-        }
-      } catch (error) {
-        console.error('Failed to check model status:', error);
-      }
+      resetPlayer(); // Close any existing audio player
 
       const result = await generation.mutateAsync({
         profile_id: selectedProfileId,
@@ -99,7 +78,14 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) {
       const audioUrl = apiClient.getAudioUrl(result.id);
       setAudioWithAutoPlay(audioUrl, result.id, selectedProfileId, data.text.substring(0, 50));
 
-      form.reset();
+      // Preserve sticky fields across reset — only clear text/seed/instruct
+      form.reset({
+        text: '',
+        language: form.getValues('language'),
+        seed: undefined,
+        modelSize: form.getValues('modelSize'),
+        instruct: '',
+      });
       options.onSuccess?.(result.id);
     } catch (error) {
       toast({
@@ -109,14 +95,16 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) {
       });
     } finally {
       setIsGenerating(false);
-      setDownloadingModelName(null);
-      setDownloadingDisplayName(null);
     }
   }
 
+  const pendingJobs = useGenerationStore((state) => state.pendingJobs);
+  const isQueueLimitReached = pendingJobs.length >= 3;
+
   return {
     form,
     handleSubmit,
     isPending: generation.isPending,
+    isQueueLimitReached,
   };
 }
diff --git a/app/src/lib/hooks/useHistory.ts b/app/src/lib/hooks/useHistory.ts
index ec0c20c..3320d0d 100644
--- a/app/src/lib/hooks/useHistory.ts
+++ b/app/src/lib/hooks/useHistory.ts
@@ -2,11 +2,13 @@ import { useMutation, useQuery, useQueryClient } from '@tanstack/react-query';
 import { apiClient } from '@/lib/api/client';
 import type { HistoryQuery } from '@/lib/api/types';
 import { usePlatform } from '@/platform/PlatformContext';
+import { useGenerationStore } from '@/stores/generationStore';
 
 export function useHistory(query?: HistoryQuery) {
   return useQuery({
     queryKey: ['history', query],
     queryFn: () => apiClient.listHistory(query),
+    refetchInterval: 10_000,
   });
 }
 
@@ -20,11 +22,18 @@ export function useGenerationDetail(generationId: string) {
 
 export function useDeleteGeneration() {
   const queryClient = useQueryClient();
+  const removeJob = useGenerationStore((state) => state.removeJob);
 
   return useMutation({
     mutationFn: (generationId: string) => apiClient.deleteGeneration(generationId),
-    onSuccess: () => {
+    onSuccess: (_, generationId) => {
+      // Invalidate all history and jobs queries to immediately remove deleted item from UI
       queryClient.invalidateQueries({ queryKey: ['history'] });
+      queryClient.invalidateQueries({ queryKey: ['home-jobs'] });
+      
+      // Remove from pending store if it's an active/queued job
+      // Job IDs and generation IDs are the same for pending jobs
+      removeJob(generationId);
     },
   });
 }
diff --git a/app/src/lib/hooks/useModelDownloadToast.tsx b/app/src/lib/hooks/useModelDownloadToast.tsx
index 2df221e..12467d9 100644
--- a/app/src/lib/hooks/useModelDownloadToast.tsx
+++ b/app/src/lib/hooks/useModelDownloadToast.tsx
@@ -1,21 +1,34 @@
-import { CheckCircle2, Loader2, XCircle } from 'lucide-react';
-import { useCallback, useEffect, useRef } from 'react';
+import { CheckCircle2, Loader2, X, XCircle } from 'lucide-react';
+import { useEffect, useRef } from 'react';
+import { Button } from '@/components/ui/button';
 import { Progress } from '@/components/ui/progress';
-import { useToast } from '@/components/ui/use-toast';
+import { toast } from '@/components/ui/use-toast';
 import type { ModelProgress } from '@/lib/api/types';
+import { apiClient } from '@/lib/api/client';
 import { useServerStore } from '@/stores/serverStore';
 
+const POLL_MS = 1000;
+
 interface UseModelDownloadToastOptions {
   modelName: string;
   displayName: string;
   enabled?: boolean;
   onComplete?: () => void;
   onError?: () => void;
+  onCancel?: () => void;
+}
+
+function formatBytes(bytes: number): string {
+  if (bytes === 0) return '0 B';
+  const k = 1024;
+  const sizes = ['B', 'KB', 'MB', 'GB'];
+  const i = Math.floor(Math.log(bytes) / Math.log(k));
+  return `${(bytes / k ** i).toFixed(1)} ${sizes[i]}`;
 }
 
 /**
- * Hook to show and update a toast notification with model download progress.
- * Subscribes to Server-Sent Events for real-time progress updates.
+ * Hook to show and update a toast notification with model download/load progress.
+ * Polls the server every second — simpler than SSE, no connection-count issues.
  */
 export function useModelDownloadToast({
   modelName,
@@ -23,194 +36,148 @@ export function useModelDownloadToast({
   enabled = false,
   onComplete,
   onError,
+  onCancel,
 }: UseModelDownloadToastOptions) {
-  const { toast } = useToast();
   const serverUrl = useServerStore((state) => state.serverUrl);
-  const toastIdRef = useRef<string | null>(null);
-  // biome-ignore lint: Using any for toast update ref to handle complex toast types
-  const toastUpdateRef = useRef<any>(null);
-  const eventSourceRef = useRef<EventSource | null>(null);
-
-  const formatBytes = useCallback((bytes: number): string => {
-    if (bytes === 0) return '0 B';
-    const k = 1024;
-    const sizes = ['B', 'KB', 'MB', 'GB'];
-    const i = Math.floor(Math.log(bytes) / Math.log(k));
-    return `${(bytes / k ** i).toFixed(1)} ${sizes[i]}`;
-  }, []);
 
-  useEffect(() => {
-    console.log('[useModelDownloadToast] useEffect triggered', {
-      enabled,
-      serverUrl,
-      modelName,
-      displayName,
-    });
+  const onCompleteRef = useRef(onComplete);
+  const onErrorRef = useRef(onError);
+  const onCancelRef = useRef(onCancel);
+  const displayNameRef = useRef(displayName);
+  onCompleteRef.current = onComplete;
+  onErrorRef.current = onError;
+  onCancelRef.current = onCancel;
+  displayNameRef.current = displayName;
 
-    if (!enabled || !serverUrl || !modelName) {
-      console.log('[useModelDownloadToast] Not enabled, skipping');
-      return;
-    }
+  useEffect(() => {
+    if (!enabled || !serverUrl || !modelName) return;
 
-    console.log('[useModelDownloadToast] Creating toast and EventSource for:', modelName);
+    let stopped = false;
 
-    // Create initial toast
-    const toastResult = toast({
-      title: displayName,
+    // Create the toast once — its update/dismiss handles are plain locals captured by the closures below
+    const { update: updateToast, dismiss: dismissToast } = toast({
+      title: displayNameRef.current,
       description: (
         <div className="flex items-center gap-2">
           <Loader2 className="h-4 w-4 animate-spin" />
-          <span>Connecting to download...</span>
+          <span>Starting...</span>
         </div>
       ),
-      duration: Infinity, // Don't auto-dismiss, we'll handle it manually
+      duration: Infinity,
     });
-    toastIdRef.current = toastResult.id;
-    toastUpdateRef.current = toastResult.update;
 
-    // Subscribe to progress updates via Server-Sent Events
-    const eventSourceUrl = `${serverUrl}/models/progress/${modelName}`;
-    console.log('[useModelDownloadToast] Creating EventSource to:', eventSourceUrl);
-    const eventSource = new EventSource(eventSourceUrl);
+    const handleCancel = async () => {
+      stopped = true;
+      try { await apiClient.cancelModelDownload(modelName); } catch { /* ignore */ }
+      dismissToast();
+      onCancelRef.current?.();
+    };
 
-    eventSource.onopen = () => {
-      console.log('[useModelDownloadToast] EventSource connection opened for:', modelName);
+    const renderToast = (progress: ModelProgress | null) => {
+      const hasTotal = !!progress && progress.total > 0;
+      const progressPercent = hasTotal ? progress!.progress : 0;
+      const progressText = hasTotal
+        ? `${formatBytes(progress!.current)} / ${formatBytes(progress!.total)} (${progress!.progress.toFixed(1)}%)`
+        : '';
+
+      const status = progress?.status ?? 'downloading';
+      const isTerminal = status === 'complete' || status === 'error';
+      const showCancel = !isTerminal && status !== 'loading';
+
+      let statusIcon: React.ReactNode = <Loader2 className="h-4 w-4 animate-spin" />;
+      let statusText = 'Downloading...';
+
+      if (status === 'complete') {
+        statusIcon = <CheckCircle2 className="h-4 w-4 text-green-500" />;
+        statusText = 'Download complete';
+      } else if (status === 'error') {
+        statusIcon = <XCircle className="h-4 w-4 text-destructive" />;
+        statusText = `Error: ${progress?.error || 'Unknown error'}`;
+      } else if (status === 'loading') {
+        statusText = 'Loading model...';
+      } else if (status === 'extracting') {
+        statusText = 'Extracting...';
+      } else {
+        statusText = progress?.filename || 'Downloading...';
+      }
+
+      // biome-ignore lint: updateToast expects ToasterToast but id is captured in closure
+      (updateToast as any)({
+        title: (
+          <div className="flex items-center justify-between gap-2">
+            <div className="flex items-center gap-2">
+              {statusIcon}
+              <span>{displayNameRef.current}</span>
+            </div>
+            {showCancel && (
+              <Button size="sm" variant="ghost" className="h-5 w-5 p-0 shrink-0" onClick={handleCancel} title="Cancel">
+                <X className="h-3 w-3" />
+              </Button>
+            )}
+          </div>
+        ),
+        description: (
+          <div className="space-y-2">
+            <div className="text-sm">{statusText}</div>
+            {status !== 'loading' && (
+              hasTotal ? (
+                <>
+                  <Progress value={progressPercent} className="h-2" />
+                  <div className="text-xs text-muted-foreground">{progressText}</div>
+                </>
+              ) : (
+                <div className="h-2 w-full rounded-full bg-secondary overflow-hidden">
+                  <div className="h-full w-1/3 rounded-full bg-primary animate-[indeterminate_1.5s_ease-in-out_infinite]" />
+                </div>
+              )
+            )}
+          </div>
+        ),
+        duration: isTerminal ? 4000 : Infinity,
+        variant: status === 'error' ? 'destructive' : 'default',
+      });
     };
 
-    eventSource.onmessage = (event) => {
-      console.log('[useModelDownloadToast] Received SSE message:', event.data);
+    // Poll loop
+    const poll = async () => {
+      if (stopped) return;
       try {
-        const progress = JSON.parse(event.data) as ModelProgress;
-
-        // Update toast with progress
-        if (toastIdRef.current && toastUpdateRef.current) {
-          const progressPercent = progress.total > 0 ? progress.progress : 0;
-          const progressText =
-            progress.total > 0
-              ? `${formatBytes(progress.current)} / ${formatBytes(progress.total)} (${progress.progress.toFixed(1)}%)`
-              : '';
-
-          // Determine status icon and text
-          let statusIcon: React.ReactNode = null;
-          let statusText = 'Processing...';
-
-          switch (progress.status) {
-            case 'complete':
-              statusIcon = <CheckCircle2 className="h-4 w-4 text-green-500" />;
-              statusText = 'Download complete';
-              break;
-            case 'error':
-              statusIcon = <XCircle className="h-4 w-4 text-destructive" />;
-              statusText = `Error: ${progress.error || 'Unknown error'}`;
-              break;
-            case 'downloading':
-              statusIcon = <Loader2 className="h-4 w-4 animate-spin" />;
-              statusText = progress.filename || 'Downloading...';
-              break;
-            case 'extracting':
-              statusIcon = <Loader2 className="h-4 w-4 animate-spin" />;
-              statusText = 'Extracting...';
-              break;
+        const res = await fetch(`${serverUrl}/models/progress-snapshot/${modelName}`);
+        if (stopped) return;
+        if (res.ok) {
+          const data = await res.json();
+          // 'idle' means no active download (finished, cancelled, or not started)
+          if (data.status === 'idle') {
+            stopped = true;
+            dismissToast();
+            return;
           }
+          const progress: ModelProgress = data;
+          renderToast(progress);
 
-          toastUpdateRef.current({
-            title: (
-              <div className="flex items-center gap-2">
-                {statusIcon}
-                <span>{displayName}</span>
-              </div>
-            ),
-            description: (
-              <div className="space-y-2">
-                <div className="text-sm">{statusText}</div>
-                {progress.total > 0 && (
-                  <>
-                    <Progress value={progressPercent} className="h-2" />
-                    <div className="text-xs text-muted-foreground">{progressText}</div>
-                  </>
-                )}
-              </div>
-            ),
-            duration: progress.status === 'complete' ? 5000 : Infinity,
-            variant: progress.status === 'error' ? 'destructive' : 'default',
-          });
-
-          // Close connection and dismiss toast on completion or error
-          // Also treat progress >= 100% as complete
-          const isComplete = progress.status === 'complete' || progress.progress >= 100;
-          const isError = progress.status === 'error';
-
-          if (isComplete || isError) {
-            console.log('[useModelDownloadToast] Download finished:', {
-              isComplete,
-              isError,
-              progress: progress.progress,
-            });
-            eventSource.close();
-            eventSourceRef.current = null;
-
-            // Update toast to show completion state before callbacks
-            if (isComplete && toastUpdateRef.current) {
-              toastUpdateRef.current({
-                title: (
-                  <div className="flex items-center gap-2">
-                    <CheckCircle2 className="h-4 w-4 text-green-500" />
-                    <span>{displayName}</span>
-                  </div>
-                ),
-                description: 'Download complete',
-                duration: 3000,
-              });
-            }
-
-            // Call callbacks
-            if (isComplete && onComplete) {
-              console.log('[useModelDownloadToast] Download complete, calling onComplete callback');
-              onComplete();
-            } else if (isError && onError) {
-              console.log('[useModelDownloadToast] Download error, calling onError callback');
-              onError();
-            }
+          if (progress.status === 'complete' || (progress.progress ?? 0) >= 100) {
+            stopped = true;
+            onCompleteRef.current?.();
+            return;
+          }
+          if (progress.status === 'error') {
+            stopped = true;
+            onErrorRef.current?.();
+            return;
           }
         }
-      } catch (error) {
-        console.error('Error parsing progress event:', error);
+      } catch {
+        // server temporarily unavailable, keep polling
       }
+      if (!stopped) intervalId = window.setTimeout(poll, POLL_MS);
     };
 
-    eventSource.onerror = (error) => {
-      console.error('[useModelDownloadToast] SSE error for:', modelName, error);
-      console.log('[useModelDownloadToast] EventSource readyState:', eventSource.readyState);
-      eventSource.close();
-      eventSourceRef.current = null;
-
-      // Show error toast
-      if (toastIdRef.current && toastUpdateRef.current) {
-        toastUpdateRef.current({
-          title: displayName,
-          description: 'Failed to track download progress',
-          variant: 'destructive',
-          duration: 5000,
-        });
-        toastIdRef.current = null;
-        toastUpdateRef.current = null;
-      }
-    };
+    let intervalId = window.setTimeout(poll, POLL_MS);
 
-    eventSourceRef.current = eventSource;
-
-    // Cleanup on unmount or when disabled
     return () => {
-      console.log('[useModelDownloadToast] Cleanup - closing EventSource for:', modelName);
-      if (eventSourceRef.current) {
-        eventSourceRef.current.close();
-        eventSourceRef.current = null;
-      }
-      // Note: We don't dismiss the toast here as it might still be showing completion state
+      stopped = true;
+      clearTimeout(intervalId);
+      dismissToast();
     };
-  }, [enabled, serverUrl, modelName, displayName, toast, formatBytes, onComplete, onError]);
-
-  return {
-    isTracking: enabled && eventSourceRef.current !== null,
-  };
+  }, [enabled, serverUrl, modelName]);
 }
diff --git a/app/src/lib/hooks/useRestoreActiveTasks.tsx b/app/src/lib/hooks/useRestoreActiveTasks.tsx
index 063e6bc..ddaeae4 100644
--- a/app/src/lib/hooks/useRestoreActiveTasks.tsx
+++ b/app/src/lib/hooks/useRestoreActiveTasks.tsx
@@ -18,8 +18,8 @@ export function useRestoreActiveTasks() {
   const setIsGenerating = useGenerationStore((state) => state.setIsGenerating);
   const setActiveGenerationId = useGenerationStore((state) => state.setActiveGenerationId);
   
-  // Track which downloads we've seen to detect new ones
-  const seenDownloadsRef = useRef<Set<string>>(new Set());
+  // Track current download names to avoid spurious re-renders on every poll
+  const activeDownloadNamesRef = useRef<string>('');
 
   const fetchActiveTasks = useCallback(async () => {
     try {
@@ -38,23 +38,13 @@ export function useRestoreActiveTasks() {
         }
       }
 
-      // Update active downloads
-      // Keep track of all active downloads (including new ones)
-      const currentDownloadNames = new Set(tasks.downloads.map((d) => d.model_name));
-      
-      // Remove completed downloads from our seen set
-      for (const name of seenDownloadsRef.current) {
-        if (!currentDownloadNames.has(name)) {
-          seenDownloadsRef.current.delete(name);
-        }
-      }
-      
-      // Add new downloads to seen set
-      for (const download of tasks.downloads) {
-        seenDownloadsRef.current.add(download.model_name);
+      // Only update state (and cause re-renders) when the set of downloading
+      // model names actually changes — prevents SSE from reconnecting every 2s.
+      const newKey = tasks.downloads.map((d) => d.model_name).sort().join(',');
+      if (newKey !== activeDownloadNamesRef.current) {
+        activeDownloadNamesRef.current = newKey;
+        setActiveDownloads(tasks.downloads);
       }
-
-      setActiveDownloads(tasks.downloads);
     } catch (error) {
       // Silently fail - server might be temporarily unavailable
       console.debug('Failed to fetch active tasks:', error);
diff --git a/app/src/lib/hooks/useServer.ts b/app/src/lib/hooks/useServer.ts
index 3ff06a0..84fcf3f 100644
--- a/app/src/lib/hooks/useServer.ts
+++ b/app/src/lib/hooks/useServer.ts
@@ -1,14 +1,59 @@
 import { useQuery } from '@tanstack/react-query';
-import { apiClient } from '@/lib/api/client';
+import { apiClient, getLastApiEmissionTimestamp, getPiggybackHealth } from '@/lib/api/client';
+import type { HealthResponse } from '@/lib/api/types';
 import { useServerStore } from '@/stores/serverStore';
 
+let _lastResolvedHealth: HealthResponse | null = null;
+
+function fromPiggyback(): HealthResponse | null {
+  const piggyback = getPiggybackHealth(45_000);
+  if (!piggyback) return null;
+  return {
+    status: 'healthy',
+    model_loaded: piggyback.model_loaded,
+    model_size: piggyback.model_size ?? undefined,
+    gpu_available: piggyback.gpu_type !== null,
+    gpu_type: piggyback.gpu_type ?? undefined,
+    backend_type: piggyback.backend ?? undefined,
+  };
+}
+
 export function useServerHealth() {
   const serverUrl = useServerStore((state) => state.serverUrl);
 
   return useQuery({
     queryKey: ['server', 'health', serverUrl],
-    queryFn: () => apiClient.getHealth(),
-    refetchInterval: 30000, // Check every 30 seconds
+    queryFn: async (): Promise<HealthResponse> => {
+      const piggybackHealth = fromPiggyback();
+      if (piggybackHealth) {
+        _lastResolvedHealth = piggybackHealth;
+        return piggybackHealth;
+      }
+
+      // Only call /health when no API emission occurred in the last 7s.
+      const recentApiEmission = Date.now() - getLastApiEmissionTimestamp() < 7_000;
+      if (recentApiEmission) {
+        if (_lastResolvedHealth) return _lastResolvedHealth;
+        const stalePiggyback = getPiggybackHealth(Number.MAX_SAFE_INTEGER);
+        if (stalePiggyback) {
+          const mapped: HealthResponse = {
+            status: 'healthy',
+            model_loaded: stalePiggyback.model_loaded,
+            model_size: stalePiggyback.model_size ?? undefined,
+            gpu_available: stalePiggyback.gpu_type !== null,
+            gpu_type: stalePiggyback.gpu_type ?? undefined,
+            backend_type: stalePiggyback.backend ?? undefined,
+          };
+          _lastResolvedHealth = mapped;
+          return mapped;
+        }
+      }
+
+      const freshHealth = await apiClient.getHealth();
+      _lastResolvedHealth = freshHealth;
+      return freshHealth;
+    },
+    refetchInterval: 30000,
     retry: 1,
   });
 }
diff --git a/app/src/lib/hooks/useStoryPlayback.ts b/app/src/lib/hooks/useStoryPlayback.ts
index f9678cd..0505d11 100644
--- a/app/src/lib/hooks/useStoryPlayback.ts
+++ b/app/src/lib/hooks/useStoryPlayback.ts
@@ -252,6 +252,10 @@ export function useStoryPlayback(items: StoryItemDetail[] | undefined) {
 
           const source = audioContext.createBufferSource();
           source.buffer = buffer;
+          // TODO: per-voice device routing — each item has a profile_id; look up its
+          // channel → device_ids and use AudioContext.setSinkId (or playToDevices via
+          // Tauri) so voices route to different speakers. Currently everything goes to
+          // the system default output via audioContext.destination.
           source.connect(masterGainRef.current || audioContext.destination);
 
           const activeSource: ActiveSource = {
diff --git a/app/src/lib/hooks/useWhisperModelReady.ts b/app/src/lib/hooks/useWhisperModelReady.ts
new file mode 100644
index 0000000..aac0539
--- /dev/null
+++ b/app/src/lib/hooks/useWhisperModelReady.ts
@@ -0,0 +1,28 @@
+import { useQuery } from '@tanstack/react-query';
+import { apiClient } from '@/lib/api/client';
+
+/**
+ * Returns whether a Whisper (transcription) model is downloaded and ready.
+ * Polls /models/status and checks for any whisper-* model with downloaded: true.
+ *
+ * Also exposes `downloading` so the UI can show a progress indicator.
+ */
+interface UseWhisperModelReadyOptions {
+  enabled?: boolean;
+}
+
+export function useWhisperModelReady(options: UseWhisperModelReadyOptions = {}) {
+  const { enabled = true } = options;
+  const { data } = useQuery({
+    queryKey: ['modelStatus'],
+    queryFn: () => apiClient.getModelStatus(),
+    refetchInterval: enabled ? 15_000 : false,
+    enabled,
+  });
+
+  const whisperModels = data?.models?.filter((m) => m.model_name.startsWith('whisper-')) ?? [];
+  const ready = whisperModels.some((m) => m.downloaded);
+  const downloading = whisperModels.some((m) => m.downloading);
+
+  return { ready, downloading };
+}
diff --git a/app/src/platform/types.ts b/app/src/platform/types.ts
index 5ea4d60..357b3c4 100644
--- a/app/src/platform/types.ts
+++ b/app/src/platform/types.ts
@@ -44,6 +44,7 @@ export interface PlatformAudio {
   startSystemAudioCapture(maxDurationSecs: number): Promise<void>;
   stopSystemAudioCapture(): Promise<Blob>;
   listOutputDevices(): Promise<AudioDevice[]>;
+  listInputDevices(): Promise<AudioDevice[]>;
   playToDevices(audioData: Uint8Array, deviceIds: string[]): Promise<void>;
   stopPlayback(): void;
 }
@@ -58,6 +59,8 @@ export interface PlatformLifecycle {
 
 export interface PlatformMetadata {
   getVersion(): Promise<string>;
+  /** Short git hash + commit count injected at build time, e.g. "abc1234 #264" */
+  getBuildInfo(): string;
   isTauri: boolean;
 }
 
diff --git a/app/src/stores/generationStore.ts b/app/src/stores/generationStore.ts
index c0d6338..b0475ff 100644
--- a/app/src/stores/generationStore.ts
+++ b/app/src/stores/generationStore.ts
@@ -1,15 +1,91 @@
 import { create } from 'zustand';
 
+export interface PendingJob {
+  id: string;
+  profileId: string;
+  profileName: string;
+  text: string;
+  language: string;
+  modelSize?: string;
+  backendType?: string;
+  requestedByUserId?: string;
+  requestedByFirstName?: string;
+  requestIp?: string;
+  status: 'queued' | 'generating' | 'cancelling';
+  progress: number; // 0-100
+  createdAt: string;
+  startedAt?: string;
+}
+
 interface GenerationState {
+  // Legacy fields used by upstream voicebox app.
   isGenerating: boolean;
   activeGenerationId: string | null;
-  setIsGenerating: (generating: boolean) => void;
+  pendingJobs: PendingJob[];
+  setIsGenerating: (isGenerating: boolean) => void;
   setActiveGenerationId: (id: string | null) => void;
+  addJob: (job: PendingJob) => void;
+  mergePendingJobs: (jobs: PendingJob[]) => void;
+  removeJob: (id: string) => void;
+  setPendingJobs: (jobs: PendingJob[]) => void;
+  updateJobProgress: (id: string, progress: number) => void;
+  updateJobStatus: (id: string, status: 'queued' | 'generating' | 'cancelling') => void;
+  /** True if any pending job exists (queued, generating, or cancelling). */
+  hasActiveJobs: () => boolean;
 }
 
-export const useGenerationStore = create<GenerationState>((set) => ({
+export const useGenerationStore = create<GenerationState>((set, get) => ({
   isGenerating: false,
   activeGenerationId: null,
-  setIsGenerating: (generating) => set({ isGenerating: generating }),
+  pendingJobs: [],
+  setIsGenerating: (isGenerating) => set({ isGenerating }),
   setActiveGenerationId: (id) => set({ activeGenerationId: id }),
+  addJob: (job) =>
+    set((state) => ({
+      isGenerating: true,
+      activeGenerationId: state.activeGenerationId || job.id,
+      pendingJobs: state.pendingJobs.some((j) => j.id === job.id)
+        ? state.pendingJobs.map((j) => (j.id === job.id ? { ...j, ...job } : j))
+        : [...state.pendingJobs, job],
+    })),
+  mergePendingJobs: (jobs) =>
+    set((state) => {
+      const byId = new Map(state.pendingJobs.map((job) => [job.id, job]));
+      for (const job of jobs) {
+        const existing = byId.get(job.id);
+        byId.set(job.id, existing ? { ...existing, ...job } : job);
+      }
+      const merged = Array.from(byId.values());
+      return {
+        pendingJobs: merged,
+        isGenerating: merged.length > 0 || state.isGenerating,
+        activeGenerationId: state.activeGenerationId || merged[0]?.id || null,
+      };
+    }),
+  removeJob: (id) =>
+    set((state) => ({
+      pendingJobs: state.pendingJobs.filter((j) => j.id !== id),
+      isGenerating: state.pendingJobs.filter((j) => j.id !== id).length > 0,
+      activeGenerationId: state.activeGenerationId === id
+        ? (state.pendingJobs.filter((j) => j.id !== id)[0]?.id ?? null)
+        : state.activeGenerationId,
+    })),
+  setPendingJobs: (jobs) => set({
+    pendingJobs: jobs,
+    isGenerating: jobs.length > 0,
+    activeGenerationId: jobs[0]?.id ?? null,
+  }),
+  updateJobProgress: (id, progress) =>
+    set((state) => ({
+      pendingJobs: state.pendingJobs.map((j) =>
+        j.id === id ? { ...j, progress } : j,
+      ),
+    })),
+  updateJobStatus: (id, status) =>
+    set((state) => ({
+      pendingJobs: state.pendingJobs.map((j) =>
+        j.id === id ? { ...j, status } : j,
+      ),
+    })),
+  hasActiveJobs: () => get().pendingJobs.length > 0,
 }));
diff --git a/backend/.dockerignore b/backend/.dockerignore
new file mode 100644
index 0000000..e37b3ef
--- /dev/null
+++ b/backend/.dockerignore
@@ -0,0 +1,6 @@
+venv/
+__pycache__/
+*.pyc
+.claude/
+*.egg-info/
+data/
diff --git a/backend/NVIDIA.md b/backend/NVIDIA.md
new file mode 100644
index 0000000..1d1e94d
--- /dev/null
+++ b/backend/NVIDIA.md
@@ -0,0 +1,188 @@
+# NVIDIA GPU Setup for Voicebox (Ubuntu/Debian)
+
+## 1. Install NVIDIA Driver
+
+```bash
+# Ubuntu
+sudo apt update
+sudo apt install -y nvidia-driver-535
+sudo reboot
+
+# Debian
+sudo apt update
+sudo apt install -y linux-headers-$(uname -r)
+sudo apt install -y nvidia-driver
+sudo reboot
+```
+
+Verify after reboot:
+
+```bash
+nvidia-smi
+```
+
+You should see your GPU name, driver version, and CUDA version.
+
+## 2. Verify Device Nodes
+
+```bash
+ls -la /dev/nvidia*
+```
+
+You need at minimum:
+- `/dev/nvidia0` (GPU device)
+- `/dev/nvidiactl` (control device)
+- `/dev/nvidia-uvm` (CUDA unified memory)
+
+If `/dev/nvidia-uvm` is missing:
+
+```bash
+sudo modprobe nvidia-uvm
+```
+
+To make it persist across reboots:
+
+```bash
+echo "nvidia-uvm" | sudo tee /etc/modules-load.d/nvidia-uvm.conf
+```
+
+## 3. User Permissions
+
+The user running voicebox needs access to the GPU devices:
+
+```bash
+# Check which group owns the devices
+ls -la /dev/nvidia0
+# Usually: crw-rw---- 1 root video ...
+
+# Add your user (or the voicebox service user) to that group
+sudo usermod -aG video $USER
+sudo usermod -aG render $USER
+
+# Log out and back in for group changes to take effect
+```
+
+## 4a. Bare Metal Install
+
+The setup script handles everything:
+
+```bash
+sudo ./setup-linux.sh check    # verify GPU is detected
+sudo ./setup-linux.sh install  # installs with CUDA PyTorch
+```
+
+PyTorch bundles its own CUDA runtime, so you do **not** need to install the CUDA toolkit separately.
+
+## 4b. Docker Install
+
+### Install NVIDIA Container Toolkit
+
+```bash
+# Add the repo
+curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
+  | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
+
+curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
+  | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \
+  | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
+
+sudo apt update
+sudo apt install -y nvidia-container-toolkit
+```
+
+### Configure Docker Runtime
+
+```bash
+sudo nvidia-ctk runtime configure --runtime=docker
+sudo systemctl restart docker
+```
+
+### Verify GPU Access in Docker
+
+```bash
+docker run --rm --gpus all nvidia/cuda:12.1.1-base-ubuntu22.04 nvidia-smi
+```
+
+### Run Voicebox
+
+```bash
+cd backend
+docker compose up -d
+```
+
+The `docker-compose.yml` already requests GPU access. Verify:
+
+```bash
+curl http://localhost:17493/health
+# Should show: "gpu_available": true, "gpu_type": "CUDA"
+```
+
+## Troubleshooting
+
+### `nvidia-smi` works but PyTorch can't see GPU
+
+```bash
+# Inside the container or venv:
+python3 -c "import torch; print(torch.cuda.is_available()); print(torch.cuda.get_device_name(0))"
+```
+
+If `False`, the CUDA version bundled with PyTorch may not match your driver. Check compatibility:
+
+| Driver Version | Max CUDA Version |
+|---------------|-----------------|
+| 525.x         | 12.0            |
+| 535.x         | 12.2            |
+| 545.x         | 12.3            |
+| 550.x+        | 12.4            |
+
+Reinstall PyTorch with the right CUDA version if needed:
+
+```bash
+# Example for CUDA 11.8
+pip install torch --index-url https://download.pytorch.org/whl/cu118
+```
+
+### Permission denied on `/dev/nvidia*`
+
+```bash
+# Quick fix (non-persistent)
+sudo chmod 666 /dev/nvidia*
+
+# Proper fix — add user to video group
+sudo usermod -aG video voicebox
+sudo systemctl restart voicebox
+```
+
+### Docker: `could not select device driver "nvidia"`
+
+The NVIDIA Container Toolkit is not installed or Docker wasn't restarted:
+
+```bash
+sudo apt install -y nvidia-container-toolkit
+sudo nvidia-ctk runtime configure --runtime=docker
+sudo systemctl restart docker
+```
+
+### Out of memory (OOM)
+
+The 1.7B model needs ~4GB VRAM. The 0.6B model needs ~2GB. Check usage:
+
+```bash
+nvidia-smi
+
+# Use the smaller model via the API:
+curl -X POST http://localhost:17493/generate \
+  -H "Content-Type: application/json" \
+  -d '{"profile_id": "...", "text": "hello", "model_size": "0.6B"}'
+```
+
+### Models not downloading
+
+HuggingFace downloads go to `$HF_HOME` (default: `~/.cache/huggingface`). In Docker this is `/data/huggingface` inside the volume.
+
+If downloads fail behind a proxy:
+
+```bash
+export HF_ENDPOINT=https://hf-mirror.com
+export HTTPS_PROXY=http://your-proxy:port
+```
diff --git a/backend/backends/__init__.py b/backend/backends/__init__.py
index f7c47ba..b03bec7 100644
--- a/backend/backends/__init__.py
+++ b/backend/backends/__init__.py
@@ -53,10 +53,19 @@ async def generate(
         language: str = "en",
         seed: Optional[int] = None,
         instruct: Optional[str] = None,
+        progress_callback: Optional[callable] = None,
     ) -> Tuple[np.ndarray, int]:
         """
         Generate audio from text.
-        
+
+        Args:
+            text: Text to synthesize
+            voice_prompt: Voice prompt dictionary
+            language: Language code
+            seed: Random seed for reproducibility
+            instruct: Natural language instruction for speech delivery
+            progress_callback: Optional callback(progress_pct: float); progress_pct is a percentage in the range 0.0-100.0
+
         Returns:
             Tuple of (audio_array, sample_rate)
         """
diff --git a/backend/backends/mlx_backend.py b/backend/backends/mlx_backend.py
index c4ecc09..fc6429c 100644
--- a/backend/backends/mlx_backend.py
+++ b/backend/backends/mlx_backend.py
@@ -2,26 +2,64 @@
 MLX backend implementation for TTS and STT using mlx-audio.
 """
 
+import warnings
+import logging
+
+# Suppress upstream tokenizer warnings from mlx-audio/transformers.
+# Must be set BEFORE transformers is imported anywhere.
+warnings.filterwarnings("ignore", message="You are using a model of type.*to instantiate a model of type")
+warnings.filterwarnings("ignore", message=".*incorrect regex pattern.*fix_mistral_regex.*")
+
+# transformers.logging routes through its own verbosity system, not Python warnings.
+# Pre-set verbosity to ERROR so these never print even on first import.
+try:
+    import transformers as _tf_early
+    _tf_early.logging.set_verbosity_error()
+except Exception:
+    pass
+
+logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)
+logging.getLogger("transformers.convert_slow_tokenizer").setLevel(logging.ERROR)
+
+import os
+import threading
 from typing import Optional, List, Tuple
 import asyncio
 import numpy as np
 from pathlib import Path
 
+logger = logging.getLogger(__name__)
+
 from . import TTSBackend, STTBackend
 from ..utils.cache import get_cache_key, get_cached_voice_prompt, cache_voice_prompt
 from ..utils.audio import normalize_audio, load_audio
 from ..utils.progress import get_progress_manager
-from ..utils.hf_progress import HFProgressTracker, create_hf_progress_callback
+from ..utils.hf_progress import HFProgressTracker, create_hf_progress_callback, hf_offline_for_cached
 from ..utils.tasks import get_task_manager
+from ..utils.idle_timer import IdleTimer
+
+# Idle timeouts (seconds). Disabled in serverless mode — the entire
+# worker shuts down instead of unloading individual models.
+_SERVERLESS = os.environ.get("SERVERLESS", "") in ("1", "true")
+_TTS_IDLE_TIMEOUT = 0 if _SERVERLESS else 180   # 3 minutes (normal)
+_STT_IDLE_TIMEOUT = 0 if _SERVERLESS else 300   # 5 minutes (normal)
+
+# Global load lock — prevents concurrent MLX model loads which cause Metal crashes.
+_MLX_LOAD_LOCK = threading.Lock()
 
 
 class MLXTTSBackend:
     """MLX-based TTS backend using mlx-audio."""
-    
+
     def __init__(self, model_size: str = "1.7B"):
         self.model = None
         self.model_size = model_size
         self._current_model_size = None
+        self._idle_timer = IdleTimer(
+            timeout=_TTS_IDLE_TIMEOUT,
+            on_timeout=self.unload_model,
+            label="TTS",
+        )
     
     def is_loaded(self) -> bool:
         """Check if model is loaded."""
@@ -37,28 +75,29 @@ def _get_model_path(self, model_size: str) -> str:
         Returns:
             HuggingFace Hub model ID for MLX
         """
-        # MLX model mapping
+        # MLX model mapping.
+        # Use Base variants — these accept ref_audio/ref_text for voice cloning.
+        # CustomVoice variants require a named speaker ('Chelsie', 'Ethan', etc.)
+        # and don't support arbitrary voice cloning.
+        # 4-bit quantized: ~900MB (1.7B) / ~300MB (0.6B) vs ~3.4GB for bf16.
         mlx_model_map = {
-            "1.7B": "mlx-community/Qwen3-TTS-12Hz-1.7B-Base-bf16",
-            # 0.6B not yet converted to MLX format
-            "0.6B": "mlx-community/Qwen3-TTS-12Hz-1.7B-Base-bf16",  # Fallback to 1.7B
+            "1.7B": "mlx-community/Qwen3-TTS-12Hz-1.7B-Base-4bit",
+            "0.6B": "mlx-community/Qwen3-TTS-12Hz-0.6B-Base-4bit",
         }
         
         if model_size not in mlx_model_map:
             raise ValueError(f"Unknown model size: {model_size}")
         
         hf_model_id = mlx_model_map[model_size]
-        print(f"Will download MLX model from HuggingFace Hub: {hf_model_id}")
-        
         return hf_model_id
-    
+
     def _is_model_cached(self, model_size: str) -> bool:
         """
         Check if the model is already cached locally AND fully downloaded.
-        
+
         Args:
             model_size: Model size to check
-            
+
         Returns:
             True if model is fully cached, False if missing or incomplete
         """
@@ -73,9 +112,9 @@ def _is_model_cached(self, model_size: str) -> bool:
             # Check for .incomplete files - if any exist, download is still in progress
             blobs_dir = repo_cache / "blobs"
             if blobs_dir.exists() and any(blobs_dir.glob("*.incomplete")):
-                print(f"[_is_model_cached] Found .incomplete files for {model_size}, treating as not cached")
+                logger.debug(f"[_is_model_cached] Found .incomplete files for {model_size}, treating as not cached")
                 return False
-            
+
             # Check that actual model weight files exist in snapshots
             snapshots_dir = repo_cache / "snapshots"
             if snapshots_dir.exists():
@@ -85,65 +124,71 @@ def _is_model_cached(self, model_size: str) -> bool:
                     any(snapshots_dir.rglob("*.npz"))
                 )
                 if not has_weights:
-                    print(f"[_is_model_cached] No model weights found for {model_size}, treating as not cached")
+                    logger.debug(f"[_is_model_cached] No model weights found for {model_size}, treating as not cached")
                     return False
-            
+
             return True
         except Exception as e:
-            print(f"[_is_model_cached] Error checking cache for {model_size}: {e}")
+            logger.debug(f"[_is_model_cached] Error checking cache for {model_size}: {e}")
             return False
-    
+
     async def load_model_async(self, model_size: Optional[str] = None):
         """
         Lazy load the MLX TTS model.
-        
+
         Args:
             model_size: Model size to load (1.7B or 0.6B)
         """
         if model_size is None:
             model_size = self.model_size
-            
-        # If already loaded with correct size, return
+
+        # Fast path — already loaded, no lock needed
         if self.model is not None and self._current_model_size == model_size:
+            self._idle_timer.touch()
             return
-        
-        # Unload existing model if different size requested
-        if self.model is not None and self._current_model_size != model_size:
-            self.unload_model()
-        
-        # Run blocking load in thread pool
-        await asyncio.to_thread(self._load_model_sync, model_size)
-    
+
+        # Serialize loads via a threading lock run in the thread pool.
+        # Concurrent MLX loads cause Metal command buffer crashes.
+        def _locked_load():
+            with _MLX_LOAD_LOCK:
+                # Re-check inside the lock — another caller may have loaded while we waited
+                if self.model is not None and self._current_model_size == model_size:
+                    logger.debug(f"[TTS] Load skipped — model {model_size} already loaded by concurrent caller")
+                    return
+                if self.model is not None and self._current_model_size != model_size:
+                    self.unload_model()
+                is_cached = self._is_model_cached(model_size)
+                self._load_model_sync(model_size, is_cached)
+
+        await asyncio.to_thread(_locked_load)
+        self._idle_timer.touch()
+
     # Alias for compatibility
     load_model = load_model_async
-    
-    def _load_model_sync(self, model_size: str):
+
+    def _load_model_sync(self, model_size: str, is_cached: bool = False):
         """Synchronous model loading."""
         try:
             # Get model path BEFORE importing mlx_audio
             model_path = self._get_model_path(model_size)
-            
+
             # Set up progress tracking
             progress_manager = get_progress_manager()
             task_manager = get_task_manager()
             model_name = f"qwen-tts-{model_size}"
             
-            # Check if model is already cached
-            is_cached = self._is_model_cached(model_size)
-            
             # Set up progress callback
             # If cached: filter out non-download progress
             # If not cached: report all progress (we're actually downloading)
             progress_callback = create_hf_progress_callback(model_name, progress_manager)
             tracker = HFProgressTracker(progress_callback, filter_non_downloads=is_cached)
             
-            print(f"Loading MLX TTS model {model_size}...")
-            
-            # Only track download progress if model is NOT cached
+            logger.info(f"Loading MLX TTS model {model_size}...")
+
             if not is_cached:
                 # Start tracking download task
                 task_manager.start_download(model_name)
-                
+
                 # Initialize progress state so SSE endpoint has initial data to send
                 # This provides immediate feedback while HuggingFace fetches metadata
                 progress_manager.update_progress(
@@ -153,34 +198,40 @@ def _load_model_sync(self, model_size: str):
                     filename="Connecting to HuggingFace...",
                     status="downloading",
                 )
-            
-            # IMPORTANT: Patch tqdm BEFORE importing mlx_audio
-            # Otherwise mlx_audio caches reference to original tqdm
-            tracker_context = tracker.patch_download()
-            tracker_context.__enter__()
-            
-            # Import mlx_audio AFTER patching tqdm
-            from mlx_audio.tts import load
-            
-            # Load MLX model (downloads automatically)
-            try:
+            else:
+                # Emit a "loading" status so the UI can show a spinner while the
+                # cached model is being loaded into GPU memory (can take a few seconds).
+                progress_manager.update_progress(
+                    model_name=model_name,
+                    current=0,
+                    total=0,
+                    filename="Loading model into memory...",
+                    status="loading",
+                )
+
+            # Patch tqdm BEFORE importing mlx_audio, use proper context manager
+            # to ensure cleanup even if model loading crashes.
+            # When cached, set HF_HUB_OFFLINE to skip remote "Fetching N files" validation.
+            with tracker.patch_download(), hf_offline_for_cached(is_cached):
+                from mlx_audio.tts import load
                 self.model = load(model_path)
-            finally:
-                # Exit the patch context
-                tracker_context.__exit__(None, None, None)
-            
-            # Only mark download as complete if we were tracking it
+
             if not is_cached:
                 progress_manager.mark_complete(model_name)
                 task_manager.complete_download(model_name)
-            
+            else:
+                # Clear the loading status so future SSE subscribers don't see stale data
+                progress_manager.clear_progress(model_name)
+
             self._current_model_size = model_size
             self.model_size = model_size
-            
-            print(f"MLX TTS model {model_size} loaded successfully")
-            
+
+            logger.info(f"MLX TTS model {model_size} loaded successfully")
+
         except ImportError as e:
-            print(f"Error: mlx_audio package not found. Install with: pip install mlx-audio")
+            logger.error(f"Error: mlx_audio package not found. Install with: pip install mlx-audio")
+            self.model = None
+            self._current_model_size = None
             progress_manager = get_progress_manager()
             task_manager = get_task_manager()
             model_name = f"qwen-tts-{model_size}"
@@ -188,22 +239,26 @@ def _load_model_sync(self, model_size: str):
             task_manager.error_download(model_name, str(e))
             raise
         except Exception as e:
-            print(f"Error loading MLX TTS model: {e}")
+            logger.error(f"Error loading MLX TTS model: {e}")
+            self.model = None
+            self._current_model_size = None
             progress_manager = get_progress_manager()
             task_manager = get_task_manager()
             model_name = f"qwen-tts-{model_size}"
             progress_manager.mark_error(model_name, str(e))
             task_manager.error_download(model_name, str(e))
             raise
-    
+
     def unload_model(self):
         """Unload the model to free memory."""
+        self._idle_timer.cancel()
         if self.model is not None:
+            size = self._current_model_size or "unknown"
             del self.model
             self.model = None
             self._current_model_size = None
-            print("MLX TTS model unloaded")
-    
+            logger.info(f"MLX TTS model unloaded ({size})")
+
     async def create_voice_prompt(
         self,
         audio_path: str,
@@ -239,7 +294,7 @@ async def create_voice_prompt(
                         return cached_prompt, True
                     else:
                         # Cached file no longer exists, invalidate cache
-                        print(f"Cached audio file not found: {cached_audio_path}, regenerating prompt")
+                        logger.debug(f"Cached audio file not found: {cached_audio_path}, regenerating prompt")
         
         # MLX voice prompt format - store audio path and text
         # The model will process this during generation
@@ -274,12 +329,12 @@ async def combine_voice_prompts(
         
         for audio_path in audio_paths:
             audio, sr = load_audio(audio_path)
-            audio = normalize_audio(audio)
+            audio = normalize_audio(audio, sample_rate=sr)
             combined_audio.append(audio)
-        
+
         # Concatenate audio
         mixed = np.concatenate(combined_audio)
-        mixed = normalize_audio(mixed)
+        mixed = normalize_audio(mixed, sample_rate=sr)
         
         # Combine texts
         combined_text = " ".join(reference_texts)
@@ -293,6 +348,7 @@ async def generate(
         language: str = "en",
         seed: Optional[int] = None,
         instruct: Optional[str] = None,
+        progress_callback: Optional[callable] = None,
     ) -> Tuple[np.ndarray, int]:
         """
         Generate audio from text using voice prompt.
@@ -303,93 +359,168 @@ async def generate(
             language: Language code (en or zh) - may not be fully supported by MLX
             seed: Random seed for reproducibility
             instruct: Natural language instruction (may not be supported by MLX)
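+            progress_callback: Optional callback(progress_pct: float); progress_pct is a percentage in the range 0.0-100.0 (this backend reports at most 95.0 while chunks are generated)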
 
         Returns:
             Tuple of (audio_array, sample_rate)
         """
         await self.load_model_async(None)
 
-        print(f"Generating audio for text: {text}")
+        import time as _time
+        text_preview = text[:80] + ("..." if len(text) > 80 else "")
+        logger.info(f"[TTS] Generating: \"{text_preview}\" (lang={language}, model={self._current_model_size})")
+
+        gen_start = _time.perf_counter()
 
         def _generate_sync():
             """Run synchronous generation in thread pool."""
+            import mlx.core as mx
+
             # MLX generate() returns a generator yielding GenerationResult objects
             audio_chunks = []
             sample_rate = 24000
-            
+
+            # Estimate total chunks for progress reporting
+            estimated_chunks = max(1, len(text) // 2)
+            chunk_count = 0
+
             # Set seed if provided (MLX uses numpy random)
             if seed is not None:
-                import mlx.core as mx
                 np.random.seed(seed)
                 mx.random.seed(seed)
-            
+
             # Extract voice prompt info
-            ref_audio = voice_prompt.get("ref_audio") or voice_prompt.get("ref_audio_path")
+            ref_audio_path = voice_prompt.get("ref_audio") or voice_prompt.get("ref_audio_path")
             ref_text = voice_prompt.get("ref_text", "")
-            
-            # Validate that the audio file exists
-            if ref_audio and not Path(ref_audio).exists():
-                print(f"Warning: Audio file not found: {ref_audio}")
-                print("This may be due to a cached voice prompt referencing a deleted temp file.")
-                print("Regenerating without voice prompt.")
-                ref_audio = None
-            
-            # Check if model supports voice cloning via generate method
-            # MLX API may support ref_audio parameter directly
+            ref_audio = None
+
+            # Validate that the audio file exists and load it
+            if ref_audio_path and Path(ref_audio_path).exists():
+                try:
+                    ref_audio_data, _ = load_audio(ref_audio_path, sample_rate=24000)
+                    ref_audio = mx.array(ref_audio_data.astype(np.float32))
+                    mx.eval(ref_audio)
+                    dur_s = ref_audio.shape[0] / 24000
+                    logger.debug(f"[TTS] Reference audio loaded: {dur_s:.1f}s ({ref_audio.shape[0]} samples)")
+                except Exception as e:
+                    logger.warning(f"[TTS] Warning: Failed to load reference audio: {e}")
+                    ref_audio = None
+            elif ref_audio_path:
+                logger.warning(f"[TTS] Warning: Audio file not found: {ref_audio_path}")
+                logger.warning("[TTS] Regenerating without voice prompt (cached prompt may reference deleted temp file).")
+
+            def _process_results(generator):
+                """Collect audio chunks from model generator."""
+                nonlocal chunk_count, sample_rate
+                for result in generator:
+                    audio_chunks.append(np.array(result.audio))
+                    sample_rate = result.sample_rate
+                    chunk_count += 1
+                    if progress_callback:
+                        pct = min(95.0, (chunk_count / estimated_chunks) * 100.0)
+                        progress_callback(pct)
+                    if chunk_count % 5 == 0 or chunk_count == 1:
+                        elapsed = _time.perf_counter() - gen_start
+                        logger.debug(f"[TTS] Chunk {chunk_count} generated ({elapsed:.1f}s elapsed)")
+
+            # Generate with or without voice cloning
             try:
-                # Try with voice cloning parameters if supported
-                if ref_audio:
-                    # Check if generate accepts ref_audio parameter
+                if ref_audio is not None:
                     import inspect
                     sig = inspect.signature(self.model.generate)
                     if "ref_audio" in sig.parameters:
-                        # Generate with voice cloning
-                        for result in self.model.generate(text, ref_audio=ref_audio, ref_text=ref_text):
-                            audio_chunks.append(np.array(result.audio))
-                            sample_rate = result.sample_rate
+                        logger.debug(f"[TTS] Starting voice-cloned generation (ref_text: \"{ref_text[:50]}...\")")
+                        _process_results(self.model.generate(text, ref_audio=ref_audio, ref_text=ref_text))
                     else:
-                        # Fallback: generate without voice cloning
-                        for result in self.model.generate(text):
-                            audio_chunks.append(np.array(result.audio))
-                            sample_rate = result.sample_rate
+                        logger.debug("[TTS] Starting generation (model doesn't support ref_audio)")
+                        _process_results(self.model.generate(text))
                 else:
-                    # No voice prompt, generate normally
-                    for result in self.model.generate(text):
-                        audio_chunks.append(np.array(result.audio))
-                        sample_rate = result.sample_rate
+                    logger.debug("[TTS] Starting generation (no reference audio)")
+                    _process_results(self.model.generate(text))
             except Exception as e:
-                # If voice cloning fails, try without it
-                print(f"Warning: Voice cloning failed, generating without voice prompt: {e}")
-                for result in self.model.generate(text):
-                    audio_chunks.append(np.array(result.audio))
-                    sample_rate = result.sample_rate
-            
+                # If cancelled or model unloaded, don't try to fall back
+                if "cancel" in str(e).lower() or self.model is None:
+                    raise
+                logger.warning(f"[TTS] Warning: Voice cloning failed, falling back to uncloned: {e}", exc_info=True)
+                audio_chunks.clear()
+                chunk_count = 0
+                _process_results(self.model.generate(text))
+
             # Concatenate all chunks
             if audio_chunks:
                 audio = np.concatenate([np.asarray(chunk, dtype=np.float32) for chunk in audio_chunks])
             else:
-                # Fallback: empty audio
                 audio = np.array([], dtype=np.float32)
-            
+
             return audio, sample_rate
 
         # Run blocking inference in thread pool
         audio, sample_rate = await asyncio.to_thread(_generate_sync)
 
+        elapsed = _time.perf_counter() - gen_start
+        duration = len(audio) / sample_rate if sample_rate > 0 else 0
+        logger.info(f"[TTS] Generation complete: {duration:.1f}s audio in {elapsed:.1f}s (x{duration/elapsed:.1f} realtime)")
+
         return audio, sample_rate
 
 
 class MLXSTTBackend:
     """MLX-based STT backend using mlx-audio Whisper."""
-    
-    def __init__(self, model_size: str = "base"):
+
+    def __init__(self, model_size: Optional[str] = None):
         self.model = None
+        if model_size is None:
+            model_size = "base"
         self.model_size = model_size
+        self._current_model_size = None
+        self._idle_timer = IdleTimer(
+            timeout=_STT_IDLE_TIMEOUT,
+            on_timeout=self.unload_model,
+            label="STT",
+        )
     
     def is_loaded(self) -> bool:
         """Check if model is loaded."""
         return self.model is not None
     
+    @staticmethod
+    def get_mlx_whisper_model_map() -> dict:
+        """
+        Get the mapping of model sizes to MLX model IDs.
+        
+        Returns:
+            Dictionary mapping model size to HuggingFace model ID
+        """
+        # Use the new ASR-specific models that work with mlx_audio.stt.load()
+        return {
+            "tiny": "mlx-community/whisper-tiny-asr-fp16",
+            "base": "mlx-community/whisper-base-asr-fp16",
+            "small": "mlx-community/whisper-small-asr-fp16",
+            "medium": "mlx-community/whisper-medium-asr-fp16",
+            "large": "mlx-community/whisper-large-asr-fp16",
+            "large-v2": "mlx-community/whisper-large-v2-asr-fp16",
+            "large-v3": "mlx-community/whisper-large-v3-asr-fp16",
+            "large-v3-turbo": "mlx-community/whisper-large-v3-turbo-asr-fp16",
+        }
+    
+    def _get_model_path(self, model_size: str) -> str:
+        """
+        Get the MLX Whisper model path.
+        
+        Args:
+            model_size: Model size (tiny, base, small, medium, large, large-v2, large-v3, large-v3-turbo)
+            
+        Returns:
+            HuggingFace Hub model ID for MLX Whisper
+        """
+        mlx_model_map = self.get_mlx_whisper_model_map()
+        
+        if model_size not in mlx_model_map:
+            raise ValueError(f"Unknown Whisper model size: {model_size}. Available sizes: {list(mlx_model_map.keys())}")
+        
+        hf_model_id = mlx_model_map[model_size]
+        return hf_model_id
+    
     def _is_model_cached(self, model_size: str) -> bool:
         """
         Check if the Whisper model is already cached locally AND fully downloaded.
@@ -402,8 +533,8 @@ def _is_model_cached(self, model_size: str) -> bool:
         """
         try:
             from huggingface_hub import constants as hf_constants
-            model_name = f"openai/whisper-{model_size}"
-            repo_cache = Path(hf_constants.HF_HUB_CACHE) / ("models--" + model_name.replace("/", "--"))
+            model_path = self._get_model_path(model_size)
+            repo_cache = Path(hf_constants.HF_HUB_CACHE) / ("models--" + model_path.replace("/", "--"))
             
             if not repo_cache.exists():
                 return False
@@ -411,9 +542,9 @@ def _is_model_cached(self, model_size: str) -> bool:
             # Check for .incomplete files - if any exist, download is still in progress
             blobs_dir = repo_cache / "blobs"
             if blobs_dir.exists() and any(blobs_dir.glob("*.incomplete")):
-                print(f"[_is_model_cached] Found .incomplete files for whisper-{model_size}, treating as not cached")
+                logger.debug(f"[_is_model_cached] Found .incomplete files for whisper-{model_size}, treating as not cached")
                 return False
-            
+
             # Check that actual model weight files exist in snapshots
             snapshots_dir = repo_cache / "snapshots"
             if snapshots_dir.exists():
@@ -423,12 +554,12 @@ def _is_model_cached(self, model_size: str) -> bool:
                     any(snapshots_dir.rglob("*.npz"))
                 )
                 if not has_weights:
-                    print(f"[_is_model_cached] No model weights found for whisper-{model_size}, treating as not cached")
+                    logger.debug(f"[_is_model_cached] No model weights found for whisper-{model_size}, treating as not cached")
                     return False
-            
+
             return True
         except Exception as e:
-            print(f"[_is_model_cached] Error checking cache for whisper-{model_size}: {e}")
+            logger.debug(f"[_is_model_cached] Error checking cache for whisper-{model_size}: {e}")
             return False
     
     async def load_model_async(self, model_size: Optional[str] = None):
@@ -436,53 +567,53 @@ async def load_model_async(self, model_size: Optional[str] = None):
         Lazy load the MLX Whisper model.
         
         Args:
-            model_size: Model size (tiny, base, small, medium, large)
+            model_size: Model size (tiny, base, small, medium, large, large-v2, large-v3, large-v3-turbo)
         """
         if model_size is None:
             model_size = self.model_size
         
-        if self.model is not None and self.model_size == model_size:
+        # If already loaded with correct size, return
+        if self.model is not None and self._current_model_size == model_size:
+            self._idle_timer.touch()
             return
-        
+
+        # Unload existing model if different size requested
+        if self.model is not None and self._current_model_size != model_size:
+            self.unload_model()
+
+        # Check cache before entering thread pool so we can skip redundant checks
+        is_cached = self._is_model_cached(model_size)
+
         # Run blocking load in thread pool
-        await asyncio.to_thread(self._load_model_sync, model_size)
-    
+        await asyncio.to_thread(self._load_model_sync, model_size, is_cached)
+        self._idle_timer.touch()
+
     # Alias for compatibility
     load_model = load_model_async
-    
-    def _load_model_sync(self, model_size: str):
+
+    def _load_model_sync(self, model_size: str, is_cached: bool = False):
         """Synchronous model loading."""
         try:
+            # Get model path BEFORE importing mlx_audio
+            model_path = self._get_model_path(model_size)
+
             progress_manager = get_progress_manager()
             task_manager = get_task_manager()
             progress_model_name = f"whisper-{model_size}"
 
-            # Check if model is already cached
-            is_cached = self._is_model_cached(model_size)
-
             # Set up progress callback and tracker
             # If cached: filter out non-download progress
             # If not cached: report all progress (we're actually downloading)
             progress_callback = create_hf_progress_callback(progress_model_name, progress_manager)
             tracker = HFProgressTracker(progress_callback, filter_non_downloads=is_cached)
 
-            # Patch tqdm BEFORE importing mlx_audio
-            tracker_context = tracker.patch_download()
-            tracker_context.__enter__()
-
-            # Import mlx_audio
-            from mlx_audio.stt import load
-
-            # MLX Whisper uses the standard OpenAI models
-            model_name = f"openai/whisper-{model_size}"
-
-            print(f"Loading MLX Whisper model {model_size}...")
+            logger.info(f"Loading MLX Whisper model {model_size}...")
 
             # Only track download progress if model is NOT cached
             if not is_cached:
                 # Start tracking download task
                 task_manager.start_download(progress_model_name)
-                
+
                 # Initialize progress state so SSE endpoint has initial data to send
                 progress_manager.update_progress(
                     model_name=progress_model_name,
@@ -492,24 +623,44 @@ def _load_model_sync(self, model_size: str):
                     status="downloading",
                 )
 
-            # Load the model (tqdm is patched, but filters out non-download progress)
-            try:
-                self.model = load(model_name)
-            finally:
-                # Exit the patch context
-                tracker_context.__exit__(None, None, None)
-            
+            # Patch tqdm BEFORE importing mlx_audio, use proper context manager
+            # to ensure cleanup even if model loading crashes.
+            # When cached, set HF_HUB_OFFLINE to skip remote "Fetching N files" validation.
+            with tracker.patch_download(), hf_offline_for_cached(is_cached):
+                # Import the proper load function (from_pretrained is deprecated and doesn't load the processor)
+                from mlx_audio.stt import load as stt_load
+
+                # Load the model using stt.load() which properly initializes the HuggingFace processor
+                # This is required for the tokenizer/generate functionality to work
+                self.model = stt_load(model_path)
+
+            # Verify the processor was loaded — the ASR-specific MLX repos sometimes
+            # don't include the HuggingFace processor files, causing post_load_hook
+            # to silently set _processor = None.  Fall back to the original OpenAI repo.
+            # NOTE: must happen AFTER offline context exits so we can fetch from HF if needed.
+            if getattr(self.model, '_processor', None) is None:
+                logger.warning(f"Whisper processor missing after load — loading from openai/whisper-{model_size}")
+                try:
+                    from transformers import WhisperProcessor
+                    self.model._processor = WhisperProcessor.from_pretrained(f"openai/whisper-{model_size}")
+                    logger.info(f"Whisper processor loaded successfully from openai/whisper-{model_size}")
+                except Exception as proc_err:
+                    logger.warning(f"WARNING: Could not load WhisperProcessor fallback: {proc_err}")
+
             # Only mark download as complete if we were tracking it
             if not is_cached:
                 progress_manager.mark_complete(progress_model_name)
                 task_manager.complete_download(progress_model_name)
-            
+
+            self._current_model_size = model_size
             self.model_size = model_size
-            
-            print(f"MLX Whisper model {model_size} loaded successfully")
-            
+
+            logger.info(f"MLX Whisper model {model_size} loaded successfully")
+
         except ImportError as e:
-            print(f"Error: mlx_audio package not found. Install with: pip install mlx-audio")
+            logger.error(f"Error: mlx_audio package not found. Install with: pip install mlx-audio")
+            self.model = None
+            self._current_model_size = None
             progress_manager = get_progress_manager()
             task_manager = get_task_manager()
             progress_model_name = f"whisper-{model_size}"
@@ -517,21 +668,26 @@ def _load_model_sync(self, model_size: str):
             task_manager.error_download(progress_model_name, str(e))
             raise
         except Exception as e:
-            print(f"Error loading MLX Whisper model: {e}")
+            logger.error(f"Error loading MLX Whisper model: {e}")
+            self.model = None
+            self._current_model_size = None
             progress_manager = get_progress_manager()
             task_manager = get_task_manager()
             progress_model_name = f"whisper-{model_size}"
             progress_manager.mark_error(progress_model_name, str(e))
             task_manager.error_download(progress_model_name, str(e))
             raise
-    
+
     def unload_model(self):
         """Unload the model to free memory."""
+        self._idle_timer.cancel()
         if self.model is not None:
+            size = self._current_model_size or "unknown"
             del self.model
             self.model = None
-            print("MLX Whisper model unloaded")
-    
+            self._current_model_size = None
+            logger.info(f"MLX Whisper model unloaded ({size})")
+
     async def transcribe(
         self,
         audio_path: str,
@@ -549,15 +705,38 @@ async def transcribe(
         """
         await self.load_model_async(None)
 
+        # Ensure the processor is available — it may be missing if the model
+        # was loaded before the fallback fix, or if post_load_hook failed.
+        if self.model is not None and getattr(self.model, '_processor', None) is None:
+            logger.warning(f"[STT] Whisper processor missing at transcribe time — loading from openai/whisper-{self.model_size}")
+            try:
+                from transformers import WhisperProcessor
+                self.model._processor = WhisperProcessor.from_pretrained(f"openai/whisper-{self.model_size}")
+                logger.info(f"[STT] Whisper processor loaded successfully from openai/whisper-{self.model_size}")
+            except Exception as proc_err:
+                raise RuntimeError(
+                    f"Whisper processor not available. Try restarting the server. Details: {proc_err}"
+                ) from proc_err
+
         def _transcribe_sync():
             """Run synchronous transcription in thread pool."""
+            import numpy as np
+
+            # Load audio ourselves to handle any format (mp3, wav, etc.)
+            # Whisper expects 16kHz mono audio
+            audio, sr = load_audio(audio_path, sample_rate=16000)
+
+            # Ensure it's a numpy array (model.generate expects numpy or mx.array)
+            if not isinstance(audio, np.ndarray):
+                audio = np.array(audio)
+
             # MLX Whisper transcription using generate method
-            # The generate method accepts audio path directly
             decode_options = {}
             if language:
                 decode_options["language"] = language
 
-            result = self.model.generate(str(audio_path), **decode_options)
+            # Pass audio array instead of path to avoid format detection issues
+            result = self.model.generate(audio, **decode_options)
 
             # Extract text from result
             if isinstance(result, str):
diff --git a/backend/backends/pytorch_backend.py b/backend/backends/pytorch_backend.py
index 26f3872..4e156a7 100644
--- a/backend/backends/pytorch_backend.py
+++ b/backend/backends/pytorch_backend.py
@@ -2,18 +2,29 @@
 PyTorch backend implementation for TTS and STT.
 """
 
+import logging
+import os
 from typing import Optional, List, Tuple
 import asyncio
 import torch
 import numpy as np
 from pathlib import Path
 
+logger = logging.getLogger(__name__)
+
 from . import TTSBackend, STTBackend
 from ..utils.cache import get_cache_key, get_cached_voice_prompt, cache_voice_prompt
 from ..utils.audio import normalize_audio, load_audio
 from ..utils.progress import get_progress_manager
-from ..utils.hf_progress import HFProgressTracker, create_hf_progress_callback
+from ..utils.hf_progress import HFProgressTracker, create_hf_progress_callback, hf_offline_for_cached
 from ..utils.tasks import get_task_manager
+from ..utils.idle_timer import IdleTimer
+
+# Idle timeouts (seconds). Disabled in serverless mode — the entire
+# worker shuts down instead of unloading individual models.
+_SERVERLESS = os.environ.get("SERVERLESS", "") in ("1", "true")
+_TTS_IDLE_TIMEOUT = 0 if _SERVERLESS else 180   # 3 minutes (normal)
+_STT_IDLE_TIMEOUT = 0 if _SERVERLESS else 300   # 5 minutes (normal)
 
 
 class PyTorchTTSBackend:
@@ -24,6 +35,11 @@ def __init__(self, model_size: str = "1.7B"):
         self.model_size = model_size
         self.device = self._get_device()
         self._current_model_size = None
+        self._idle_timer = IdleTimer(
+            timeout=_TTS_IDLE_TIMEOUT,
+            on_timeout=self.unload_model,
+            label="TTS",
+        )
     
     def _get_device(self) -> str:
         """Get the best available device."""
@@ -79,7 +95,7 @@ def _is_model_cached(self, model_size: str) -> bool:
             # Check for .incomplete files - if any exist, download is still in progress
             blobs_dir = repo_cache / "blobs"
             if blobs_dir.exists() and any(blobs_dir.glob("*.incomplete")):
-                print(f"[_is_model_cached] Found .incomplete files for {model_size}, treating as not cached")
+                logger.debug(f"[_is_model_cached] Found .incomplete files for {model_size}, treating as not cached")
                 return False
             
             # Check that actual model weight files exist in snapshots
@@ -90,14 +106,14 @@ def _is_model_cached(self, model_size: str) -> bool:
                     any(snapshots_dir.rglob("*.bin"))
                 )
                 if not has_weights:
-                    print(f"[_is_model_cached] No model weights found for {model_size}, treating as not cached")
+                    logger.debug(f"[_is_model_cached] No model weights found for {model_size}, treating as not cached")
                     return False
             
             return True
         except Exception as e:
-            print(f"[_is_model_cached] Error checking cache for {model_size}: {e}")
+            logger.warning(f"[_is_model_cached] Error checking cache for {model_size}: {e}")
             return False
-    
+
     async def load_model_async(self, model_size: Optional[str] = None):
         """
         Lazy load the TTS model with automatic downloading from HuggingFace Hub.
@@ -110,47 +126,41 @@ async def load_model_async(self, model_size: Optional[str] = None):
             
         # If already loaded with correct size, return
         if self.model is not None and self._current_model_size == model_size:
+            self._idle_timer.touch()
             return
-        
+
         # Unload existing model if different size requested
         if self.model is not None and self._current_model_size != model_size:
             self.unload_model()
-        
+
+        # Check cache before entering thread pool
+        is_cached = self._is_model_cached(model_size)
+
         # Run blocking load in thread pool
-        await asyncio.to_thread(self._load_model_sync, model_size)
-    
+        await asyncio.to_thread(self._load_model_sync, model_size, is_cached)
+        self._idle_timer.touch()
+
     # Alias for compatibility
     load_model = load_model_async
-    
-    def _load_model_sync(self, model_size: str):
+
+    def _load_model_sync(self, model_size: str, is_cached: bool = False):
         """Synchronous model loading."""
         try:
             progress_manager = get_progress_manager()
             task_manager = get_task_manager()
             model_name = f"qwen-tts-{model_size}"
 
-            # Check if model is already cached
-            is_cached = self._is_model_cached(model_size)
-
             # Set up progress callback and tracker
             # If cached: filter out non-download progress (like "Segment 1/1" during generation)
             # If not cached: report all progress (we're actually downloading)
             progress_callback = create_hf_progress_callback(model_name, progress_manager)
             tracker = HFProgressTracker(progress_callback, filter_non_downloads=is_cached)
 
-            # Patch tqdm BEFORE importing qwen_tts
-            tracker_context = tracker.patch_download()
-            tracker_context.__enter__()
-
-            # Import qwen_tts
-            from qwen_tts import Qwen3TTSModel
-
             # Get model path (local or HuggingFace Hub ID)
             model_path = self._get_model_path(model_size)
 
-            print(f"Loading TTS model {model_size} on {self.device}...")
+            logger.info(f"Loading TTS model {model_size} on {self.device}...")
 
-            # Only track download progress if model is NOT cached
             if not is_cached:
                 # Start tracking download task
                 task_manager.start_download(model_name)
@@ -163,30 +173,40 @@ def _load_model_sync(self, model_size: str):
                     filename="Connecting to HuggingFace...",
                     status="downloading",
                 )
+            else:
+                # Emit a "loading" status so the UI can show a spinner while the
+                # cached model is being loaded into GPU memory.
+                progress_manager.update_progress(
+                    model_name=model_name,
+                    current=0,
+                    total=0,
+                    filename="Loading model into memory...",
+                    status="loading",
+                )
+
+            # Patch tqdm and use HF offline mode for cached models to skip remote validation
+            with tracker.patch_download(), hf_offline_for_cached(is_cached):
+                from qwen_tts import Qwen3TTSModel
 
-            # Load the model (tqdm is patched, but filters out non-download progress)
-            try:
                 self.model = Qwen3TTSModel.from_pretrained(
                     model_path,
                     device_map=self.device,
                     torch_dtype=torch.float32 if self.device == "cpu" else torch.bfloat16,
                 )
-            finally:
-                # Exit the patch context
-                tracker_context.__exit__(None, None, None)
-            
-            # Only mark download as complete if we were tracking it
+
             if not is_cached:
                 progress_manager.mark_complete(model_name)
                 task_manager.complete_download(model_name)
+            else:
+                progress_manager.clear_progress(model_name)
             
             self._current_model_size = model_size
             self.model_size = model_size
             
-            print(f"TTS model {model_size} loaded successfully")
+            logger.info(f"TTS model {model_size} loaded successfully")
             
         except ImportError as e:
-            print(f"Error: qwen_tts package not found. Install with: pip install git+https://github.com/QwenLM/Qwen3-TTS.git")
+            logger.error(f"Error: qwen_tts package not found. Install with: pip install git+https://github.com/QwenLM/Qwen3-TTS.git")
             progress_manager = get_progress_manager()
             task_manager = get_task_manager()
             model_name = f"qwen-tts-{model_size}"
@@ -194,8 +214,8 @@ def _load_model_sync(self, model_size: str):
             task_manager.error_download(model_name, str(e))
             raise
         except Exception as e:
-            print(f"Error loading TTS model: {e}")
-            print(f"Tip: The model will be automatically downloaded from HuggingFace Hub on first use.")
+            logger.error(f"Error loading TTS model: {e}")
+            logger.info(f"Tip: The model will be automatically downloaded from HuggingFace Hub on first use.")
             progress_manager = get_progress_manager()
             task_manager = get_task_manager()
             model_name = f"qwen-tts-{model_size}"
@@ -205,15 +225,16 @@ def _load_model_sync(self, model_size: str):
     
     def unload_model(self):
         """Unload the model to free memory."""
+        self._idle_timer.cancel()
         if self.model is not None:
             del self.model
             self.model = None
             self._current_model_size = None
-            
+
             if torch.cuda.is_available():
                 torch.cuda.empty_cache()
-            
-            print("TTS model unloaded")
+
+            logger.info("TTS model unloaded")
     
     async def create_voice_prompt(
         self,
@@ -287,12 +308,12 @@ async def combine_voice_prompts(
         
         for audio_path in audio_paths:
             audio, sr = load_audio(audio_path)
-            audio = normalize_audio(audio)
+            audio = normalize_audio(audio, sample_rate=sr)
             combined_audio.append(audio)
-        
+
         # Concatenate audio
         mixed = np.concatenate(combined_audio)
-        mixed = normalize_audio(mixed)
+        mixed = normalize_audio(mixed, sample_rate=sr)
         
         # Combine texts
         combined_text = " ".join(reference_texts)
@@ -306,6 +327,7 @@ async def generate(
         language: str = "en",
         seed: Optional[int] = None,
         instruct: Optional[str] = None,
+        progress_callback: Optional[callable] = None,
     ) -> Tuple[np.ndarray, int]:
         """
         Generate audio from text using voice prompt.
@@ -316,6 +338,7 @@ async def generate(
             language: Language code (en or zh)
             seed: Random seed for reproducibility
             instruct: Natural language instruction for speech delivery control
+            progress_callback: Optional callback(progress_pct: float) where 0.0-100.0
 
         Returns:
             Tuple of (audio_array, sample_rate)
@@ -325,6 +348,9 @@ async def generate(
 
         def _generate_sync():
             """Run synchronous generation in thread pool."""
+            if progress_callback:
+                progress_callback(0.0)
+
             # Set seed if provided
             if seed is not None:
                 torch.manual_seed(seed)
@@ -337,6 +363,10 @@ def _generate_sync():
                 voice_clone_prompt=voice_prompt,
                 instruct=instruct,
             )
+
+            if progress_callback:
+                progress_callback(100.0)
+
             return wavs[0], sample_rate
 
         # Run blocking inference in thread pool to avoid blocking event loop
@@ -353,6 +383,11 @@ def __init__(self, model_size: str = "base"):
         self.processor = None
         self.model_size = model_size
         self.device = self._get_device()
+        self._idle_timer = IdleTimer(
+            timeout=_STT_IDLE_TIMEOUT,
+            on_timeout=self.unload_model,
+            label="STT",
+        )
     
     def _get_device(self) -> str:
         """Get the best available device."""
@@ -388,7 +423,7 @@ def _is_model_cached(self, model_size: str) -> bool:
             # Check for .incomplete files - if any exist, download is still in progress
             blobs_dir = repo_cache / "blobs"
             if blobs_dir.exists() and any(blobs_dir.glob("*.incomplete")):
-                print(f"[_is_model_cached] Found .incomplete files for whisper-{model_size}, treating as not cached")
+                logger.debug(f"[_is_model_cached] Found .incomplete files for whisper-{model_size}, treating as not cached")
                 return False
             
             # Check that actual model weight files exist in snapshots
@@ -399,14 +434,14 @@ def _is_model_cached(self, model_size: str) -> bool:
                     any(snapshots_dir.rglob("*.bin"))
                 )
                 if not has_weights:
-                    print(f"[_is_model_cached] No model weights found for whisper-{model_size}, treating as not cached")
+                    logger.debug(f"[_is_model_cached] No model weights found for whisper-{model_size}, treating as not cached")
                     return False
             
             return True
         except Exception as e:
-            print(f"[_is_model_cached] Error checking cache for whisper-{model_size}: {e}")
+            logger.warning(f"[_is_model_cached] Error checking cache for whisper-{model_size}: {e}")
             return False
-    
+
     async def load_model_async(self, model_size: Optional[str] = None):
         """
         Lazy load the Whisper model.
@@ -414,53 +449,39 @@ async def load_model_async(self, model_size: Optional[str] = None):
         Args:
             model_size: Model size (tiny, base, small, medium, large)
         """
-        print(f"[DEBUG] load_model_async called with size: {model_size}")
         if model_size is None:
             model_size = self.model_size
 
-        print(f"[DEBUG] Model already loaded? {self.model is not None}, current size: {self.model_size}, requested: {model_size}")
         if self.model is not None and self.model_size == model_size:
-            print(f"[DEBUG] Early return - model already loaded")
+            self._idle_timer.touch()
             return
 
-        print(f"[DEBUG] Calling asyncio.to_thread for _load_model_sync")
+        # Check cache before entering thread pool
+        is_cached = self._is_model_cached(model_size)
+
         # Run blocking load in thread pool
-        await asyncio.to_thread(self._load_model_sync, model_size)
-        print(f"[DEBUG] asyncio.to_thread completed")
-    
+        await asyncio.to_thread(self._load_model_sync, model_size, is_cached)
+        self._idle_timer.touch()
+
     # Alias for compatibility
     load_model = load_model_async
-    
-    def _load_model_sync(self, model_size: str):
+
+    def _load_model_sync(self, model_size: str, is_cached: bool = False):
         """Synchronous model loading."""
-        print(f"[DEBUG] _load_model_sync called for Whisper {model_size}")
         try:
             progress_manager = get_progress_manager()
             task_manager = get_task_manager()
             progress_model_name = f"whisper-{model_size}"
 
-            # Check if model is already cached
-            is_cached = self._is_model_cached(model_size)
-
             # Set up progress callback and tracker
             # If cached: filter out non-download progress
             # If not cached: report all progress (we're actually downloading)
             progress_callback = create_hf_progress_callback(progress_model_name, progress_manager)
             tracker = HFProgressTracker(progress_callback, filter_non_downloads=is_cached)
 
-            # Patch tqdm BEFORE importing transformers
-            print("[DEBUG] Starting tqdm patch BEFORE transformers import")
-            tracker_context = tracker.patch_download()
-            tracker_context.__enter__()
-            print("[DEBUG] tqdm patched, now importing transformers")
-
-            # Import transformers
-            from transformers import WhisperProcessor, WhisperForConditionalGeneration
-
             model_name = f"openai/whisper-{model_size}"
-            print(f"[DEBUG] Model name: {model_name}")
 
-            print(f"Loading Whisper model {model_size} on {self.device}...")
+            logger.info(f"Loading Whisper model {model_size} on {self.device}...")
 
             # Only track download progress if model is NOT cached
             if not is_cached:
@@ -476,13 +497,12 @@ def _load_model_sync(self, model_size: str):
                     status="downloading",
                 )
 
-            # Load models (tqdm is patched, but filters out non-download progress)
-            try:
+            # Patch tqdm and use HF offline mode for cached models
+            with tracker.patch_download(), hf_offline_for_cached(is_cached):
+                from transformers import WhisperProcessor, WhisperForConditionalGeneration
+
                 self.processor = WhisperProcessor.from_pretrained(model_name)
                 self.model = WhisperForConditionalGeneration.from_pretrained(model_name)
-            finally:
-                # Exit the patch context
-                tracker_context.__exit__(None, None, None)
             
             # Only mark download as complete if we were tracking it
             if not is_cached:
@@ -492,10 +512,10 @@ def _load_model_sync(self, model_size: str):
             self.model.to(self.device)
             self.model_size = model_size
             
-            print(f"Whisper model {model_size} loaded successfully")
+            logger.info(f"Whisper model {model_size} loaded successfully")
             
         except Exception as e:
-            print(f"Error loading Whisper model: {e}")
+            logger.error(f"Error loading Whisper model: {e}")
             progress_manager = get_progress_manager()
             task_manager = get_task_manager()
             progress_model_name = f"whisper-{model_size}"
@@ -505,16 +525,17 @@ def _load_model_sync(self, model_size: str):
     
     def unload_model(self):
         """Unload the model to free memory."""
+        self._idle_timer.cancel()
         if self.model is not None:
             del self.model
             del self.processor
             self.model = None
             self.processor = None
-            
+
             if torch.cuda.is_available():
                 torch.cuda.empty_cache()
-            
-            print("Whisper model unloaded")
+
+            logger.info("Whisper model unloaded")
     
     async def transcribe(
         self,
diff --git a/backend/cli.py b/backend/cli.py
new file mode 100644
index 0000000..f80040f
--- /dev/null
+++ b/backend/cli.py
@@ -0,0 +1,381 @@
+#!/usr/bin/env python3
+"""CLI for voicebox — headless TTS generation and voice management."""
+
+import argparse
+import shutil
+import subprocess
+import sys
+import time
+from pathlib import Path
+
+import requests
+
+DEFAULT_URL = "http://127.0.0.1:17493"  # NOTE(review): presumably the default server base URL for the CLI — confirm
+SERVER_BIN = "/Applications/Voicebox.app/Contents/MacOS/voicebox-server"  # server binary cmd_server tries first
+DEFAULT_DATA_DIR = Path.home() / "Library/Application Support/sh.voicebox.app"  # fallback when args.data_dir is unset
+
+
+# --- API helpers ---
+
+def api(method, base_url, path, **kwargs):
+    """Call requests.<method>(base_url + path); on request failure or HTTP >= 400, report on stderr and exit 1."""
+    kwargs.setdefault("timeout", 30)
+    try:
+        resp = getattr(requests, method)(f"{base_url}{path}", **kwargs)
+    except requests.ConnectionError:
+        sys.exit(f"Error: cannot connect to server at {base_url}\nStart it with: voicebox server")
+    except requests.RequestException as exc:
+        # Read timeouts and similar are not ConnectionError; report them instead of a traceback.
+        sys.exit(f"Error: request to {base_url}{path} failed: {exc}")
+    if resp.status_code >= 400:
+        try:
+            detail = resp.json().get("detail", resp.text[:200])
+        except Exception:
+            detail = resp.text[:200]
+        sys.exit(f"Error: {resp.status_code} — {detail}")
+    return resp
+
+
+# --- Subcommands ---
+
+PID_FILE = Path.home() / ".voicebox.pid"  # pid of the detached server; written by cmd_server, removed by _stop_server
+LOG_FILE = Path.home() / ".voicebox.log"  # receives stdout/stderr of the detached server
+
+
+def cmd_server(args):
+    """Start the backend server (no frontend), or stop a detached one with --stop.
+
+    Reads args.data_dir, args.port, args.stop and args.detach. With --detach the
+    server is spawned in its own session, its pid is written to PID_FILE, its
+    output goes to LOG_FILE, and /health is polled until it answers 200; exits
+    with status 1 if the server dies or never becomes ready. Without --detach
+    the server runs in the foreground until it exits or Ctrl+C is pressed.
+    """
+    data_dir = args.data_dir or str(DEFAULT_DATA_DIR)
+    port = str(args.port)
+
+    if args.stop:
+        _stop_server()
+        return
+
+    # Prefer the installed app binary, then one on PATH, else the Python module.
+    bin_path = SERVER_BIN if Path(SERVER_BIN).exists() else shutil.which("voicebox-server")
+    cmd = ([bin_path, "--data-dir", data_dir, "--port", port] if bin_path
+           else [sys.executable, "-m", "backend.main", "--port", port])
+    # `-m backend.main` is launched from the directory two levels above this file.
+    cwd = None if bin_path else Path(__file__).resolve().parent.parent
+
+    if args.detach:
+        if PID_FILE.exists():
+            try:
+                import os
+                pid = int(PID_FILE.read_text().strip())
+                os.kill(pid, 0)  # signal 0 only checks that the process exists
+                print(f"Server already running (pid {pid})")
+                return
+            except (ProcessLookupError, ValueError):
+                # Stale or unparsable pid file: discard it and start a new server.
+                PID_FILE.unlink(missing_ok=True)
+
+        # The child gets its own copy of the descriptor, so the parent can close the log.
+        with open(LOG_FILE, "w") as log:
+            proc = subprocess.Popen(
+                cmd, stdout=log, stderr=log, start_new_session=True, cwd=cwd,
+            )
+        PID_FILE.write_text(str(proc.pid))
+        print(f"Waiting for server (pid {proc.pid}, port {port})...", end="", flush=True)
+        url = f"http://127.0.0.1:{port}/health"
+        for _ in range(60):
+            time.sleep(0.5)
+            if proc.poll() is not None:
+                print(" failed.")
+                print(f"Server exited. Check log: {LOG_FILE}", file=sys.stderr)
+                PID_FILE.unlink(missing_ok=True)
+                sys.exit(1)
+            try:
+                r = requests.get(url, timeout=2)
+                if r.status_code == 200:
+                    print(" ready.")
+                    print(f"Stop with: voicebox server --stop")
+                    return
+            except requests.RequestException:
+                # Not accepting connections yet, or too slow to answer: keep polling.
+                print(".", end="", flush=True)
+        # The process is still alive here; keep the pid file so `--stop` can reach it.
+        print(" timed out.")
+        print(f"Server didn't respond within 30s. Check log: {LOG_FILE}", file=sys.stderr)
+        sys.exit(1)
+    else:
+        print(f"Starting voicebox server on port {port}...")
+        print(f"Data dir: {data_dir}")
+        print(f"Press Ctrl+C to stop.\n")
+        try:
+            subprocess.run(cmd, cwd=cwd)
+        except KeyboardInterrupt:
+            print("\nServer stopped.")
+
+
+def _stop_server():
+    """Send SIGTERM to the detached server recorded in PID_FILE, then remove the pid file."""
+    import os, signal
+    if not PID_FILE.exists():
+        print("No server running (no pid file).")
+        return
+    raw_pid = PID_FILE.read_text().strip()
+    try:
+        os.kill(int(raw_pid), signal.SIGTERM)
+        print(f"Stopped server (pid {raw_pid})")
+    except (ProcessLookupError, ValueError):  # process already gone, or pid file unparsable
+        print(f"Server not running (stale pid {raw_pid})")
+    PID_FILE.unlink(missing_ok=True)
+
+
+def cmd_voices(args):
+    """Print a Name / Language / ID table of the profiles returned by GET /profiles."""
+    profiles = api("get", args.url, "/profiles").json()
+    if not profiles:
+        print("No voice profiles found. Import one with: voicebox import <file.zip>")
+        return
+    row = "{:<30} {:<10} {}"  # shared layout for the header and every profile line
+    print(row.format("Name", "Language", "ID"))
+    print("-" * 75)
+    for entry in profiles:
+        print(row.format(entry["name"], entry["language"], entry["id"]))
+
+
+def cmd_import(args):
+    """Upload the ZIP at args.file to POST /profiles/import and print the imported profile."""
+    zip_path = Path(args.file)
+    if not zip_path.is_file():  # also rejects directories, which open() cannot read
+        print(f"Error: file not found: {zip_path}", file=sys.stderr)
+        sys.exit(1)
+
+    print(f"Importing {zip_path.name}...")
+    with zip_path.open("rb") as f:
+        resp = api("post", args.url, "/profiles/import",
+                    files={"file": (zip_path.name, f, "application/zip")},
+                    timeout=60)  # overrides api()'s 30s default for the upload
+    profile = resp.json()
+    print(f"Imported: {profile['name']} ({profile['id']})")
+
+
+def cmd_generate(args):
+    """Generate speech from text, download the audio, and optionally open it.
+
+    Text comes from args.text, args.file, or piped stdin (checked in that order).
+    Progress is streamed from the server; the WAV is then downloaded and converted
+    to .m4a when ffmpeg is available. Exits with status 1 on input or server errors.
+    """
+    import json as json_lib
+
+    # Resolve text input
+    if args.text:
+        text = args.text
+    elif args.file:
+        text = Path(args.file).read_text().strip()
+    elif not sys.stdin.isatty():
+        text = sys.stdin.read().strip()
+    else:
+        print("Error: provide text via --text, --file, or stdin.", file=sys.stderr)
+        sys.exit(1)
+
+    if not text:
+        print("Error: text is empty.", file=sys.stderr)
+        sys.exit(1)
+
+    profile = resolve_profile(args.url, args.voice)
+    payload = {
+        "profile_id": profile["id"],
+        "text": text,
+        "language": args.language or profile.get("language", "en"),
+    }
+    if args.seed is not None:
+        payload["seed"] = args.seed
+    if args.instruct:
+        payload["instruct"] = args.instruct
+
+    print(f"Generating with voice '{profile['name']}'...")
+    start = time.time()
+
+    # Use streaming mode — start generation asynchronously
+    resp = api("post", args.url, "/generate?stream=true", json=payload, timeout=10)
+    generation_id = resp.json()["generation_id"]
+
+    # Stream progress via SSE through api() so connection/HTTP errors exit cleanly.
+    final_data = None
+    bar_len = 20
+    with api("get", args.url, f"/generate/progress/{generation_id}",
+             stream=True, timeout=300) as progress_resp:
+        for line in progress_resp.iter_lines():
+            if not line:
+                continue
+            line = line.decode("utf-8") if isinstance(line, bytes) else line
+            if not line.startswith("data: "):
+                continue
+            data = json_lib.loads(line[6:])
+            pct = data.get("progress") or 0  # tolerate a missing or null progress value
+            filled = int(pct / 100 * bar_len)
+            bar = "#" * filled + "-" * (bar_len - filled)
+            print(f"\r[{bar}] {pct:.0f}%", end="", flush=True)
+
+            if data.get("status", "") in ("complete", "error"):
+                print()  # newline after progress bar
+                final_data = data
+                break
+
+    elapsed = time.time() - start
+
+    if final_data is None:
+        # The stream closed without a terminal event, so the result fetched below may be stale.
+        print("\nWarning: progress stream ended without a final status.", file=sys.stderr)
+    elif final_data.get("status") == "error":
+        error_msg = final_data.get("error", "Unknown error")
+        print(f"Generation failed: {error_msg}", file=sys.stderr)
+        sys.exit(1)
+
+    # Fetch latest history entry to get the generation result.
+    # NOTE(review): assumes the newest history item belongs to this generation — confirm.
+    history_resp = api("get", args.url, "/history", params={"limit": 1}, timeout=10)
+    history_data = history_resp.json()
+    if history_data["items"]:
+        result = history_data["items"][0]
+        print(f"Done in {elapsed:.1f}s (audio duration: {result['duration']:.1f}s)")
+    else:
+        print(f"Done in {elapsed:.1f}s")
+        result = {"id": generation_id, "duration": 0}
+
+    # Download wav then convert to m4a
+    tag = str(int(time.time()))[-5:]
+    wav_path = f"output_{tag}.wav"
+    dl = api("get", args.url, f"/audio/{result['id']}", timeout=60)
+    Path(wav_path).write_bytes(dl.content)
+
+    if not shutil.which("ffmpeg"):
+        print("Warning: ffmpeg not found, keeping .wav", file=sys.stderr)
+        output = wav_path
+    else:
+        output = args.output or f"output_{tag}.m4a"
+        r = subprocess.run(
+            ["ffmpeg", "-y", "-i", wav_path, "-c:a", "aac", "-b:a", "128k", output],
+            capture_output=True,
+        )
+        if r.returncode != 0:
+            print(f"Warning: ffmpeg failed, keeping .wav", file=sys.stderr)
+            output = wav_path
+        else:
+            Path(wav_path).unlink()
+            print(f"Saved: {output}")
+
+    # Prefer `open`, fall back to xdg-open, and skip quietly if neither is on PATH.
+    opener = None if args.no_open else shutil.which("open") or shutil.which("xdg-open")
+    if opener:
+        subprocess.run([opener, output])
+
+
+def cmd_health(args):
+    """Query the server's /health endpoint and print a short status summary."""
+    h = api("get", args.url, "/health").json()
+    print(f"Status:       {h['status']}")
+    print(f"Model loaded: {h['model_loaded']}")
+    print(f"Backend:      {h.get('backend_type', '?')}")
+    print(f"GPU:          {h.get('gpu_type', 'none')}")
+    vram_mb = h.get('vram_used_mb')
+    if vram_mb:  # line is omitted when the field is missing, null, or zero
+        print(f"VRAM used:    {vram_mb:.0f} MB")
+
+
+# --- Profile resolution (shared) ---
+
+def resolve_profile(base_url, voice_name):
+    """Return the profile dict for voice_name (exact, then substring match), or prompt for one."""
+    profiles = api("get", base_url, "/profiles").json()
+    if not profiles:
+        print("Error: no voice profiles found.", file=sys.stderr)
+        print("Import one with: voicebox import <file.zip>", file=sys.stderr)
+        sys.exit(1)
+
+    if voice_name:
+        match = [p for p in profiles if p["name"].lower() == voice_name.lower()]
+        if not match:
+            match = [p for p in profiles if voice_name.lower() in p["name"].lower()]
+        if not match:
+            print(f"Error: no voice matching '{voice_name}'. Available:", file=sys.stderr)
+            for p in profiles:
+                print(f"  - {p['name']}", file=sys.stderr)
+            sys.exit(1)
+        if len(match) > 1:
+            print(f"Multiple voices match '{voice_name}':", file=sys.stderr)
+            for p in match:
+                print(f"  - {p['name']}", file=sys.stderr)
+            sys.exit(1)
+        return match[0]
+    print("Available voices:")  # no name given: fall back to the interactive picker
+    for i, p in enumerate(profiles, 1):
+        print(f"  {i}. {p['name']} ({p['language']})")
+    print()
+    while True:
+        try:
+            idx = int(input(f"Choose a voice [1-{len(profiles)}]: ").strip()) - 1
+        except ValueError:
+            idx = -1  # not a number: handled as an out-of-range choice below
+        except EOFError:  # stdin is closed or was already consumed as the text input
+            print("\nError: no voice selected; pass --voice when stdin is not interactive.", file=sys.stderr)
+            sys.exit(1)
+        if 0 <= idx < len(profiles):
+            return profiles[idx]
+        print("Invalid choice, try again.")
+
+
+# --- Main ---
+
+def main():
+    """Parse the command line and dispatch to the matching cmd_* handler."""
+    parser = argparse.ArgumentParser(prog="voicebox", description="Voicebox CLI — headless TTS")
+    parser.add_argument("--url", default=DEFAULT_URL, help=f"Server URL (default: {DEFAULT_URL})")
+    sub = parser.add_subparsers(dest="command")
+
+    # server
+    server_cmd = sub.add_parser("server", help="Start the backend server")
+    server_cmd.add_argument("--port", type=int, default=17493, help="Port (default: 17493)")
+    server_cmd.add_argument("--data-dir", help="Data directory")
+    server_cmd.add_argument("-d", "--detach", action="store_true", help="Run in background (daemon)")
+    server_cmd.add_argument("--stop", action="store_true", help="Stop a detached server")
+
+    # voices
+    sub.add_parser("voices", help="List voice profiles")
+
+    # import
+    import_cmd = sub.add_parser("import", help="Import a voice profile from ZIP")
+    import_cmd.add_argument("file", help="Path to .zip file")
+
+    # generate
+    gen_cmd = sub.add_parser("generate", aliases=["gen", "say"], help="Generate speech")
+    gen_cmd.add_argument("--voice", "-v", help="Voice name (interactive picker if omitted)")
+    gen_cmd.add_argument("--text", "-t", help="Text to speak")
+    gen_cmd.add_argument("--file", "-f", help="Read text from a file")
+    gen_cmd.add_argument("--output", "-o", help="Output path (default: output_<epoch>.m4a)")
+    gen_cmd.add_argument("--language", "-l", help="Language code")
+    gen_cmd.add_argument("--seed", "-s", type=int, help="Random seed")
+    gen_cmd.add_argument("--instruct", help="Style instruction (e.g. 'speak slowly')")
+    gen_cmd.add_argument("--no-open", action="store_true", help="Don't open file after generating")
+
+    # health
+    sub.add_parser("health", help="Check server status")
+
+    args = parser.parse_args()
+    handlers = {
+        "server": cmd_server,
+        "voices": cmd_voices,
+        "import": cmd_import,
+        "generate": cmd_generate, "gen": cmd_generate, "say": cmd_generate,  # aliases share a handler
+        "health": cmd_health,
+    }
+    handler = handlers.get(args.command)
+    if handler is None:
+        parser.print_help()
+    else:
+        handler(args)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/backend/config.py b/backend/config.py
index a471820..ed6ccec 100644
--- a/backend/config.py
+++ b/backend/config.py
@@ -4,8 +4,11 @@
 Handles data directory configuration for production bundling.
 """
 
+import logging
 from pathlib import Path
 
+logger = logging.getLogger(__name__)
+
 # Default data directory (used in development)
 _data_dir = Path("data")
 
@@ -19,7 +22,7 @@ def set_data_dir(path: str | Path):
     global _data_dir
     _data_dir = Path(path)
     _data_dir.mkdir(parents=True, exist_ok=True)
-    print(f"Data directory set to: {_data_dir.absolute()}")
+    logger.info(f"Data directory set to: {_data_dir.absolute()}")
 
 def get_data_dir() -> Path:
     """
diff --git a/backend/database.py b/backend/database.py
index 3b9c51e..bcdbc8d 100644
--- a/backend/database.py
+++ b/backend/database.py
@@ -2,6 +2,7 @@
 SQLite database ORM using SQLAlchemy.
 """
 
+import logging
 from sqlalchemy import create_engine, Column, String, Integer, Float, DateTime, Text, ForeignKey, Boolean
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import sessionmaker, Session
@@ -11,6 +12,8 @@
 
 from . import config
 
+logger = logging.getLogger(__name__)
+
 Base = declarative_base()
 
 
@@ -27,6 +30,30 @@ class VoiceProfile(Base):
     updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
 
 
+class GenerationJob(Base):
+    """Persistent job queue row for one TTS generation request (table "generation_jobs")."""
+    __tablename__ = "generation_jobs"
+
+    id = Column(String, primary_key=True)  # UUID set by caller
+    profile_id = Column(String, ForeignKey("profiles.id"), nullable=False)  # voice profile to generate with
+    text = Column(Text, nullable=False)  # text to speak
+    language = Column(String, default="en")
+    seed = Column(Integer, nullable=True)
+    model_size = Column(String, default="1.7B")
+    instruct = Column(Text, nullable=True)  # optional style instruction
+    status = Column(String, default="queued")  # queued | generating | cancelling | complete | cancelled | error | timeout | deleted
+    progress = Column(Float, default=0.0)  # NOTE(review): presumably a 0-100 percentage — confirm against the worker
+    error = Column(Text, nullable=True)  # failure reason, e.g. "Queue timeout"
+    generation_id = Column(String, nullable=True)  # links to generations.id on complete
+    request_user_id = Column(String, nullable=True)  # caller-supplied user id; keys the per-user active-job cap
+    request_user_first_name = Column(String, nullable=True)
+    request_ip = Column(String, nullable=True)  # client IP; keys the active-job cap when no user id is given
+    backend_type = Column(String, nullable=True)
+    created_at = Column(DateTime, default=datetime.utcnow)  # naive UTC; queue timeout is measured from here
+    started_at = Column(DateTime, nullable=True)
+    completed_at = Column(DateTime, nullable=True)  # stamped when a queued job is expired as "timeout"
+
+
 class ProfileSample(Base):
     """Voice profile sample database model."""
     __tablename__ = "profile_samples"
@@ -47,9 +74,16 @@ class Generation(Base):
     language = Column(String, default="en")
     audio_path = Column(String, nullable=False)
     duration = Column(Float, nullable=False)
+    generation_time_seconds = Column(Float, nullable=True)
     seed = Column(Integer)
     instruct = Column(Text)
+    model_size = Column(String, nullable=True)
+    backend_type = Column(String, nullable=True)
+    request_user_id = Column(String, nullable=True)
+    request_user_first_name = Column(String, nullable=True)
+    request_ip = Column(String, nullable=True)
     created_at = Column(DateTime, default=datetime.utcnow)
+    deleted_at = Column(DateTime, nullable=True)
 
 
 class Story(Base):
@@ -182,7 +216,7 @@ def _run_migrations(engine):
     # Migration: Remove position column and ensure start_time_ms exists
     # SQLite doesn't support DROP COLUMN easily, so we recreate the table
     if 'position' in columns:
-        print("Migrating story_items: removing position column, using start_time_ms")
+        logger.info("Migrating story_items: removing position column, using start_time_ms")
         
         with engine.connect() as conn:
             # Check if start_time_ms already exists
@@ -248,45 +282,82 @@ def _run_migrations(engine):
             conn.execute(text("ALTER TABLE story_items_new RENAME TO story_items"))
             
             conn.commit()
-            print("Migrated story_items table to use start_time_ms (removed position column)")
+            logger.info("Migrated story_items table to use start_time_ms (removed position column)")
     
     # Migration: Add track column if it doesn't exist
     # Re-check columns after potential position migration
     columns = {col['name'] for col in inspector.get_columns('story_items')}
     if 'track' not in columns:
-        print("Migrating story_items: adding track column")
+        logger.info("Migrating story_items: adding track column")
         with engine.connect() as conn:
             conn.execute(text("ALTER TABLE story_items ADD COLUMN track INTEGER NOT NULL DEFAULT 0"))
             conn.commit()
-            print("Added track column to story_items")
+            logger.info("Added track column to story_items")
     
     # Migration: Add trim columns if they don't exist
     # Re-check columns after potential track migration
     columns = {col['name'] for col in inspector.get_columns('story_items')}
     if 'trim_start_ms' not in columns:
-        print("Migrating story_items: adding trim_start_ms column")
+        logger.info("Migrating story_items: adding trim_start_ms column")
         with engine.connect() as conn:
             conn.execute(text("ALTER TABLE story_items ADD COLUMN trim_start_ms INTEGER NOT NULL DEFAULT 0"))
             conn.commit()
-            print("Added trim_start_ms column to story_items")
+            logger.info("Added trim_start_ms column to story_items")
     
     columns = {col['name'] for col in inspector.get_columns('story_items')}
     if 'trim_end_ms' not in columns:
-        print("Migrating story_items: adding trim_end_ms column")
+        logger.info("Migrating story_items: adding trim_end_ms column")
         with engine.connect() as conn:
             conn.execute(text("ALTER TABLE story_items ADD COLUMN trim_end_ms INTEGER NOT NULL DEFAULT 0"))
             conn.commit()
-            print("Added trim_end_ms column to story_items")
+            logger.info("Added trim_end_ms column to story_items")
 
     # Migration: Add avatar_path to profiles table
     if 'profiles' in inspector.get_table_names():
         columns = {col['name'] for col in inspector.get_columns('profiles')}
         if 'avatar_path' not in columns:
-            print("Migrating profiles: adding avatar_path column")
+            logger.info("Migrating profiles: adding avatar_path column")
             with engine.connect() as conn:
                 conn.execute(text("ALTER TABLE profiles ADD COLUMN avatar_path VARCHAR"))
                 conn.commit()
-                print("Added avatar_path column to profiles")
+                logger.info("Added avatar_path column to profiles")
+
+    # Migration: add queue metadata columns
+    if 'generation_jobs' in inspector.get_table_names():
+        columns = {col['name'] for col in inspector.get_columns('generation_jobs')}
+        queue_additions = [
+            ('request_user_id', 'VARCHAR'),
+            ('request_user_first_name', 'VARCHAR'),
+            ('request_ip', 'VARCHAR'),
+            ('backend_type', 'VARCHAR'),
+        ]
+        for column_name, column_type in queue_additions:
+            if column_name not in columns:
+                logger.info(f"Migrating generation_jobs: adding {column_name} column")
+                with engine.connect() as conn:
+                    conn.execute(text(f"ALTER TABLE generation_jobs ADD COLUMN {column_name} {column_type}"))
+                    conn.commit()
+                logger.info(f"Added {column_name} column to generation_jobs")
+
+    # Migration: add generation metadata columns
+    if 'generations' in inspector.get_table_names():
+        columns = {col['name'] for col in inspector.get_columns('generations')}
+        generation_additions = [
+            ('generation_time_seconds', 'FLOAT'),
+            ('model_size', 'VARCHAR'),
+            ('backend_type', 'VARCHAR'),
+            ('request_user_id', 'VARCHAR'),
+            ('request_user_first_name', 'VARCHAR'),
+            ('request_ip', 'VARCHAR'),
+            ('deleted_at', 'DATETIME'),
+        ]
+        for column_name, column_type in generation_additions:
+            if column_name not in columns:
+                logger.info(f"Migrating generations: adding {column_name} column")
+                with engine.connect() as conn:
+                    conn.execute(text(f"ALTER TABLE generations ADD COLUMN {column_name} {column_type}"))
+                    conn.commit()
+                logger.info(f"Added {column_name} column to generations")
 
 
 def get_db():
diff --git a/backend/docker-compose.yml b/backend/docker-compose.yml
new file mode 100644
index 0000000..ef25b2d
--- /dev/null
+++ b/backend/docker-compose.yml
@@ -0,0 +1,36 @@
+services:
+  voicebox:
+    build:
+      context: .
+      args:
+        CUDA: "1"
+    ports:
+      - "17493:17493"
+    volumes:
+      - voicebox-data:/data
+    restart: unless-stopped
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
+    environment:
+      - NVIDIA_VISIBLE_DEVICES=all
+      - HF_HOME=/data/huggingface
+
+  # CPU-only variant (comment out the above, uncomment this)
+  # voicebox:
+  #   build:
+  #     context: .
+  #     args:
+  #       CUDA: "0"
+  #   ports:
+  #     - "17493:17493"
+  #   volumes:
+  #     - voicebox-data:/data
+  #   restart: unless-stopped
+
+volumes:
+  voicebox-data:
diff --git a/backend/history.py b/backend/history.py
index 64834d3..109ea99 100644
--- a/backend/history.py
+++ b/backend/history.py
@@ -29,6 +29,12 @@ async def create_generation(
     seed: Optional[int],
     db: Session,
     instruct: Optional[str] = None,
+    model_size: Optional[str] = None,
+    backend_type: Optional[str] = None,
+    request_user_id: Optional[str] = None,
+    request_user_first_name: Optional[str] = None,
+    request_ip: Optional[str] = None,
+    generation_time_seconds: Optional[float] = None,
 ) -> GenerationResponse:
     """
     Create a new generation history entry.
@@ -53,8 +59,14 @@ async def create_generation(
         language=language,
         audio_path=audio_path,
         duration=duration,
+        generation_time_seconds=generation_time_seconds,
         seed=seed,
         instruct=instruct,
+        model_size=model_size,
+        backend_type=backend_type,
+        request_user_id=request_user_id,
+        request_user_first_name=request_user_first_name,
+        request_ip=request_ip,
         created_at=datetime.utcnow(),
     )
 
@@ -82,6 +94,8 @@ async def get_generation(
     generation = db.query(DBGeneration).filter_by(id=generation_id).first()
     if not generation:
         return None
+    if generation.deleted_at is not None:
+        return None
     
     return GenerationResponse.model_validate(generation)
 
@@ -107,6 +121,8 @@ async def list_generations(
     ).join(
         DBVoiceProfile,
         DBGeneration.profile_id == DBVoiceProfile.id
+    ).filter(
+        DBGeneration.deleted_at.is_(None)
     )
     
     # Apply profile filter
@@ -141,8 +157,14 @@ async def list_generations(
             language=generation.language,
             audio_path=generation.audio_path,
             duration=generation.duration,
+            generation_time_seconds=generation.generation_time_seconds,
             seed=generation.seed,
             instruct=generation.instruct,
+            model_size=generation.model_size,
+            backend_type=generation.backend_type,
+            request_user_id=generation.request_user_id,
+            request_user_first_name=generation.request_user_first_name,
+            request_ip=generation.request_ip,
             created_at=generation.created_at,
         ))
     
@@ -169,14 +191,16 @@ async def delete_generation(
     generation = db.query(DBGeneration).filter_by(id=generation_id).first()
     if not generation:
         return False
+    if generation.deleted_at is not None:
+        return False
     
     # Delete audio file
     audio_path = Path(generation.audio_path)
     if audio_path.exists():
         audio_path.unlink()
     
-    # Delete from database
-    db.delete(generation)
+    # Soft-delete in database (keep record for auditing/restore workflows).
+    generation.deleted_at = datetime.utcnow()
     db.commit()
     
     return True
diff --git a/backend/main.py b/backend/main.py
index 59fb9e1..571b117 100644
--- a/backend/main.py
+++ b/backend/main.py
@@ -4,10 +4,12 @@
 Handles voice cloning, generation history, and server mode.
 """
 
-from fastapi import FastAPI, Depends, UploadFile, File, Form, HTTPException
+import logging
+
+from contextlib import asynccontextmanager
+from fastapi import FastAPI, Depends, UploadFile, File, Form, HTTPException, Request, Query
 from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import FileResponse, StreamingResponse
-from fastapi.staticfiles import StaticFiles
+from fastapi.responses import FileResponse, StreamingResponse, JSONResponse
 from sqlalchemy.orm import Session
 from typing import List, Optional
 from datetime import datetime
@@ -19,33 +21,118 @@
 import io
 from pathlib import Path
 import uuid
-import asyncio
 import signal
 import os
 
 from . import database, models, profiles, history, tts, transcribe, config, export_import, channels, stories, __version__
-from .database import get_db, Generation as DBGeneration, VoiceProfile as DBVoiceProfile
+from .database import get_db, Generation as DBGeneration, GenerationJob as DBGenerationJob, VoiceProfile as DBVoiceProfile
 from .utils.progress import get_progress_manager
 from .utils.tasks import get_task_manager
 from .utils.cache import clear_voice_prompt_cache
 from .platform_detect import get_backend_type
 
+logger = logging.getLogger(__name__)
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """Run _startup() before the app begins serving and _shutdown() once it stops."""
+    await _startup()
+    yield  # the application serves requests while suspended here
+    await _shutdown()  # NOTE(review): skipped if an exception is thrown in at the yield — consider try/finally
+
+
 app = FastAPI(
     title="voicebox API",
     description="Production-quality Qwen3-TTS voice cloning API",
     version=__version__,
+    lifespan=lifespan,
 )
 
-# CORS middleware
+# Lock to serialize TTS model access — MLX models are NOT thread-safe.
+# Concurrent generate() calls on the same model instance corrupt state and crash.
+_model_lock = asyncio.Lock()  # NOTE(review): this diff removes `import asyncio` above — confirm it is still imported
+
+# Event to wake the job worker when a new job is queued
+_job_signal = asyncio.Event()
+_cancel_requested_jobs: set[str] = set()  # NOTE(review): presumably ids of jobs with a pending cancel — confirm
+_MAX_ACTIVE_JOBS_PER_USER = 3  # cap on queued/generating/cancelling jobs per user id (or per IP); excess gets HTTP 429
+_QUEUED_JOB_TIMEOUT_MINUTES = 15  # queued jobs older than this are marked "timeout"
+_GENERATING_JOB_TIMEOUT_MINUTES = 5  # NOTE(review): presumably the limit for a running job — confirm in the worker
+
+
+def _expire_old_queued_jobs(db: Session):
+    """Mark jobs queued for more than _QUEUED_JOB_TIMEOUT_MINUTES as "timeout" and commit."""
+    from datetime import timedelta
+
+    now = datetime.utcnow()  # one timestamp for the whole batch
+    stale_queued = db.query(DBGenerationJob).filter(
+        DBGenerationJob.status == "queued",
+        DBGenerationJob.created_at < now - timedelta(minutes=_QUEUED_JOB_TIMEOUT_MINUTES),
+    ).all()
+    for job in stale_queued:
+        job.status = "timeout"
+        job.error = "Queue timeout"
+        job.completed_at = now
+        try:
+            get_progress_manager().mark_error(job.id, "Queue timeout")
+        except Exception:  # best-effort: a failed progress notification must not block expiry
+            logger.debug("Could not publish queue timeout for job %s", job.id, exc_info=True)
+    if stale_queued:
+        db.commit()
+
+
+def _extract_request_ip(request: Request) -> str:
+    """Best-effort client IP: first X-Forwarded-For hop, then X-Real-IP, then the socket peer."""
+    forwarded_for = (request.headers.get("x-forwarded-for") or "").split(",")[0].strip()  # NOTE(review): client-supplied unless a trusted proxy overwrites it
+    if forwarded_for:  # a blank first hop (" ", ", 10.0.0.1") must not be returned as the IP
+        return forwarded_for
+    real_ip = (request.headers.get("x-real-ip") or "").strip()
+    if real_ip:
+        return real_ip
+    if request.client and request.client.host:
+        return request.client.host
+    return "unknown"
+
+# CORS middleware — allow_credentials=False because we don't use cookies,
+# and allow_origins=["*"] is invalid with credentials per the CORS spec.
 app.add_middleware(
     CORSMiddleware,
-    allow_origins=["*"],  # Configure appropriately for production
-    allow_credentials=True,
+    allow_origins=["*"],
+    allow_credentials=False,
     allow_methods=["*"],
     allow_headers=["*"],
+    expose_headers=["X-Health-Model-Loaded", "X-Health-Model-Size", "X-Health-GPU-Type", "X-Health-Backend"],
 )
 
 
+@app.middleware("http")
+async def health_piggyback_middleware(request, call_next):
+    """Attach lightweight health info as X-Health-* headers on every response."""
+    response = await call_next(request)
+    try:
+        tts_model = tts.get_tts_model()
+        loaded = tts_model.is_loaded()
+        response.headers["X-Health-Model-Loaded"] = "1" if loaded else "0"
+        if loaded:
+            size = getattr(tts_model, '_current_model_size', None)
+            if size:
+                response.headers["X-Health-Model-Size"] = str(size)  # header values must be strings
+        backend_type = get_backend_type()
+        response.headers["X-Health-Backend"] = backend_type
+        gpu_type = None
+        if backend_type == "mlx":
+            gpu_type = "Metal"
+        elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+            gpu_type = "MPS"
+        elif torch.cuda.is_available():
+            gpu_type = "CUDA"
+        if gpu_type:
+            response.headers["X-Health-GPU-Type"] = gpu_type
+    except Exception:
+        logger.debug("Could not attach health headers", exc_info=True)  # best-effort: never fail the request
+    return response
+
+
 # ============================================
 # ROOT & HEALTH ENDPOINTS
 # ============================================
@@ -67,16 +154,25 @@ async def shutdown_async():
     return {"message": "Shutting down..."}
 
 
+@app.get("/auth/me")
+async def auth_me_stub():
+    """Stub for chickenbox OAuth — voicebox backend has no auth, so this always responds 401."""
+    raise HTTPException(status_code=401, detail="Authentication not available")
+
+
 @app.get("/health", response_model=models.HealthResponse)
 async def health():
     """Health check endpoint."""
-    from huggingface_hub import hf_hub_download, constants as hf_constants
+    from huggingface_hub import constants as hf_constants
     from pathlib import Path
-    import os
 
     tts_model = tts.get_tts_model()
     backend_type = get_backend_type()
 
+    # Touch TTS idle timer — health polls indicate an active user session
+    if tts_model.is_loaded():
+        tts_model._idle_timer.touch()
+
     # Check for GPU availability (CUDA or MPS)
     has_cuda = torch.cuda.is_available()
     has_mps = hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()
@@ -118,7 +214,7 @@ async def health():
         # Check if the default model (1.7B) is cached
         # Use different model IDs based on backend
         if backend_type == "mlx":
-            default_model_id = "mlx-community/Qwen3-TTS-12Hz-1.7B-Base-bf16"
+            default_model_id = "mlx-community/Qwen3-TTS-12Hz-1.7B-Base-4bit"
         else:
             default_model_id = "Qwen/Qwen3-TTS-12Hz-1.7B-Base"
         
@@ -520,87 +616,135 @@ async def set_profile_channels(
 # GENERATION ENDPOINTS
 # ============================================
 
-@app.post("/generate", response_model=models.GenerationResponse)
+@app.post("/generate")
 async def generate_speech(
     data: models.GenerationRequest,
+    request: Request,
+    stream: bool = False,
     db: Session = Depends(get_db),
 ):
-    """Generate speech from text using a voice profile."""
+    """Generate speech from text using a voice profile.
+
+    With stream=False (default): blocks and returns GenerationResponse.
+    With stream=True: returns 202 with job_id, progress via SSE.
+    """
+    job_id = str(uuid.uuid4())
+    request_ip = _extract_request_ip(request)
+
+    if stream:
+        _expire_old_queued_jobs(db)
+
+        # Enforce per-user queue cap (queued/generating/cancelling).
+        user_id = (data.request_user_id or "").strip() or None
+        active_statuses = ["queued", "generating", "cancelling"]
+        if user_id:
+            active_count = db.query(DBGenerationJob).filter(
+                DBGenerationJob.status.in_(active_statuses),
+                DBGenerationJob.request_user_id == user_id,
+            ).count()
+        else:
+            active_count = db.query(DBGenerationJob).filter(
+                DBGenerationJob.status.in_(active_statuses),
+                DBGenerationJob.request_ip == request_ip,
+            ).count()
+
+        if active_count >= _MAX_ACTIVE_JOBS_PER_USER:
+            raise HTTPException(
+                status_code=429,
+                detail=f"Queue limit reached ({_MAX_ACTIVE_JOBS_PER_USER} active jobs per user).",
+            )
+
+        # --- Async (streaming) mode — create a job row, worker picks it up ---
+        job = DBGenerationJob(
+            id=job_id,
+            profile_id=data.profile_id,
+            text=data.text,
+            language=data.language,
+            seed=data.seed,
+            model_size=data.model_size or "1.7B",
+            instruct=data.instruct,
+            request_user_id=data.request_user_id,
+            request_user_first_name=data.request_user_first_name,
+            request_ip=request_ip,
+            status="queued",
+        )
+        db.add(job)
+        db.commit()
+
+        progress_manager = get_progress_manager()
+        # Initialize progress state so SSE endpoint has data immediately
+        progress_manager.update_progress(
+            model_name=job_id,
+            current=0,
+            total=100,
+            status="queued",
+        )
+
+        # Wake the worker
+        _job_signal.set()
+
+        logger.info(f"[TTS] Job {job_id} queued for profile {data.profile_id} from {request_ip}")
+        return JSONResponse(
+            status_code=202,
+            content=models.GenerationStartResponse(
+                generation_id=job_id,
+                status="queued",
+            ).model_dump(),
+        )
+
+    # --- Synchronous (blocking) mode — existing behavior for CLI/tests ---
+
+    # Check DB for any actively generating job
+    active = db.query(DBGenerationJob).filter(
+        DBGenerationJob.status == "generating"
+    ).first()
+    if active or _model_lock.locked():
+        raise HTTPException(
+            status_code=409,
+            detail="A generation is already in progress. Please wait for it to finish.",
+        )
+
     task_manager = get_task_manager()
-    generation_id = str(uuid.uuid4())
-    
     try:
-        # Start tracking generation
         task_manager.start_generation(
-            task_id=generation_id,
+            task_id=job_id,
             profile_id=data.profile_id,
             text=data.text,
         )
-        
-        # Get profile
+
         profile = await profiles.get_profile(data.profile_id, db)
         if not profile:
             raise HTTPException(status_code=404, detail="Profile not found")
-        
-        # Create voice prompt from profile
+
         voice_prompt = await profiles.create_voice_prompt_for_profile(
-            data.profile_id,
-            db,
+            data.profile_id, db,
         )
-        
-        # Generate audio
-        tts_model = tts.get_tts_model()
-        # Load the requested model size if different from current (async to not block)
-        model_size = data.model_size or "1.7B"
-
-        # Check if model needs to be downloaded first
-        model_path = tts_model._get_model_path(model_size)
-        if model_path.startswith("Qwen/"):
-            # Model not cached - check if it exists remotely or needs download
-            from huggingface_hub import constants as hf_constants
-            repo_cache = Path(hf_constants.HF_HUB_CACHE) / ("models--" + model_path.replace("/", "--"))
-            if not repo_cache.exists():
-                # Start download in background
-                model_name = f"qwen-tts-{model_size}"
-
-                async def download_model_background():
-                    try:
-                        await tts_model.load_model_async(model_size)
-                    except Exception as e:
-                        task_manager.error_download(model_name, str(e))
 
-                task_manager.start_download(model_name)
-                asyncio.create_task(download_model_background())
+        generation_started_at = datetime.utcnow()
+        async with _model_lock:
+            tts_model = tts.get_tts_model()
+            model_size = data.model_size or "1.7B"
 
-                # Return 202 Accepted with download info
+            # Don't silently download — require the model to be cached first
+            if not tts_model._is_model_cached(model_size):
+                model_name = f"qwen-tts-{model_size}"
                 raise HTTPException(
-                    status_code=202,
-                    detail={
-                        "message": f"Model {model_size} is being downloaded. Please wait and try again.",
-                        "model_name": model_name,
-                        "downloading": True
-                    }
+                    status_code=400,
+                    detail=f"Model {model_name} is not downloaded. Please download it first from the Models page.",
                 )
 
-        await tts_model.load_model_async(model_size)
-        audio, sample_rate = await tts_model.generate(
-            data.text,
-            voice_prompt,
-            data.language,
-            data.seed,
-            data.instruct,
-        )
+            await tts_model.load_model_async(model_size)
+            save_model_prefs(tts_size=model_size)
+            audio, sample_rate = await tts_model.generate(
+                data.text, voice_prompt, data.language, data.seed, data.instruct,
+            )
+        generation_time_seconds = (datetime.utcnow() - generation_started_at).total_seconds()
 
-        # Calculate duration
         duration = len(audio) / sample_rate
-
-        # Save audio
-        audio_path = config.get_generations_dir() / f"{generation_id}.wav"
-
+        audio_path = config.get_generations_dir() / f"{job_id}.wav"
         from .utils.audio import save_audio
         save_audio(audio, str(audio_path), sample_rate)
 
-        # Create history entry
         generation = await history.create_generation(
             profile_id=data.profile_id,
             text=data.text,
@@ -610,21 +754,190 @@ async def download_model_background():
             seed=data.seed,
             db=db,
             instruct=data.instruct,
+            model_size=model_size,
+            backend_type=get_backend_type(),
+            request_user_id=data.request_user_id,
+            request_user_first_name=data.request_user_first_name,
+            request_ip=request_ip,
+            generation_time_seconds=generation_time_seconds,
         )
-        
-        # Mark generation as complete
-        task_manager.complete_generation(generation_id)
-        
+        task_manager.complete_generation(job_id)
         return generation
-        
+
     except ValueError as e:
-        task_manager.complete_generation(generation_id)
+        task_manager.complete_generation(job_id)
         raise HTTPException(status_code=400, detail=str(e))
     except Exception as e:
-        task_manager.complete_generation(generation_id)
+        task_manager.complete_generation(job_id)
         raise HTTPException(status_code=500, detail=str(e))
 
 
+@app.get("/generate/progress/{generation_id}")
+async def generation_progress(generation_id: str):
+    """Expose progress events for one generation as a Server-Sent Events stream."""
+    # Response headers ask clients and proxies not to cache or buffer the stream.
+    sse_headers = {
+        "Cache-Control": "no-cache",
+        "Connection": "keep-alive",
+        "X-Accel-Buffering": "no",
+    }
+    manager = get_progress_manager()
+
+    async def relay_events():
+        # Forward whatever the progress manager publishes under this id.
+        async for message in manager.subscribe(generation_id):
+            yield message
+
+    stream = relay_events()
+    return StreamingResponse(stream, media_type="text/event-stream", headers=sse_headers)
+
+
+@app.get("/generate/busy")
+async def generation_busy(db: Session = Depends(get_db)):
+    """Report whether any job is mid-generation ("generating" or "cancelling"); queued jobs do not count."""
+    running_states = ["generating", "cancelling"]
+    in_flight = db.query(DBGenerationJob).filter(DBGenerationJob.status.in_(running_states))
+    first_running = in_flight.first()
+    return {"busy": first_running is not None}
+
+
+@app.get("/jobs/pending", response_model=List[models.GenerationJobResponse])
+async def list_pending_jobs(db: Session = Depends(get_db)):
+    """Return all queued, generating and cancelling jobs, oldest first.
+
+    Uses an outer join (like /jobs) so a job whose profile row is gone is
+    still listed, with profile_name "Unknown", instead of silently vanishing.
+    """
+    rows = db.query(DBGenerationJob, DBVoiceProfile.name).outerjoin(
+        DBVoiceProfile, DBGenerationJob.profile_id == DBVoiceProfile.id
+    ).filter(
+        DBGenerationJob.status.in_(["queued", "generating", "cancelling"])
+    ).order_by(DBGenerationJob.created_at).all()
+
+    return [
+        models.GenerationJobResponse(
+            id=job.id, profile_id=job.profile_id, profile_name=profile_name or "Unknown",
+            text=job.text, language=job.language, model_size=job.model_size,
+            backend_type=job.backend_type,
+            request_user_id=job.request_user_id,
+            request_user_first_name=job.request_user_first_name,
+            request_ip=job.request_ip,
+            status=job.status,
+            progress=job.progress,
+            generation_id=job.generation_id,
+            instruct=job.instruct,
+            created_at=job.created_at,
+            started_at=job.started_at,
+            completed_at=job.completed_at,
+        )
+        for job, profile_name in rows
+    ]
+
+
+@app.get("/jobs", response_model=List[models.GenerationJobResponse])
+async def list_jobs(
+    limit: int = Query(default=20, ge=1, le=100),
+    offset: int = Query(default=0, ge=0),
+    status: Optional[str] = Query(default="queued,generating,cancelling,complete"),
+    db: Session = Depends(get_db),
+):
+    """Page through jobs, newest first, optionally narrowed by a comma-separated status list.
+
+    Jobs marked "deleted" are never returned, and a "complete" job is only
+    returned while its linked generation row can still be joined.
+    """
+    not_deleted = DBGenerationJob.status != "deleted"
+    has_output_if_complete = (DBGenerationJob.status != "complete") | (DBGeneration.id.isnot(None))
+    base = (
+        db.query(DBGenerationJob, DBVoiceProfile.name, DBGeneration)
+        .outerjoin(DBVoiceProfile, DBGenerationJob.profile_id == DBVoiceProfile.id)
+        .outerjoin(DBGeneration, DBGenerationJob.generation_id == DBGeneration.id)
+        .filter(not_deleted)
+        .filter(has_output_if_complete)
+    )
+
+    # Blank entries (e.g. "a,,b" or trailing commas) are ignored.
+    wanted = [part.strip() for part in (status or "").split(",") if part.strip()]
+    if wanted:
+        base = base.filter(DBGenerationJob.status.in_(wanted))
+
+    page = base.order_by(DBGenerationJob.created_at.desc()).offset(offset).limit(limit).all()
+
+    def to_response(job, profile_name, generation):
+        # Generation-backed fields stay None until the job has a joined generation row.
+        return models.GenerationJobResponse(
+            id=job.id, profile_id=job.profile_id, profile_name=profile_name or "Unknown",
+            text=job.text, language=job.language, model_size=job.model_size,
+            backend_type=job.backend_type or (generation.backend_type if generation else None),
+            request_user_id=job.request_user_id,
+            request_user_first_name=job.request_user_first_name,
+            request_ip=job.request_ip,
+            status=job.status, progress=job.progress, generation_id=job.generation_id,
+            audio_path=generation.audio_path if generation else None,
+            duration=generation.duration if generation else None,
+            generation_time_seconds=generation.generation_time_seconds if generation else None,
+            instruct=job.instruct or (generation.instruct if generation else None),
+            created_at=job.created_at,
+            started_at=job.started_at,
+            completed_at=job.completed_at,
+        )
+
+    return [to_response(*row) for row in page]
+
+
+@app.post("/jobs/{job_id}/cancel")
+async def cancel_job(job_id: str, db: Session = Depends(get_db)):
+    """Request cancellation of a job.
+
+    A queued job is cancelled immediately; a running job is flagged and moved
+    to "cancelling"; any other state is reported back unchanged.
+    """
+    job = db.query(DBGenerationJob).filter(DBGenerationJob.id == job_id).first()
+    if not job:
+        raise HTTPException(status_code=404, detail="Job not found")
+
+    progress_manager = get_progress_manager()
+    outcome = job.status
+    if outcome == "queued":
+        # Not picked up by the worker yet, so it can be finalised on the spot.
+        outcome = "cancelled"
+        job.status, job.error, job.completed_at = outcome, "Cancelled by user", datetime.utcnow()
+        db.commit()
+        progress_manager.mark_error(job_id, "Cancelled by user")
+    elif outcome in ("generating", "cancelling"):
+        # The worker's progress callback checks this set and aborts the generation.
+        _cancel_requested_jobs.add(job_id)
+        outcome = job.status = "cancelling"
+        db.commit()
+    return {"status": outcome}
+
+
+@app.post("/jobs/{job_id}/cancel/force")
+async def force_cancel_job(job_id: str, db: Session = Depends(get_db)):
+    """Force-cancel an active job; unload the model backend if it was running.
+
+    Unknown ids get a 404 and finished jobs are reported unchanged, so a stray
+    request cannot unload the model underneath an unrelated generation.
+    """
+    job = db.query(DBGenerationJob).filter(DBGenerationJob.id == job_id).first()
+    if not job:
+        raise HTTPException(status_code=404, detail="Job not found")
+    previous = job.status
+    if previous not in ("queued", "generating", "cancelling"):
+        return {"status": previous}
+    _cancel_requested_jobs.add(job_id)
+    job.status, job.error, job.completed_at = "cancelled", "Force-cancelled by user", datetime.utcnow()
+    db.commit()
+    if previous != "queued":  # a queued job has not reached the model yet
+        try:
+            tts.unload_tts_model()  # best-effort hard stop for the current backend
+        except Exception as e:
+            logger.warning(f"[TTS] Could not unload model while force-cancelling {job_id}: {e}")
+    get_progress_manager().mark_error(job_id, "Force-cancelled by user")
+    _job_signal.set()  # wake the job worker so the next queued job can start
+    return {"status": "cancelled"}
+
+
 # ============================================
 # HISTORY ENDPOINTS
 # ============================================
@@ -694,7 +1007,8 @@ async def get_generation(
         DBVoiceProfile,
         DBGeneration.profile_id == DBVoiceProfile.id
     ).filter(
-        DBGeneration.id == generation_id
+        DBGeneration.id == generation_id,
+        DBGeneration.deleted_at.is_(None)
     ).first()
     
     if not result:
@@ -709,8 +1023,14 @@ async def get_generation(
         language=gen.language,
         audio_path=gen.audio_path,
         duration=gen.duration,
+        generation_time_seconds=gen.generation_time_seconds,
         seed=gen.seed,
         instruct=gen.instruct,
+        model_size=gen.model_size,
+        backend_type=gen.backend_type,
+        request_user_id=gen.request_user_id,
+        request_user_first_name=gen.request_user_first_name,
+        request_ip=gen.request_ip,
         created_at=gen.created_at,
     )
 
@@ -724,6 +1044,19 @@ async def delete_generation(
     success = await history.delete_generation(generation_id, db)
     if not success:
         raise HTTPException(status_code=404, detail="Generation not found")
+
+    # Mark linked queue/job rows as deleted so /jobs lists do not surface them.
+    jobs = db.query(DBGenerationJob).filter(DBGenerationJob.generation_id == generation_id).all()
+    now = datetime.utcnow()
+    for job in jobs:
+        job.status = "deleted"
+        if not job.completed_at:
+            job.completed_at = now
+        if not job.error:
+            job.error = "Deleted by user"
+    if jobs:
+        db.commit()
+
     return {"message": "Generation deleted successfully"}
 
 
@@ -735,7 +1068,10 @@ async def export_generation(
     """Export a generation as a ZIP archive."""
     try:
         # Get generation to create filename
-        generation = db.query(DBGeneration).filter_by(id=generation_id).first()
+        generation = db.query(DBGeneration).filter(
+            DBGeneration.id == generation_id,
+            DBGeneration.deleted_at.is_(None),
+        ).first()
         if not generation:
             raise HTTPException(status_code=404, detail="Generation not found")
         
@@ -768,7 +1104,10 @@ async def export_generation_audio(
     db: Session = Depends(get_db),
 ):
     """Export only the audio file from a generation."""
-    generation = db.query(DBGeneration).filter_by(id=generation_id).first()
+    generation = db.query(DBGeneration).filter(
+        DBGeneration.id == generation_id,
+        DBGeneration.deleted_at.is_(None),
+    ).first()
     if not generation:
         raise HTTPException(status_code=404, detail="Generation not found")
     
@@ -801,28 +1140,47 @@ async def transcribe_audio(
     language: Optional[str] = Form(None),
 ):
     """Transcribe audio file to text."""
+    logger.debug(f"[Transcribe] Received file: {file.filename}, content_type: {file.content_type}")
+    
     # Save uploaded file to temporary location
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
         content = await file.read()
         tmp.write(content)
         tmp_path = tmp.name
     
+    logger.debug(f"[Transcribe] Saved to temp file: {tmp_path}, size: {len(content)} bytes")
+    
     try:
         # Get audio duration
         from .utils.audio import load_audio
+        logger.debug(f"[Transcribe] Loading audio from {tmp_path}")
         audio, sr = load_audio(tmp_path)
         duration = len(audio) / sr
+        logger.debug(f"[Transcribe] Audio loaded: {len(audio)} samples, {sr}Hz, {duration:.2f}s")
         
         # Transcribe
         whisper_model = transcribe.get_whisper_model()
 
-        # Check if Whisper model is downloaded (uses default size "base")
+        # Check if Whisper model is downloaded
         model_size = whisper_model.model_size
-        model_name = f"openai/whisper-{model_size}"
+        logger.debug(f"[Transcribe] Using whisper model size: {model_size}")
+        
+        # Get the correct model path based on backend type
+        backend_type = get_backend_type()
+        if backend_type == "mlx":
+            from .backends.mlx_backend import MLXSTTBackend
+            mlx_whisper_map = MLXSTTBackend.get_mlx_whisper_model_map()
+            model_repo_id = mlx_whisper_map.get(model_size, f"mlx-community/whisper-{model_size}-mlx")
+        else:
+            model_repo_id = f"openai/whisper-{model_size}"
+
+        logger.debug(f"[Transcribe] Model repo ID: {model_repo_id}")
 
         # Check if model is cached
         from huggingface_hub import constants as hf_constants
-        repo_cache = Path(hf_constants.HF_HUB_CACHE) / ("models--" + model_name.replace("/", "--"))
+        repo_cache = Path(hf_constants.HF_HUB_CACHE) / ("models--" + model_repo_id.replace("/", "--"))
+        logger.debug(f"[Transcribe] Checking cache at: {repo_cache}, exists: {repo_cache.exists()}")
+        
         if not repo_cache.exists():
             # Start download in background
             progress_model_name = f"whisper-{model_size}"
@@ -831,6 +1189,7 @@ async def download_whisper_background():
                 try:
                     await whisper_model.load_model_async(model_size)
                 except Exception as e:
+                    logger.exception(f"[Transcribe] Background download error: {e}")
                     get_task_manager().error_download(progress_model_name, str(e))
 
             get_task_manager().start_download(progress_model_name)
@@ -846,14 +1205,20 @@ async def download_whisper_background():
                 }
             )
 
+        logger.debug("[Transcribe] Starting transcription...")
         text = await whisper_model.transcribe(tmp_path, language)
+        save_model_prefs(stt_size=model_size)
+        logger.debug(f"[Transcribe] Transcription complete: {text[:100] if text else '(empty)'}...")
         
         return models.TranscriptionResponse(
             text=text,
             duration=duration,
         )
         
+    except HTTPException:
+        raise
     except Exception as e:
+        logger.exception(f"[Transcribe] ERROR: {e}")
         raise HTTPException(status_code=500, detail=str(e))
     finally:
         # Clean up temp file
@@ -1115,6 +1480,7 @@ async def load_model(model_size: str = "1.7B"):
     try:
         tts_model = tts.get_tts_model()
         await tts_model.load_model_async(model_size)
+        save_model_prefs(tts_size=model_size)
         return {"message": f"Model {model_size} loaded successfully"}
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
@@ -1134,14 +1500,14 @@ async def unload_model():
 async def get_model_progress(model_name: str):
     """Get model download progress via Server-Sent Events."""
     from fastapi.responses import StreamingResponse
-    
+
     progress_manager = get_progress_manager()
-    
+
     async def event_generator():
         """Generate SSE events for progress updates."""
         async for event in progress_manager.subscribe(model_name):
             yield event
-    
+
     return StreamingResponse(
         event_generator(),
         media_type="text/event-stream",
@@ -1153,6 +1519,16 @@ async def event_generator():
     )
 
 
+@app.get("/models/progress-snapshot/{model_name}")
+async def get_model_progress_snapshot(model_name: str):
+    """Return the latest download progress for a model as one JSON object, suitable for polling."""
+    snapshot = get_progress_manager().get_progress(model_name)
+    if snapshot is not None:
+        return snapshot
+    # Nothing is tracked under this name: answer with an all-zero "idle" placeholder.
+    return {"model_name": model_name, "status": "idle", "current": 0, "total": 0, "progress": 0, "filename": None}
+
+
 @app.get("/models/status", response_model=models.ModelStatusListResponse)
 async def get_model_status():
     """Get status of all available models."""
@@ -1190,13 +1566,17 @@ def check_whisper_loaded(model_size: str):
     
     # Use backend-specific model IDs
     if backend_type == "mlx":
-        tts_1_7b_id = "mlx-community/Qwen3-TTS-12Hz-1.7B-Base-bf16"
-        tts_0_6b_id = "mlx-community/Qwen3-TTS-12Hz-1.7B-Base-bf16"  # Fallback to 1.7B
-        # MLX backend uses openai/whisper-* models, not mlx-community
-        whisper_base_id = "openai/whisper-base"
-        whisper_small_id = "openai/whisper-small"
-        whisper_medium_id = "openai/whisper-medium"
-        whisper_large_id = "openai/whisper-large"
+        from .backends.mlx_backend import MLXTTSBackend, MLXSTTBackend
+        _mlx_tts = MLXTTSBackend()
+        tts_1_7b_id = _mlx_tts._get_model_path("1.7B")
+        tts_0_6b_id = _mlx_tts._get_model_path("0.6B")
+        # MLX backend uses mlx-community Whisper models
+        mlx_whisper_map = MLXSTTBackend.get_mlx_whisper_model_map()
+        whisper_base_id = mlx_whisper_map["base"]
+        whisper_small_id = mlx_whisper_map["small"]
+        whisper_medium_id = mlx_whisper_map["medium"]
+        whisper_large_id = mlx_whisper_map["large"]
+        whisper_large_v3_turbo_id = mlx_whisper_map["large-v3-turbo"]
     else:
         tts_1_7b_id = "Qwen/Qwen3-TTS-12Hz-1.7B-Base"
         tts_0_6b_id = "Qwen/Qwen3-TTS-12Hz-0.6B-Base"
@@ -1204,6 +1584,7 @@ def check_whisper_loaded(model_size: str):
         whisper_small_id = "openai/whisper-small"
         whisper_medium_id = "openai/whisper-medium"
         whisper_large_id = "openai/whisper-large"
+        whisper_large_v3_turbo_id = "openai/whisper-large-v3-turbo"
     
     model_configs = [
         {
@@ -1248,6 +1629,13 @@ def check_whisper_loaded(model_size: str):
             "model_size": "large",
             "check_loaded": lambda: check_whisper_loaded("large"),
         },
+        {
+            "model_name": "whisper-large-v3-turbo",
+            "display_name": "Whisper Large V3 Turbo",
+            "hf_repo_id": whisper_large_v3_turbo_id,
+            "model_size": "large-v3-turbo",
+            "check_loaded": lambda: check_whisper_loaded("large-v3-turbo"),
+        },
     ]
     
     # Build a mapping of model_name -> hf_repo_id so we can check if shared repos are downloading
@@ -1268,7 +1656,7 @@ def check_whisper_loaded(model_size: str):
     
     statuses = []
     
-    for config in model_configs:
+    for model_config in model_configs:
         try:
             downloaded = False
             size_mb = None
@@ -1276,7 +1664,7 @@ def check_whisper_loaded(model_size: str):
             
             # Method 1: Try using scan_cache_dir if available
             if cache_info:
-                repo_id = config["hf_repo_id"]
+                repo_id = model_config["hf_repo_id"]
                 for repo in cache_info.repos:
                     if repo.repo_id == repo_id:
                         # Check if actual model weight files exist (not just config files)
@@ -1316,7 +1704,7 @@ def check_whisper_loaded(model_size: str):
             if not downloaded:
                 try:
                     cache_dir = hf_constants.HF_HUB_CACHE
-                    repo_cache = Path(cache_dir) / ("models--" + config["hf_repo_id"].replace("/", "--"))
+                    repo_cache = Path(cache_dir) / ("models--" + model_config["hf_repo_id"].replace("/", "--"))
                     
                     if repo_cache.exists():
                         # Check for .incomplete files - if any exist, download is still in progress
@@ -1356,42 +1744,53 @@ def check_whisper_loaded(model_size: str):
             
             # Check if loaded in memory
             try:
-                loaded = config["check_loaded"]()
+                loaded = model_config["check_loaded"]()
             except Exception:
                 loaded = False
-            
+
             # Check if this model (or its shared repo) is currently being downloaded
-            is_downloading = config["hf_repo_id"] in active_download_repos
-            
+            is_downloading = model_config["hf_repo_id"] in active_download_repos
+
             # If downloading, don't report as downloaded (partial files exist)
             if is_downloading:
                 downloaded = False
                 size_mb = None  # Don't show partial size during download
-            
+
+            # If no size from disk, query HuggingFace API
+            if size_mb is None and not is_downloading:
+                from .utils.hf_sizes import get_repo_size_mb
+                size_mb = await get_repo_size_mb(model_config["hf_repo_id"])
+
             statuses.append(models.ModelStatus(
-                model_name=config["model_name"],
-                display_name=config["display_name"],
+                model_name=model_config["model_name"],
+                display_name=model_config["display_name"],
                 downloaded=downloaded,
                 downloading=is_downloading,
                 size_mb=size_mb,
                 loaded=loaded,
             ))
-        except Exception as e:
+        except Exception:
             # If check fails, try to at least check if loaded
             try:
-                loaded = config["check_loaded"]()
+                loaded = model_config["check_loaded"]()
             except Exception:
                 loaded = False
-            
+
             # Check if this model (or its shared repo) is currently being downloaded
-            is_downloading = config["hf_repo_id"] in active_download_repos
-            
+            is_downloading = model_config["hf_repo_id"] in active_download_repos
+
+            # If not downloading, try to get size from HuggingFace API
+            size_mb = None
+            if not is_downloading:
+                from .utils.hf_sizes import get_repo_size_mb
+                size_mb = await get_repo_size_mb(model_config["hf_repo_id"])
+
             statuses.append(models.ModelStatus(
-                model_name=config["model_name"],
-                display_name=config["display_name"],
+                model_name=model_config["model_name"],
+                display_name=model_config["display_name"],
                 downloaded=False,  # Assume not downloaded if check failed
                 downloading=is_downloading,
-                size_mb=None,
+                size_mb=size_mb,
                 loaded=loaded,
             ))
     
@@ -1431,6 +1830,10 @@ async def trigger_model_download(request: models.ModelDownloadRequest):
             "model_size": "large",
             "load_func": lambda: transcribe.get_whisper_model().load_model("large"),
         },
+        "whisper-large-v3-turbo": {
+            "model_size": "large-v3-turbo",
+            "load_func": lambda: transcribe.get_whisper_model().load_model("large-v3-turbo"),
+        },
     }
     
     if request.model_name not in model_configs:
@@ -1446,13 +1849,18 @@ async def download_in_background():
             # If it's a coroutine, await it
             if asyncio.iscoroutine(result):
                 await result
+            # Mark progress as complete - this notifies SSE listeners
+            # This is needed because _load_model_sync only marks complete if
+            # the model wasn't already cached, but we always init progress here
+            progress_manager.mark_complete(request.model_name)
             task_manager.complete_download(request.model_name)
         except Exception as e:
+            progress_manager.mark_error(request.model_name, str(e))
             task_manager.error_download(request.model_name, str(e))
 
     # Start tracking download
     task_manager.start_download(request.model_name)
-    
+
     # Initialize progress state so SSE endpoint has initial data to send.
     # This fixes a race condition where the frontend connects to SSE before
     # any progress callbacks have fired (especially for large models like Qwen
@@ -1465,18 +1873,33 @@ async def download_in_background():
         status="downloading",
     )
 
-    # Start download in background task (don't await)
-    asyncio.create_task(download_in_background())
+    # Start download in background task and store reference for cancellation
+    bg_task = asyncio.create_task(download_in_background())
+    task_manager.set_download_task(request.model_name, bg_task)
 
     # Return immediately - frontend should poll progress endpoint
     return {"message": f"Model {request.model_name} download started"}
 
 
+@app.post("/models/cancel/{model_name}")
+async def cancel_model_download(model_name: str):
+    """Cancel an in-progress model download (404 when none is active).
+
+    The response reports whether the task manager really cancelled it.
+    """
+    task_manager = get_task_manager()
+    if not task_manager.is_download_active(model_name):
+        raise HTTPException(status_code=404, detail=f"No active download for {model_name}")
+    if not task_manager.cancel_download(model_name):
+        return {"message": f"Download of {model_name} could not be cancelled", "cancelled": False}
+    get_progress_manager().mark_error(model_name, "Download cancelled")
+    return {"message": f"Download of {model_name} cancelled", "cancelled": True}
+
+
 @app.delete("/models/{model_name}")
 async def delete_model(model_name: str):
     """Delete a downloaded model from the HuggingFace cache."""
     import shutil
-    import os
     from huggingface_hub import constants as hf_constants
     
     # Map model names to HuggingFace repo IDs
@@ -1561,7 +1984,7 @@ async def clear_cache():
     try:
         deleted_count = clear_voice_prompt_cache()
         return {
-            "message": f"Voice prompt cache cleared successfully",
+            "message": "Voice prompt cache cleared successfully",
             "files_deleted": deleted_count,
         }
     except Exception as e:
@@ -1650,39 +2073,346 @@ def _get_gpu_status() -> str:
     return "None (CPU only)"
 
 
-@app.on_event("startup")
-async def startup_event():
+def _get_model_prefs_path() -> Path:
+    """Locate model_prefs.json inside the configured data directory."""
+    return config.get_data_dir().joinpath("model_prefs.json")
+
+
+def _load_model_prefs() -> dict:
+    """Load model preferences from disk; {} if missing, unreadable or not a JSON object."""
+    import json
+    prefs_path = _get_model_prefs_path()
+    try:
+        prefs = json.loads(prefs_path.read_text())
+    except Exception:  # missing file, permission error, invalid JSON
+        return {}
+    # Valid JSON need not be an object (e.g. null or a list); callers assign keys, so insist on a dict.
+    return prefs if isinstance(prefs, dict) else {}
+
+
+def save_model_prefs(tts_size: Optional[str] = None, stt_size: Optional[str] = None):
+    """Best-effort save of the last-used TTS/STT model sizes; logs a warning instead of raising."""
+    import json
+    try:
+        prefs = _load_model_prefs()
+        if not isinstance(prefs, dict):  # prefs file held valid non-object JSON, e.g. null or a list
+            prefs = {}
+        updates = {"tts_model_size": tts_size, "stt_model_size": stt_size}
+        prefs.update({key: size for key, size in updates.items() if size})
+        _get_model_prefs_path().write_text(json.dumps(prefs, indent=2))
+    except Exception as e:
+        logger.warning(f"Could not save model preferences: {e}")
+
+
+def _cleanup_stale_jobs():
+    """On server start, close out jobs a previous run left queued, generating or cancelling."""
+    session = database.SessionLocal()
+    try:
+        stale_filter = DBGenerationJob.status.in_(["queued", "generating", "cancelling"])
+        leftovers = session.query(DBGenerationJob).filter(stale_filter).all()
+        for leftover in leftovers:
+            # Flag the job as timed out and stamp when that happened.
+            leftover.status, leftover.completed_at = "timeout", datetime.utcnow()
+            logger.info(f"[TTS] Marked stale job {leftover.id} as timeout (server restart)")
+        if leftovers:
+            # One commit covers the whole batch; nothing is written when there were no leftovers.
+            session.commit()
+            logger.info(f"[TTS] Cleaned up {len(leftovers)} stale jobs from previous run")
+    finally:
+        session.close()
+
+
+async def _job_worker():
+    """Background worker that processes queued generation jobs one at a time."""
+    logger.info("[TTS] Job worker started")
+    while True:
+        try:
+            # Wait for signal or poll every 2s
+            try:
+                await asyncio.wait_for(_job_signal.wait(), timeout=2.0)
+                _job_signal.clear()
+            except asyncio.TimeoutError:
+                pass
+
+            db = database.SessionLocal()
+            try:
+                # Check for stuck jobs (generating > 5 min)
+                from datetime import timedelta
+                _expire_old_queued_jobs(db)
+
+                cutoff = datetime.utcnow() - timedelta(minutes=_GENERATING_JOB_TIMEOUT_MINUTES)
+                stuck = db.query(DBGenerationJob).filter(
+                    DBGenerationJob.status.in_(["generating", "cancelling"]),
+                    DBGenerationJob.started_at < cutoff,
+                ).all()
+                for job in stuck:
+                    job.status = "timeout"
+                    job.error = "Generation timeout"
+                    job.completed_at = datetime.utcnow()
+                    try:
+                        get_progress_manager().mark_error(job.id, "Generation timeout")
+                    except Exception:
+                        pass
+                    logger.warning(f"[TTS] Job {job.id} timed out (stuck >5 min)")
+                if stuck:
+                    db.commit()
+
+                # Skip if something is already generating
+                active = db.query(DBGenerationJob).filter(
+                    DBGenerationJob.status.in_(["generating", "cancelling"])
+                ).first()
+                if active:
+                    continue
+
+                # Pick oldest queued job
+                job = db.query(DBGenerationJob).filter(
+                    DBGenerationJob.status == "queued"
+                ).order_by(DBGenerationJob.created_at).first()
+                if not job:
+                    continue
+
+                # Mark as generating
+                job.status = "generating"
+                job.started_at = datetime.utcnow()
+                db.commit()
+
+                job_id = job.id
+                profile_id = job.profile_id
+                text = job.text
+                language = job.language
+                seed = job.seed
+                model_size = job.model_size or "1.7B"
+                instruct = job.instruct
+                request_user_id = job.request_user_id
+                request_user_first_name = job.request_user_first_name
+                request_ip = job.request_ip
+                backend_type = get_backend_type()
+                job.backend_type = backend_type
+                db.commit()
+            finally:
+                db.close()
+
+            logger.info(f"[TTS] Job {job_id} starting generation (ip={request_ip or 'unknown'})")
+
+            progress_manager = get_progress_manager()
+            task_manager = get_task_manager()
+
+            task_manager.start_generation(
+                task_id=job_id,
+                profile_id=profile_id,
+                text=text,
+            )
+
+            # Update SSE to "generating" status
+            progress_manager.update_progress(
+                model_name=job_id,
+                current=0,
+                total=100,
+                status="generating",
+            )
+
+            gen_db = database.SessionLocal()
+            try:
+                generation_started_at = datetime.utcnow()
+                async with _model_lock:
+                    def on_progress(pct):
+                        if job_id in _cancel_requested_jobs:
+                            raise RuntimeError("Cancelled by user")
+                        progress_manager.update_progress(
+                            model_name=job_id,
+                            current=int(pct),
+                            total=100,
+                            status="generating",
+                        )
+                        task_manager.update_generation_progress(job_id, pct)
+                        # Throttled DB update (~every 5%)
+                        if int(pct) % 5 == 0:
+                            try:
+                                upd_db = database.SessionLocal()
+                                upd_job = upd_db.query(DBGenerationJob).get(job_id)
+                                if upd_job:
+                                    upd_job.progress = pct
+                                    upd_db.commit()
+                                upd_db.close()
+                            except Exception:
+                                pass
+
+                    profile = await profiles.get_profile(profile_id, gen_db)
+                    if not profile:
+                        raise ValueError("Profile not found")
+
+                    voice_prompt = await profiles.create_voice_prompt_for_profile(
+                        profile_id, gen_db,
+                    )
+
+                    tts_model = tts.get_tts_model()
+
+                    # Don't silently download — require the model to be cached first
+                    if not tts_model._is_model_cached(model_size):
+                        model_name = f"qwen-tts-{model_size}"
+                        raise ValueError(f"Model {model_name} is not downloaded. Please download it first from the Models page.")
+
+                    await tts_model.load_model_async(model_size)
+                    save_model_prefs(tts_size=model_size)
+
+                    audio, sample_rate = await tts_model.generate(
+                        text, voice_prompt, language, seed, instruct,
+                        progress_callback=on_progress,
+                    )
+
+                duration = len(audio) / sample_rate
+                generation_time_seconds = (datetime.utcnow() - generation_started_at).total_seconds()
+                audio_path = config.get_generations_dir() / f"{job_id}.wav"
+                from .utils.audio import save_audio
+                save_audio(audio, str(audio_path), sample_rate)
+
+                generation = await history.create_generation(
+                    profile_id=profile_id,
+                    text=text,
+                    language=language,
+                    audio_path=str(audio_path),
+                    duration=duration,
+                    seed=seed,
+                    db=gen_db,
+                    instruct=instruct,
+                    model_size=model_size,
+                    backend_type=backend_type,
+                    request_user_id=request_user_id,
+                    request_user_first_name=request_user_first_name,
+                    request_ip=request_ip,
+                    generation_time_seconds=generation_time_seconds,
+                )
+
+                # Mark job complete
+                job_row = gen_db.query(DBGenerationJob).get(job_id)
+                if job_row:
+                    job_row.status = "complete"
+                    job_row.progress = 100.0
+                    job_row.generation_id = generation.id if hasattr(generation, 'id') else None
+                    job_row.completed_at = datetime.utcnow()
+                    gen_db.commit()
+
+                progress_manager.mark_complete(job_id)
+                task_manager.complete_generation(job_id)
+                _cancel_requested_jobs.discard(job_id)
+                logger.info(f"[TTS] Job {job_id} complete ({duration:.1f}s audio)")
+
+            except Exception as e:
+                logger.exception(f"[TTS] Job {job_id} failed: {e}")
+                is_cancelled = job_id in _cancel_requested_jobs or "cancel" in str(e).lower()
+                progress_manager.mark_error(job_id, "Cancelled by user" if is_cancelled else str(e))
+                task_manager.complete_generation(job_id)
+
+                err_db = database.SessionLocal()
+                try:
+                    err_job = err_db.query(DBGenerationJob).get(job_id)
+                    if err_job:
+                        err_job.status = "cancelled" if is_cancelled else "error"
+                        err_job.error = ("Cancelled by user" if is_cancelled else str(e))[:1000]
+                        err_job.completed_at = datetime.utcnow()
+                        err_db.commit()
+                finally:
+                    err_db.close()
+                _cancel_requested_jobs.discard(job_id)
+            finally:
+                gen_db.close()
+
+            # Immediately check for more queued jobs
+            _job_signal.set()
+
+        except asyncio.CancelledError:
+            logger.info("[TTS] Job worker shutting down")
+            break
+        except Exception as e:
+            logger.exception(f"[TTS] Job worker error: {e}")
+            await asyncio.sleep(2)
+
+
+async def _startup():
     """Run on application startup."""
-    print("voicebox API starting up...")
+    _log_level = os.environ.get("LOG_LEVEL", "INFO").upper()  # unknown level names fall back to INFO below
+    logging.basicConfig(
+        format="%(asctime)s %(name)s %(levelname)s %(message)s",
+        level=getattr(logging, _log_level, logging.INFO),
+    )
+    logger.info("voicebox API starting up...")
     database.init_db()
-    print(f"Database initialized at {database._db_path}")
+    logger.info(f"Database initialized at {database._db_path}")
     backend_type = get_backend_type()
-    print(f"Backend: {backend_type.upper()}")
-    print(f"GPU available: {_get_gpu_status()}")
+    logger.info(f"Backend: {backend_type.upper()}")
+    logger.info(f"GPU available: {_get_gpu_status()}")
 
     # Initialize progress manager with main event loop for thread-safe operations
     try:
         progress_manager = get_progress_manager()
         progress_manager._set_main_loop(asyncio.get_running_loop())
-        print("Progress manager initialized with event loop")
+        logger.info("Progress manager initialized with event loop")
     except Exception as e:
-        print(f"Warning: Could not initialize progress manager event loop: {e}")
+        logger.warning(f"Could not initialize progress manager event loop: {e}")
 
-    # Ensure HuggingFace cache directory exists
+    # HuggingFace setup: report whether a token is configured, then make sure the cache dir exists
     try:
         from huggingface_hub import constants as hf_constants
+
+        # Check for HF_TOKEN (huggingface_hub reads it automatically); only the last 4 chars are logged
+        hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
+        if hf_token:
+            logger.info(f"HuggingFace token: {'*' * 4}{hf_token[-4:]}")
+        else:
+            logger.info("HuggingFace token: not set (set HF_TOKEN env var for gated models)")
+
         cache_dir = Path(hf_constants.HF_HUB_CACHE)
         cache_dir.mkdir(parents=True, exist_ok=True)
-        print(f"HuggingFace cache directory: {cache_dir}")
+        logger.info(f"HuggingFace cache directory: {cache_dir}")
     except Exception as e:
-        print(f"Warning: Could not create HuggingFace cache directory: {e}")
-        print("Model downloads may fail. Please ensure the directory exists and has write permissions.")
+        logger.warning(f"Could not set up HuggingFace: {e}")
+        logger.warning("Model downloads may fail. Please ensure the directory exists and has write permissions.")
+
+    # Hand the running loop to the idle timers so they can schedule unloads (best-effort, never fatal)
+    loop = asyncio.get_running_loop()
+    try:
+        tts.get_tts_model()._idle_timer.set_loop(loop)
+    except Exception:
+        logger.debug("Could not set event loop on TTS idle timer", exc_info=True)
+    try:
+        transcribe.get_whisper_model()._idle_timer.set_loop(loop)
+    except Exception:
+        logger.debug("Could not set event loop on Whisper idle timer", exc_info=True)
+
+    # Clear stale jobs from previous server run
+    _cleanup_stale_jobs()
+
+    # Preload models in the background based on last-used preferences
+    asyncio.create_task(_preload_models())
+
+    # Start the job worker
+    asyncio.create_task(_job_worker())
+
+
+async def _preload_models():
+    """Warm up the last-used TTS model at startup; the STT model is left to load lazily."""
+    wanted_size = _load_model_prefs().get("tts_model_size", "1.7B")
+
+    # Best-effort: any failure below is logged as a warning and startup carries on.
+    try:
+        backend = tts.get_tts_model()
+        if not backend._is_model_cached(wanted_size):
+            # Skip rather than load a model that is not cached locally.
+            logger.info(f"TTS model ({wanted_size}) not cached, skipping preload")
+        else:
+            logger.info(f"Preloading TTS model ({wanted_size})...")
+            await backend.load_model_async(wanted_size)
+            logger.info(f"TTS model ({wanted_size}) preloaded")
+    except Exception as exc:
+        logger.warning(f"TTS preload failed: {exc}", exc_info=True)
+
+    # The STT model is deliberately NOT preloaded: it loads on the first /transcribe
+    # call, which saves memory when the user never uses Create Voice.
 
 
-@app.on_event("shutdown")
-async def shutdown_event():
+async def _shutdown():
     """Run on application shutdown."""
-    print("voicebox API shutting down...")
+    logger.info("voicebox API shutting down...")
     # Unload models to free memory
     tts.unload_tts_model()
     transcribe.unload_whisper_model()
@@ -1721,9 +2451,11 @@ async def shutdown_event():
     # Initialize database after data directory is set
     database.init_db()
 
+    _log_level = os.environ.get("LOG_LEVEL", "info").lower()
     uvicorn.run(
         "backend.main:app",
         host=args.host,
         port=args.port,
         reload=False,  # Disable reload in production
+        log_level=_log_level,
     )
diff --git a/backend/models.py b/backend/models.py
index 59e4540..2bda6ff 100644
--- a/backend/models.py
+++ b/backend/models.py
@@ -57,6 +57,8 @@ class GenerationRequest(BaseModel):
     seed: Optional[int] = Field(None, ge=0)
     model_size: Optional[str] = Field(default="1.7B", pattern="^(1\\.7B|0\\.6B)$")
     instruct: Optional[str] = Field(None, max_length=500)
+    request_user_id: Optional[str] = Field(None, max_length=128)
+    request_user_first_name: Optional[str] = Field(None, max_length=64)
 
 
 class GenerationResponse(BaseModel):
@@ -67,14 +69,62 @@ class GenerationResponse(BaseModel):
     language: str
     audio_path: str
     duration: float
+    generation_time_seconds: Optional[float] = None
     seed: Optional[int]
     instruct: Optional[str]
+    model_size: Optional[str] = None
+    backend_type: Optional[str] = None
+    request_user_id: Optional[str] = None
+    request_user_first_name: Optional[str] = None
+    request_ip: Optional[str] = None
     created_at: datetime
 
     class Config:
         from_attributes = True
 
 
+class GenerationStartResponse(BaseModel):
+    """Response model for async generation start: an identifier plus the initial status."""
+    generation_id: str  # NOTE(review): presumably the id clients use to follow the job — confirm against the endpoint
+    status: str = "queued"  # defaults to "queued" when the caller does not set it
+
+
+class GenerationJobResponse(BaseModel):
+    """Response model for a generation job in any of the states listed on `status` (not only pending ones)."""
+    id: str
+    profile_id: str
+    profile_name: str
+    text: str
+    language: str
+    model_size: Optional[str] = None
+    backend_type: Optional[str] = None
+    request_user_id: Optional[str] = None
+    request_user_first_name: Optional[str] = None
+    request_ip: Optional[str] = None
+    status: str  # queued | generating | cancelling | complete | cancelled | error | timeout
+    progress: float  # percentage; the job worker writes 100.0 when a job completes
+    generation_id: Optional[str] = None  # id of the stored generation, filled in when the job completes
+    audio_path: Optional[str] = None
+    duration: Optional[float] = None
+    generation_time_seconds: Optional[float] = None
+    instruct: Optional[str] = None
+    created_at: datetime
+    started_at: Optional[datetime] = None
+    completed_at: Optional[datetime] = None  # the job worker sets this on completion and on error/cancel
+
+    class Config:
+        from_attributes = True  # allow validation from attribute-bearing objects (e.g. ORM rows)
+
+
+class GenerationProgressEvent(BaseModel):
+    """SSE event payload describing the progress of one generation."""
+    generation_id: str
+    progress: float  # 0-100
+    status: str  # "generating", "complete", "failed" — NOTE(review): job rows use "error", not "failed"; confirm which the SSE stream emits
+    duration: Optional[float] = None  # set on complete
+    audio_path: Optional[str] = None  # set on complete
+
+
 class HistoryQuery(BaseModel):
     """Query model for generation history."""
     profile_id: Optional[str] = None
@@ -92,8 +142,14 @@ class HistoryResponse(BaseModel):
     language: str
     audio_path: str
     duration: float
+    generation_time_seconds: Optional[float] = None
     seed: Optional[int]
     instruct: Optional[str]
+    model_size: Optional[str] = None
+    backend_type: Optional[str] = None
+    request_user_id: Optional[str] = None
+    request_user_first_name: Optional[str] = None
+    request_ip: Optional[str] = None
     created_at: datetime
 
     class Config:
diff --git a/backend/platform_detect.py b/backend/platform_detect.py
index c4db19d..eec60cc 100644
--- a/backend/platform_detect.py
+++ b/backend/platform_detect.py
@@ -9,7 +9,7 @@
 def is_apple_silicon() -> bool:
     """
     Check if running on Apple Silicon (arm64 macOS).
-    
+
     Returns:
         True if on Apple Silicon, False otherwise
     """
diff --git a/backend/requirements-mlx.txt b/backend/requirements-mlx.txt
index 80ab173..15fa842 100644
--- a/backend/requirements-mlx.txt
+++ b/backend/requirements-mlx.txt
@@ -3,3 +3,4 @@
 
 mlx>=0.30.0
 mlx-audio>=0.3.1
+tiktoken  # Required by mlx-audio whisper models
diff --git a/backend/requirements.txt b/backend/requirements.txt
index e0f6ded..3ff508a 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -9,16 +9,30 @@ alembic>=1.13.0
 
 # ML models
 torch>=2.1.0
-transformers>=4.36.0
+torchvision>=0.16.0
+torchaudio  # For qwen-tts
+# transformers: flexible version to support both PyTorch (4.57.3) and MLX (5.0.0rc3) on macOS
+transformers>=4.57.3
 accelerate>=0.26.0
 huggingface_hub>=0.20.0
-qwen-tts>=0.0.5
+# Dependencies of qwen-tts (qwen-tts itself is not listed here: it is installed from git on macOS, from PyPI otherwise)
+einops
+gradio
+onnxruntime
+sox
 
 # Audio processing
 librosa>=0.10.0
 soundfile>=0.12.0
 numpy>=1.24.0
+pyloudnorm>=0.1.0
+
+# HTTP client
+httpx>=0.24.0
 
 # Utilities
 python-multipart>=0.0.6
 Pillow>=10.0.0
+
+# RunPod Serverless
+runpod>=1.7.0
diff --git a/backend/server.py b/backend/server.py
index b5621cd..f5015dd 100644
--- a/backend/server.py
+++ b/backend/server.py
@@ -5,6 +5,10 @@
 absolute imports instead of relative imports.
 """
 
+import multiprocessing
+multiprocessing.freeze_support()
+
+import os
 import sys
 import logging
 
@@ -64,7 +68,9 @@
             default=None,
             help="Data directory for database, profiles, and generated audio",
         )
-        args = parser.parse_args()
+        # Use parse_known_args to tolerate extra args from multiprocessing
+        # resource tracker (-B -S -I -c ...) on PyInstaller bundles
+        args, _unknown = parser.parse_known_args()
         logger.info(f"Parsed arguments: host={args.host}, port={args.port}, data_dir={args.data_dir}")
 
         # Set data directory if provided
@@ -77,12 +83,13 @@
         database.init_db()
         logger.info("Database initialized successfully")
 
-        logger.info(f"Starting uvicorn server on {args.host}:{args.port}...")
+        _log_level = os.environ.get("LOG_LEVEL", "info").lower()
+        logger.info(f"Starting uvicorn server on {args.host}:{args.port} (log_level={_log_level})...")
         uvicorn.run(
             app,
             host=args.host,
             port=args.port,
-            log_level="info",
+            log_level=_log_level,
         )
     except Exception as e:
         logger.error(f"Server startup failed: {e}", exc_info=True)
diff --git a/backend/serverless_handler.py b/backend/serverless_handler.py
new file mode 100644
index 0000000..446320d
--- /dev/null
+++ b/backend/serverless_handler.py
@@ -0,0 +1,158 @@
+"""
+RunPod Serverless handler for voicebox TTS server.
+
+Starts the FastAPI/uvicorn server in a background thread and proxies
+RunPod job requests as HTTP calls to the local server.
+
+Usage (RunPod serverless):
+    CMD ["python3", "-u", "-m", "backend.serverless_handler"]
+
+Local testing:
+    python3 -m backend.serverless_handler --rp_serve_api
+"""
+
+import os
+import sys
+import time
+import logging
+import threading
+import base64
+
+import httpx
+import runpod
+import uvicorn
+
+logger = logging.getLogger(__name__)
+
+# ── Configuration ─────────────────────────────────────────────
+_HOST = "127.0.0.1"  # loopback address the background uvicorn server binds to
+_PORT = 17493  # port the background uvicorn server listens on
+_BASE_URL = f"http://{_HOST}:{_PORT}"  # prefix for the health probe and every proxied request
+_STARTUP_TIMEOUT = 300  # 5 min max for cold start model downloads
+_STARTUP_POLL = 2  # seconds between health checks
+
+# ── Server lifecycle (module-level state shared by _start_server/_wait_for_server) ──
+_server_ready = threading.Event()  # set once /health has answered 200; cleared before each (re)start
+_server_thread: threading.Thread | None = None  # thread running uvicorn; None until _start_server() has run
+
+
+def _start_server():
+    """Launch uvicorn serving the voicebox app on a daemon thread; a no-op while that thread is alive."""
+    global _server_thread
+
+    already_up = _server_thread is not None and _server_thread.is_alive()
+    if already_up:
+        return
+
+    _server_ready.clear()
+
+    # SERVERLESS must be in the environment before the app import below so backends disable idle timers
+    os.environ["SERVERLESS"] = "1"
+
+    from backend import config, database
+    from backend.main import app
+
+    config.set_data_dir("/app/data")
+
+    # Daemon thread: the server must not keep the worker process alive on exit.
+    serve_kwargs = {"host": _HOST, "port": _PORT, "log_level": "info"}
+    _server_thread = threading.Thread(target=uvicorn.run, args=(app,), kwargs=serve_kwargs, daemon=True)
+    _server_thread.start()
+
+
+def _wait_for_server():
+    """Block until /health responds 200; raise RuntimeError after _STARTUP_TIMEOUT seconds without success."""
+    if _server_ready.is_set():
+        return
+    # Monotonic clock: a wall-clock adjustment (NTP step, VM resume) must not stretch or cut the timeout.
+    deadline = time.monotonic() + _STARTUP_TIMEOUT
+    while time.monotonic() < deadline:
+        try:
+            r = httpx.get(f"{_BASE_URL}/health", timeout=5)
+            if r.status_code == 200:
+                logger.info("Voicebox server is ready")
+                _server_ready.set()
+                return
+        except httpx.RequestError:
+            pass  # server is not accepting connections yet; poll again
+        time.sleep(_STARTUP_POLL)
+
+    raise RuntimeError(
+        f"Voicebox server did not become healthy within {_STARTUP_TIMEOUT}s"
+    )
+
+
+# ── RunPod handler ────────────────────────────────────────────
+
+def handler(job: dict) -> dict:
+    """
+    RunPod serverless handler: proxy one job to the local voicebox HTTP server.
+
+    job["input"] keys: path (str, required, e.g. "/generate"); method (str,
+    default "POST"); body (dict, sent as JSON for POST/PUT/PATCH only); params
+    (dict, query params); headers (dict). Everything except path is optional.
+
+    Returns {"status_code", "headers", "body"}; binary responses carry "body_base64"
+    and "is_binary" instead of "body"; failures return {"error": <message>}.
+    """
+    _start_server()
+    _wait_for_server()
+
+    inp = job.get("input") or {}  # tolerate an explicit null input
+
+    path = inp.get("path")
+    if not path:
+        return {"error": "Missing 'path' in job input"}
+
+    method = (inp.get("method") or "POST").upper()
+    body = inp.get("body")
+    params = inp.get("params")
+    headers = inp.get("headers") or {}
+    # Force exactly one leading "/": a path such as "@host/x" would otherwise turn 127.0.0.1:17493 into URL userinfo.
+    url = f"{_BASE_URL}/{str(path).lstrip('/')}"
+
+    try:
+        with httpx.Client(timeout=600) as client:
+            response = client.request(
+                method=method,
+                url=url,
+                json=body if method in ("POST", "PUT", "PATCH") else None,
+                params=params,
+                headers=headers,
+            )
+
+        content_type = response.headers.get("content-type", "")
+        is_binary = (
+            "audio/" in content_type
+            or "application/octet-stream" in content_type
+            or "application/zip" in content_type
+        )
+
+        if is_binary:
+            return {
+                "status_code": response.status_code,
+                "headers": dict(response.headers),
+                "body_base64": base64.b64encode(response.content).decode("ascii"),
+                "is_binary": True,
+            }
+
+        try:
+            result = response.json()
+        except Exception:
+            result = response.text  # not JSON: pass the raw text through
+
+        return {
+            "status_code": response.status_code,
+            "headers": dict(response.headers),
+            "body": result,
+        }
+
+    except httpx.TimeoutException:
+        return {"error": "Request to voicebox server timed out (600s)"}
+    except Exception as e:
+        return {"error": f"Request failed: {e}"}
+
+
+# ── Entry point ───────────────────────────────────────────────
+if __name__ == "__main__":  # run as a script/module (see the Usage lines in the module docstring)
+    runpod.serverless.start({"handler": handler})  # hand `handler` to the RunPod serverless runtime
diff --git a/backend/setup-linux.sh b/backend/setup-linux.sh
new file mode 100755
index 0000000..ae84bfc
--- /dev/null
+++ b/backend/setup-linux.sh
@@ -0,0 +1,375 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Voicebox Linux setup script
+# Installs the backend server with CUDA support on x86_64 Linux
+
+INSTALL_DIR="/opt/voicebox"  # overridable with --install-dir
+DATA_DIR="/var/lib/voicebox"  # overridable with --data-dir
+VENV_DIR="$INSTALL_DIR/venv"  # re-derived by the argument parser when --install-dir is given
+SERVICE_USER="voicebox"  # system account created by do_install and used by the systemd unit
+PYTHON_MIN="3.10"  # only used in the "too old" message; check_python hard-codes (3, 10)
+CUDA_MIN="11.8"  # NOTE(review): not referenced anywhere else in this script
+
+RED='\033[0;31m'  # ANSI colour codes used by the info/warn/error helpers
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m'  # reset ("no colour")
+
+info()  { echo -e "${GREEN}[+]${NC} $*"; }  # progress message, written to stdout
+warn()  { echo -e "${YELLOW}[!]${NC} $*"; }  # non-fatal warning, also written to stdout
+error() { echo -e "${RED}[x]${NC} $*" >&2; }  # error message, written to stderr
+die()   { error "$*"; exit 1; }  # report an error and abort the script with status 1
+
+usage() {  # Print the help text, then exit; the heredoc is unquoted so the current defaults are shown.
+    cat <<EOF
+Voicebox Linux Setup
+
+Usage: $0 <command>
+
+Commands:
+  check       Check system requirements (GPU, Python, etc.)
+  install     Install voicebox server to $INSTALL_DIR
+  service     Install and enable systemd service
+  uninstall   Remove voicebox installation
+
+Options:
+  --no-cuda       Skip CUDA, use CPU only
+  --install-dir   Custom install dir (default: $INSTALL_DIR)
+  --data-dir      Custom data dir (default: $DATA_DIR)
+  --port          Server port (default: 17493)
+
+EOF
+    exit 0  # help is not an error, including when no command was given
+}
+
+# --- Checks ---
+
+check_root() {  # Abort unless running as root. Callers pass no arguments, so "$*" was always empty here.
+    if [[ $EUID -ne 0 ]]; then
+        die "This script must be run as root (try: sudo $0 ${COMMAND:-})"
+    fi
+}
+
+check_arch() {
+    # Only x86_64 is supported; any other `uname -m` value aborts via die().
+    local machine
+    machine=$(uname -m)
+    [[ "$machine" == "x86_64" ]] \
+        || die "Unsupported architecture: $machine (need x86_64)"
+    info "Architecture: $machine"
+}
+
+check_python() {  # Pick the first available interpreter, require >= 3.10 and the venv module, set PYTHON_BIN.
+    local py=""
+    for candidate in python3.13 python3.12 python3.11 python3.10 python3; do
+        if command -v "$candidate" &>/dev/null; then
+            py="$candidate"
+            break
+        fi
+    done
+
+    if [[ -z "$py" ]]; then
+        die "Python 3.10+ not found. Install with: apt install python3 python3-venv python3-pip"
+    fi
+    # Version-check the interpreter that was selected, not whatever `python3` happens to resolve to.
+    local ver
+    ver=$("$py" -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")
+    if "$py" -c "import sys; sys.exit(0 if sys.version_info >= (3, 10) else 1)"; then
+        info "Python: $py ($ver)"
+    else
+        die "Python $ver is too old (need $PYTHON_MIN+)"
+    fi
+
+    # Check venv module
+    if ! "$py" -c "import venv" 2>/dev/null; then
+        die "Python venv module missing. Install with: apt install python3-venv"
+    fi
+    # Global on purpose: do_install creates the venv with this interpreter.
+    PYTHON_BIN="$py"
+}
+
+check_nvidia() {  # Report on NVIDIA device nodes, driver and CUDA toolkit; only warns, never fails the check.
+    if [[ "${NO_CUDA:-0}" == "1" ]]; then
+        warn "CUDA skipped (--no-cuda)"
+        return
+    fi
+
+    info "Checking NVIDIA GPU..."
+
+    # Check for nvidia device nodes
+    if [[ ! -e /dev/nvidia0 ]]; then
+        warn "/dev/nvidia0 not found — NVIDIA driver may not be loaded"
+        warn "Try: nvidia-smi  or  modprobe nvidia"
+    else
+        info "Device: /dev/nvidia0 exists"
+        # Check permissions
+        local perms
+        perms=$(stat -c '%a' /dev/nvidia0 2>/dev/null || echo "???")
+        local group
+        group=$(stat -c '%G' /dev/nvidia0 2>/dev/null || echo "???")
+        info "  /dev/nvidia0 permissions: $perms (group: $group)"
+
+        if [[ ! -r /dev/nvidia0 ]] || [[ ! -w /dev/nvidia0 ]]; then
+            warn "  Current user cannot access /dev/nvidia0"
+            warn "  Fix: usermod -aG $group $SERVICE_USER"
+        fi
+    fi
+
+    if [[ ! -e /dev/nvidiactl ]]; then
+        warn "/dev/nvidiactl not found"
+    fi
+
+    if [[ ! -e /dev/nvidia-uvm ]]; then
+        warn "/dev/nvidia-uvm not found (needed for CUDA)"
+        warn "Fix: modprobe nvidia-uvm"
+    fi
+
+    # nvidia-smi: queries end in `|| true` because `set -e -o pipefail` would otherwise abort silently on failure
+    if ! command -v nvidia-smi &>/dev/null; then
+        warn "nvidia-smi not found — NVIDIA driver not installed?"
+        warn "Install: apt install nvidia-driver-535 (or newer)"
+        return
+    fi
+
+    local driver_ver gpu_name gpu_mem
+    driver_ver=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader,nounits 2>/dev/null | head -1) || true
+    gpu_name=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | head -1) || true
+    gpu_mem=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader 2>/dev/null | head -1) || true
+    [[ -n "$gpu_name" ]] || warn "nvidia-smi could not query the GPU (driver not loaded or version mismatch?)"
+    info "GPU: ${gpu_name:-unknown} (${gpu_mem:-unknown})"
+    info "Driver: ${driver_ver:-unknown}"
+
+    # Check CUDA toolkit (a non-matching grep must not abort the script either)
+    if command -v nvcc &>/dev/null; then
+        local cuda_ver
+        cuda_ver=$(nvcc --version | grep -oP 'release \K[\d.]+') || true
+        info "CUDA toolkit: ${cuda_ver:-unknown}"
+    else
+        warn "nvcc not found — CUDA toolkit not installed (PyTorch bundles its own, this is OK)"
+    fi
+}
+
+check_system() {  # Run every requirement probe, then report free disk space and total RAM.
+    info "=== System Check ==="
+    check_arch
+    check_python
+    check_nvidia
+
+    # Free space on the root filesystem, in whole gigabytes
+    local free_gb
+    free_gb=$(df -BG --output=avail / | tail -1 | tr -d ' G')
+    if ! (( free_gb < 15 )); then
+        info "Disk: ${free_gb}G available"
+    else
+        warn "Low disk space: ${free_gb}G free (models need ~10G)"
+    fi
+
+    # MemTotal from /proc/meminfo, scaled down to gigabytes
+    local mem_gb
+    mem_gb=$(awk '/MemTotal/ {printf "%.0f", $2/1024/1024}' /proc/meminfo)
+    if ! (( mem_gb < 8 )); then
+        info "RAM: ${mem_gb}G"
+    else
+        warn "Low RAM: ${mem_gb}G (recommend 16G+)"
+    fi
+
+    echo
+    info "=== Check complete ==="
+}
+
+# --- Install ---
+
+do_install() {  # Create the service user, copy the backend, build the venv and write the launcher (root only).
+    check_root
+    check_arch
+    check_python
+
+    local src_dir
+    src_dir="$(cd "$(dirname "$0")" && pwd)"
+
+    info "Installing voicebox to $INSTALL_DIR..."
+
+    # Create service user
+    if ! id "$SERVICE_USER" &>/dev/null; then
+        info "Creating user: $SERVICE_USER"
+        useradd --system --home-dir "$INSTALL_DIR" --shell /usr/sbin/nologin "$SERVICE_USER"
+    fi
+
+    # Add to video/render groups for GPU access (only the groups that exist on this host)
+    if getent group video &>/dev/null; then
+        usermod -aG video "$SERVICE_USER"
+        info "Added $SERVICE_USER to video group (GPU access)"
+    fi
+    if getent group render &>/dev/null; then
+        usermod -aG render "$SERVICE_USER"
+        info "Added $SERVICE_USER to render group (GPU access)"
+    fi
+
+    # Create directories
+    mkdir -p "$INSTALL_DIR" "$DATA_DIR"
+
+    # Copy backend source (the directory this script lives in)
+    info "Copying backend source..."
+    rsync -a --delete \
+        --exclude='venv/' \
+        --exclude='__pycache__/' \
+        --exclude='*.pyc' \
+        --exclude='.claude/' \
+        "$src_dir/" "$INSTALL_DIR/backend/"
+
+    # Create venv
+    info "Creating Python venv..."
+    $PYTHON_BIN -m venv "$VENV_DIR"
+
+    # Install torch, torchvision and torchaudio from ONE index so requirements.txt cannot swap in a different torch build
+    info "Installing dependencies (this may take a while)..."
+    if [[ "${NO_CUDA:-0}" == "1" ]]; then
+        "$VENV_DIR/bin/pip" install --upgrade pip
+        "$VENV_DIR/bin/pip" install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+    else
+        "$VENV_DIR/bin/pip" install --upgrade pip
+        "$VENV_DIR/bin/pip" install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
+    fi
+
+    "$VENV_DIR/bin/pip" install -r "$INSTALL_DIR/backend/requirements.txt"
+
+    # Copy CLI (best-effort: a missing file is ignored)
+    cp "$src_dir/../voicebox" "$INSTALL_DIR/voicebox" 2>/dev/null || true
+
+    # Create launcher (unquoted heredoc: paths are expanded now, "\$@" is kept literal for run time)
+    cat > /usr/local/bin/voicebox <<LAUNCHER
+#!/bin/bash
+source "$VENV_DIR/bin/activate"
+exec python "$INSTALL_DIR/backend/cli.py" "\$@"
+LAUNCHER
+    chmod +x /usr/local/bin/voicebox
+
+    # Fix ownership
+    chown -R "$SERVICE_USER:$SERVICE_USER" "$INSTALL_DIR" "$DATA_DIR"
+
+    echo
+    info "=== Install complete ==="
+    info "Server dir:  $INSTALL_DIR"
+    info "Data dir:    $DATA_DIR"
+    info "CLI:         /usr/local/bin/voicebox"
+    echo
+    info "Next steps:"
+    info "  voicebox server -d --data-dir $DATA_DIR"
+    info "  voicebox import <voice.zip>"
+    info "  voicebox say -v 'Will' -t 'Hello from Linux'"
+    echo
+    info "Or install as a systemd service:"
+    info "  sudo $0 service"
+}
+
+# --- Systemd ---
+
+do_service() {
+    check_root
+    # NOTE(review): ExecStart passes no --data-dir, so $DATA_DIR is only used for HF_HOME here — confirm that is intended.
+    local port="${PORT:-17493}" gpu_groups
+    gpu_groups=$(getent group video render | cut -d: -f1 | xargs) || true  # existing groups only; an unknown group stops the unit from starting
+    cat > /etc/systemd/system/voicebox.service <<SERVICE
+[Unit]
+Description=Voicebox TTS Server
+After=network.target
+
+[Service]
+Type=simple
+User=$SERVICE_USER
+Group=$SERVICE_USER
+WorkingDirectory=$INSTALL_DIR
+ExecStart=$VENV_DIR/bin/python -m backend.main --host 0.0.0.0 --port $port
+Environment=HOME=$INSTALL_DIR
+Environment=HF_HOME=$DATA_DIR/huggingface
+Restart=on-failure
+RestartSec=5
+
+# GPU access
+SupplementaryGroups=$gpu_groups
+
+# Hardening
+NoNewPrivileges=yes
+ProtectSystem=strict
+ReadWritePaths=$DATA_DIR $INSTALL_DIR
+PrivateTmp=yes
+
+[Install]
+WantedBy=multi-user.target
+SERVICE
+    # restart (not start): re-running this command must apply the regenerated unit to a running service
+    systemctl daemon-reload
+    systemctl enable voicebox
+    systemctl restart voicebox
+
+    info "Systemd service installed and started"
+    info "  Status:  systemctl status voicebox"
+    info "  Logs:    journalctl -u voicebox -f"
+    info "  Stop:    systemctl stop voicebox"
+}
+
+# --- Uninstall ---
+
+do_uninstall() {  # Interactive removal of the service, install dir and launcher; data dir and user are kept.
+    check_root
+
+    warn "This will remove voicebox from $INSTALL_DIR"
+    read -rp "Continue? [y/N] " confirm
+    case "$confirm" in y|Y) ;; *) exit 0 ;; esac
+
+    # Stop the unit if it is running, then unregister and delete it
+    local unit_file=/etc/systemd/system/voicebox.service
+    if systemctl is-active voicebox &>/dev/null; then
+        systemctl stop voicebox
+    fi
+    if [[ -f "$unit_file" ]]; then
+        systemctl disable voicebox 2>/dev/null || true
+        rm -f "$unit_file"
+        systemctl daemon-reload
+    fi
+    rm -rf "$INSTALL_DIR"
+    rm -f /usr/local/bin/voicebox
+
+    info "Removed $INSTALL_DIR and /usr/local/bin/voicebox"
+    warn "Data dir preserved at $DATA_DIR (delete manually if unwanted)"
+
+    # The service account is never deleted here; just tell the admin how to remove it
+    if id "$SERVICE_USER" &>/dev/null; then
+        warn "User '$SERVICE_USER' preserved (remove with: userdel $SERVICE_USER)"
+    fi
+}
+
+# --- Parse args ---
+
+NO_CUDA=0
+PORT=17493
+COMMAND=""
+# Options taking a value use ${2:?…}: a missing or empty value aborts with a clear message.
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        check|install|service|uninstall)
+            COMMAND="$1" ;;
+        --no-cuda)
+            NO_CUDA=1 ;;
+        --install-dir)
+            INSTALL_DIR="${2:?--install-dir requires a value}"; VENV_DIR="$INSTALL_DIR/venv"; shift ;;
+        --data-dir)
+            DATA_DIR="${2:?--data-dir requires a value}"; shift ;;
+        --port)
+            PORT="${2:?--port requires a value}"; shift ;;
+        -h|--help)
+            usage ;;
+        *)
+            die "Unknown argument: $1" ;;
+    esac
+    shift
+done
+
+case "$COMMAND" in
+    check)     check_system ;;
+    install)   do_install ;;
+    service)   do_service ;;
+    uninstall) do_uninstall ;;
+    *)         usage ;;  # no command given: show the help text
+esac
diff --git a/backend/tests/test_tqdm_patch.py b/backend/tests/test_tqdm_patch.py
new file mode 100644
index 0000000..338aa06
--- /dev/null
+++ b/backend/tests/test_tqdm_patch.py
@@ -0,0 +1,169 @@
+#!/usr/bin/env python3
+"""
+Test whether our tqdm monkey-patch actually intercepts HuggingFace Hub downloads.
+
+Run from the voicebox/backend directory:
+    python -m tests.test_tqdm_patch
+"""
+
+import sys
+import os
+import logging
+import threading
+
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+logger = logging.getLogger(__name__)
+
+# Add parent to path so we can import our modules
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+
+def test_1_hf_tqdm_class_identity():
+    """Print which tqdm class huggingface_hub uses, its MRO, and where ``update`` is defined."""
+    import tqdm as tqdm_module
+    from huggingface_hub._snapshot_download import hf_tqdm as snapshot_hf_tqdm
+    from huggingface_hub.utils import tqdm as hf_tqdm
+
+    mro = hf_tqdm.__mro__
+    update_owners = [klass.__name__ for klass in mro if "update" in vars(klass)]
+    print("\n=== Test 1: Class identity ===")
+    print(f"  hf_tqdm class: {hf_tqdm}\n  snapshot hf_tqdm: {snapshot_hf_tqdm}")
+    print(f"  same object: {hf_tqdm is snapshot_hf_tqdm}")
+    print(f"  MRO: {[klass.__name__ for klass in mro]}")
+    print(f"  'update' in hf_tqdm.__dict__: {'update' in vars(hf_tqdm)}\n  'update' defined on: {update_owners}")
+    print(f"  tqdm.tqdm is base: {tqdm_module.tqdm}")
+
+
+def test_2_patch_intercepts_update():
+    """Verify a class-level patch of ``update`` sees every call made on a newly created bar."""
+    from huggingface_hub.utils import tqdm as hf_tqdm_class
+
+    print("\n=== Test 2: Patch intercepts update ===")
+    original_update = hf_tqdm_class.update
+    calls = []
+
+    def patched(self, n=1):
+        # Record the increment and the bar's counters before delegating to the real update
+        calls.append({"n": n, "self_n": getattr(self, "n", "?"), "total": getattr(self, "total", "?")})
+        return original_update(self, n)
+
+    hf_tqdm_class.update = patched
+    try:
+        # Create instance AFTER patching (like snapshot_download would)
+        bar = hf_tqdm_class(total=1000, disable=False)
+        bar.update(100)
+        bar.update(200)
+        bar.close()
+        intercepted = len(calls) == 2
+        print(f"  calls: {calls}")
+        print(f"  bar.n: {bar.n}")
+        print("  PASS: update intercepted" if intercepted else "  FAIL: update not intercepted")
+        assert intercepted, f"expected 2 intercepted update() calls, got {len(calls)}"
+    finally:
+        hf_tqdm_class.update = original_update
+
+
+def test_3_patch_with_our_tracker():
+    """Drive HFProgressTracker.patch_download() with a simulated bar and report tracked progress."""
+    from utils.hf_progress import HFProgressTracker, create_hf_progress_callback
+    from utils.progress import ProgressManager
+
+    print("\n=== Test 3: Our HFProgressTracker ===")
+    pm = ProgressManager()
+    callback = create_hf_progress_callback("test-model", pm)
+    tracker = HFProgressTracker(callback, filter_non_downloads=False)
+
+    with tracker.patch_download():
+        # Check if hf_tqdm_class.update was patched
+        from huggingface_hub.utils import tqdm as hf_tqdm_class
+        print(f"  'update' in hf_tqdm_class.__dict__ after patch: {'update' in hf_tqdm_class.__dict__}")
+        print(f"  update is patched: {hf_tqdm_class.update is not tracker._hf_tqdm_original_update}")
+
+        # Simulate what snapshot_download does
+        bar = hf_tqdm_class(total=50_000_000, desc="Downloading test...", disable=False)
+        bar.update(10_000_000)
+        bar.update(20_000_000)
+        bar.close()
+
+    progress = pm.get_progress("test-model")
+    tracked = bool(progress) and progress.get("current", 0) > 0
+    print(f"  progress: {progress}")
+    print("  PASS: progress tracked" if tracked else "  FAIL: no progress tracked")
+
+
+def test_4_real_small_download():
+    """Run a real (tiny) snapshot_download under the tracker and print whether progress fired (needs network)."""
+    import tempfile
+    from utils.hf_progress import HFProgressTracker
+
+    print("\n=== Test 4: Real HF download (tiny model config) ===")
+
+    updates = []
+
+    def capture_callback(downloaded, total, filename):
+        updates.append({"downloaded": downloaded, "total": total, "filename": filename})
+        # Print the first three updates, then every tenth, to keep the output readable
+        if len(updates) <= 3 or len(updates) % 10 == 0:
+            print(f"  progress: {downloaded}/{total} ({filename})")
+
+    tracker = HFProgressTracker(capture_callback, filter_non_downloads=False)
+
+    # TemporaryDirectory deletes the throwaway cache on exit (mkdtemp left it behind)
+    with tracker.patch_download(), tempfile.TemporaryDirectory() as cache_dir:
+        from huggingface_hub import snapshot_download
+
+        # Download a tiny model (just config files)
+        print("  Starting snapshot_download of a tiny repo...")
+        try:
+            result = snapshot_download(
+                "hf-internal-testing/tiny-random-gpt2",
+                cache_dir=cache_dir,
+                allow_patterns=["*.json"],
+            )
+            print(f"  Downloaded to: {result}")
+        except Exception as e:
+            print(f"  Download error (may be expected): {e}")
+
+    print(f"  Total updates captured: {len(updates)}")
+    print("  PASS: progress captured" if updates else "  FAIL: no progress updates")
+
+
+def test_5_check_thread_safety():
+    """The download runs in a thread via asyncio.to_thread — check a main-thread patch is hit from a worker."""
+    from huggingface_hub.utils import tqdm as hf_tqdm_class
+
+    print("\n=== Test 5: Thread safety ===")
+    original_update = hf_tqdm_class.update
+    calls = []
+
+    def patched(self, n=1):
+        calls.append(threading.current_thread().name)
+        return original_update(self, n)
+
+    # Patch on the main thread; the bar is created and updated on the worker below
+    hf_tqdm_class.update = patched
+    try:
+        def run_in_thread():
+            bar = hf_tqdm_class(total=1000, disable=False)
+            bar.update(100)
+            bar.close()
+
+        worker = threading.Thread(target=run_in_thread, name="worker-thread")
+        worker.start()
+        worker.join()
+
+        crossed = "worker-thread" in calls
+        print(f"  calls from threads: {calls}")
+        print("  PASS: cross-thread works" if crossed else "  FAIL: not called from thread")
+        assert crossed, "patched update() was not invoked from the worker thread"
+    finally:
+        hf_tqdm_class.update = original_update
+
+
+if __name__ == "__main__":
+    # Run the diagnostics in numbered order; an uncaught exception aborts the remaining ones
+    for run_test in (test_1_hf_tqdm_class_identity, test_2_patch_intercepts_update,
+                     test_3_patch_with_our_tracker, test_4_real_small_download,
+                     test_5_check_thread_safety):
+        run_test()
+    print("\n=== All tests complete ===")
diff --git a/backend/utils/audio.py b/backend/utils/audio.py
index 302dff2..4065d83 100644
--- a/backend/utils/audio.py
+++ b/backend/utils/audio.py
@@ -1,46 +1,107 @@
 """
 Audio processing utilities.
+
+Includes EBU R128 loudness normalization with true-peak limiting,
+matching broadcast standards. Uses pyloudnorm for LUFS measurement
+and normalization — pure Python, no ffmpeg dependency.
 """
 
+import logging
 import numpy as np
 import soundfile as sf
 import librosa
 from typing import Tuple, Optional
 
+logger = logging.getLogger(__name__)
+
 
 def normalize_audio(
     audio: np.ndarray,
-    target_db: float = -20.0,
-    peak_limit: float = 0.85,
+    sample_rate: int = 24000,
+    target_lufs: float = -16.0,
+    true_peak_limit_db: float = -2.0,
 ) -> np.ndarray:
     """
-    Normalize audio to target loudness with peak limiting.
-    
+    Normalize audio to target loudness (EBU R128) with true-peak limiting.
+
+    Matches the behavior of ffmpeg's loudnorm filter:
+        loudnorm=I=-16:TP=-2:LRA=11,alimiter=limit=-2dB
+
+    Falls back to simple RMS normalization if pyloudnorm is unavailable
+    or audio is too short for LUFS measurement.
+
     Args:
-        audio: Input audio array
-        target_db: Target RMS level in dB
-        peak_limit: Peak limit (0.0-1.0)
-        
+        audio: Input audio array (mono, float32)
+        sample_rate: Audio sample rate
+        target_lufs: Target integrated loudness in LUFS (default: -16)
+        true_peak_limit_db: True-peak ceiling in dBTP (default: -2)
+
     Returns:
         Normalized audio array
     """
-    # Convert to float32
+    import warnings
     audio = audio.astype(np.float32)
-    
-    # Calculate current RMS
-    rms = np.sqrt(np.mean(audio**2))
-    
-    # Calculate target RMS
-    target_rms = 10**(target_db / 20)
-    
-    # Apply gain
+
+    if len(audio) == 0:
+        return audio
+
+    # True-peak limit as linear amplitude
+    peak_limit = 10 ** (true_peak_limit_db / 20)
+
+    try:
+        import pyloudnorm as pyln
+
+        meter = pyln.Meter(sample_rate)
+
+        # pyloudnorm requires at least 0.4s of audio for LUFS measurement
+        min_samples = int(sample_rate * 0.4)
+        if len(audio) < min_samples:
+            logger.debug("Audio too short for LUFS normalization, using RMS fallback")
+            return _normalize_rms(audio, target_db=target_lufs, peak_limit=peak_limit)
+
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            current_lufs = meter.integrated_loudness(audio)
+
+        # If audio is essentially silent, LUFS returns -inf
+        if not np.isfinite(current_lufs) or current_lufs < -70:
+            logger.debug("Audio too quiet for LUFS normalization (%.1f LUFS)", current_lufs)
+            return audio
+
+        # Apply loudness normalization (suppress clipping warnings — we clip intentionally below)
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore", message=".*Possible clipping.*")
+            audio = pyln.normalize.loudness(audio, current_lufs, target_lufs)
+
+        # True-peak limiting via clipping (simple but effective for TTS output)
+        audio = np.clip(audio, -peak_limit, peak_limit)
+
+        return audio
+
+    except ImportError:
+        logger.warning("pyloudnorm not installed, falling back to RMS normalization")
+        return _normalize_rms(audio, target_db=target_lufs, peak_limit=peak_limit)
+
+
+def _normalize_rms(
+    audio: np.ndarray,
+    target_db: float = -20.0,
+    peak_limit: float = 0.85,
+) -> np.ndarray:
+    """
+    Simple RMS-based normalization fallback.
+
+    Used when pyloudnorm is unavailable or audio is too short for LUFS.
+    """
+    audio = audio.astype(np.float32)
+    rms = np.sqrt(np.mean(audio ** 2))
+    target_rms = 10 ** (target_db / 20)
+
     if rms > 0:
         gain = target_rms / rms
         audio = audio * gain
-    
-    # Peak limiting
+
     audio = np.clip(audio, -peak_limit, peak_limit)
-    
     return audio
 
 
@@ -68,52 +129,88 @@ def save_audio(
     audio: np.ndarray,
     path: str,
     sample_rate: int = 24000,
+    normalize: bool = True,
 ) -> None:
     """
-    Save audio file.
-    
+    Save audio file, optionally with loudness normalization.
+
+    When normalize=True (the default), applies EBU R128 loudness
+    normalization to -16 LUFS with -2 dBTP true-peak limiting before
+    saving. This ensures consistent output volume across generations.
+
     Args:
         audio: Audio array
         path: Output path
         sample_rate: Sample rate
+        normalize: Apply loudness normalization before saving (default: True)
     """
+    if normalize:
+        audio = normalize_audio(audio, sample_rate=sample_rate)
     sf.write(path, audio, sample_rate)
 
 
-def validate_reference_audio(
+def validate_and_normalize_reference_audio(
     audio_path: str,
     min_duration: float = 2.0,
     max_duration: float = 30.0,
+    trim_threshold: float = 45.0,
     min_rms: float = 0.01,
 ) -> Tuple[bool, Optional[str]]:
     """
-    Validate reference audio for voice cloning.
-    
+    Validate, auto-trim, and normalize reference audio for voice cloning.
+
+    Does the heavy lifting so users don't have to manually prepare audio:
+    - Clips slightly over max_duration are auto-trimmed (up to trim_threshold)
+    - Audio is loudness-normalized to broadcast standards (EBU R128)
+    - The old peak > 0.99 "clipping" rejection is gone entirely
+
     Args:
-        audio_path: Path to audio file
+        audio_path: Path to audio file (will be overwritten with processed version)
         min_duration: Minimum duration in seconds
-        max_duration: Maximum duration in seconds
-        min_rms: Minimum RMS level
-        
+        max_duration: Maximum duration in seconds (inclusive — 30.0s is allowed)
+        trim_threshold: Auto-trim clips up to this length to max_duration.
+            Clips longer than this are rejected outright.
+        min_rms: Minimum RMS level (below this = silence)
+
     Returns:
         Tuple of (is_valid, error_message)
     """
     try:
         audio, sr = load_audio(audio_path)
         duration = len(audio) / sr
-        
+
         if duration < min_duration:
-            return False, f"Audio too short (minimum {min_duration} seconds)"
+            return False, f"Audio too short ({duration:.1f}s, minimum {min_duration}s)"
+
+        # Auto-trim clips that are over max_duration but within trim_threshold
         if duration > max_duration:
-            return False, f"Audio too long (maximum {max_duration} seconds)"
-        
-        rms = np.sqrt(np.mean(audio**2))
+            if duration <= trim_threshold:
+                # Trim to max_duration — take the first N seconds
+                max_samples = int(max_duration * sr)
+                audio = audio[:max_samples]
+                logger.info(
+                    "Auto-trimmed reference audio from %.1fs to %.1fs",
+                    duration, max_duration,
+                )
+                duration = max_duration
+            else:
+                return False, (
+                    f"Audio too long ({duration:.1f}s, maximum {max_duration}s). "
+                    f"Clips up to {trim_threshold:.0f}s are auto-trimmed."
+                )
+
+        rms = np.sqrt(np.mean(audio ** 2))
         if rms < min_rms:
             return False, "Audio is too quiet or silent"
-        
-        if np.abs(audio).max() > 0.99:
-            return False, "Audio is clipping (reduce input gain)"
-        
+
+        # Normalize to consistent loudness (EBU R128, -16 LUFS)
+        audio = normalize_audio(audio, sample_rate=sr)
+        sf.write(audio_path, audio, sr)
+
         return True, None
     except Exception as e:
-        return False, f"Error validating audio: {str(e)}"
+        return False, f"Error processing audio: {str(e)}"
+
+
+# Keep old name as alias for backward compatibility
+validate_reference_audio = validate_and_normalize_reference_audio
diff --git a/backend/utils/cache.py b/backend/utils/cache.py
index 1c420ba..9af3219 100644
--- a/backend/utils/cache.py
+++ b/backend/utils/cache.py
@@ -3,12 +3,15 @@
 """
 
 import hashlib
+import logging
 import torch
 from pathlib import Path
 from typing import Optional, Union, Dict, Any
 
 from .. import config
 
+logger = logging.getLogger(__name__)
+
 
 def _get_cache_dir() -> Path:
     """Get cache directory from config."""
@@ -111,7 +114,7 @@ def clear_voice_prompt_cache() -> int:
                 cache_file.unlink()
                 deleted_count += 1
             except Exception as e:
-                print(f"Failed to delete cache file {cache_file}: {e}")
+                logger.warning(f"Failed to delete cache file {cache_file}: {e}")
         
         # Delete combined audio files
         for audio_file in cache_dir.glob("combined_*.wav"):
@@ -119,8 +122,8 @@ def clear_voice_prompt_cache() -> int:
                 audio_file.unlink()
                 deleted_count += 1
             except Exception as e:
-                print(f"Failed to delete combined audio file {audio_file}: {e}")
-    
+                logger.warning(f"Failed to delete combined audio file {audio_file}: {e}")
+
     return deleted_count
 
 
@@ -145,6 +148,6 @@ def clear_profile_cache(profile_id: str) -> int:
                 audio_file.unlink()
                 deleted_count += 1
             except Exception as e:
-                print(f"Failed to delete combined audio file {audio_file}: {e}")
-    
+                logger.warning(f"Failed to delete combined audio file {audio_file}: {e}")
+
     return deleted_count
diff --git a/backend/utils/hf_progress.py b/backend/utils/hf_progress.py
index 7fc88ed..5497c7c 100644
--- a/backend/utils/hf_progress.py
+++ b/backend/utils/hf_progress.py
@@ -2,11 +2,14 @@
 HuggingFace Hub download progress tracking.
 """
 
+import logging
 from typing import Optional, Callable
 from contextlib import contextmanager
 import threading
 import sys
 
+logger = logging.getLogger(__name__)
+
 
 class HFProgressTracker:
     """Tracks HuggingFace Hub download progress by intercepting tqdm."""
@@ -39,7 +42,7 @@ def __init__(self, *args, **kwargs):
                     first_arg = args[0]
                     if isinstance(first_arg, str):
                         desc = first_arg
-                
+
                 filename = ""
                 if desc:
                     # Try to extract filename from description
@@ -48,7 +51,12 @@ def __init__(self, *args, **kwargs):
                         filename = desc.split(":")[0].strip()
                     else:
                         filename = desc.strip()
-                
+
+                # When model is cached, suppress all tqdm output (e.g. "Fetching 12 files")
+                # to keep the console clean on repeat startups.
+                if tracker.filter_non_downloads:
+                    kwargs['disable'] = True
+
                 # Filter out non-standard kwargs that huggingface_hub might pass
                 # These are custom kwargs that tqdm doesn't understand
                 filtered_kwargs = {}
@@ -63,7 +71,7 @@ def __init__(self, *args, **kwargs):
                 for key, value in kwargs.items():
                     if key in tqdm_kwargs:
                         filtered_kwargs[key] = value
-                
+
                 # Try to initialize with filtered kwargs, fall back to all kwargs if that fails
                 try:
                     super().__init__(*args, **filtered_kwargs)
@@ -208,79 +216,54 @@ def patch_download(self):
                 self._original_tqdm_auto = tqdm_module.auto.tqdm
                 tqdm_module.auto.tqdm = tracked_tqdm
             
-            # Patch in sys.modules to catch already-imported references
-            # huggingface_hub uses: from tqdm.auto import tqdm as base_tqdm
-            # So we need to patch both 'tqdm' and 'base_tqdm' attributes
+            # NOTE: We intentionally do NOT replace tqdm classes in sys.modules
+            # with TrackedTqdm. That approach causes `super()` errors because
+            # huggingface_hub's tqdm subclass instances aren't TrackedTqdm instances.
+            # Instead, we monkey-patch .update() directly on HF's tqdm class below.
             self._patched_modules = {}
-            tqdm_attr_names = ['tqdm', 'base_tqdm', 'old_tqdm']  # Various names used
-            
             patched_count = 0
-            for module_name in list(sys.modules.keys()):
-                if "huggingface" in module_name or module_name.startswith("tqdm"):
-                    try:
-                        module = sys.modules[module_name]
-                        for attr_name in tqdm_attr_names:
-                            if hasattr(module, attr_name):
-                                attr = getattr(module, attr_name)
-                                # Only patch if it's a tqdm class (not already patched)
-                                is_tqdm_class = (
-                                    attr is self._original_tqdm_class or 
-                                    (self._original_tqdm_auto and attr is self._original_tqdm_auto) or
-                                    (hasattr(attr, "__name__") and attr.__name__ == "tqdm" and 
-                                     hasattr(attr, "update"))  # tqdm classes have update method
-                                )
-                                if is_tqdm_class:
-                                    key = f"{module_name}.{attr_name}"
-                                    self._patched_modules[key] = (module, attr_name, attr)
-                                    setattr(module, attr_name, tracked_tqdm)
-                                    patched_count += 1
-                    except (AttributeError, TypeError):
-                        pass
-            
-            # ALSO monkey-patch the update method on huggingface_hub's tqdm class
-            # This is needed because the class was already defined at import time
+
+            # Monkey-patch the update method on huggingface_hub's tqdm class.
+            # `from huggingface_hub.utils import tqdm` imports the CLASS directly
+            # (not a module), so we patch .update on the class itself.
             self._hf_tqdm_original_update = None
             try:
-                from huggingface_hub.utils import tqdm as hf_tqdm_module
-                if hasattr(hf_tqdm_module, 'tqdm'):
-                    hf_tqdm_class = hf_tqdm_module.tqdm
-                    self._hf_tqdm_original_update = hf_tqdm_class.update
-                    
-                    # Create a wrapper that calls our tracking
-                    tracker = self  # Reference to HFProgressTracker instance
-                    def patched_update(tqdm_self, n=1):
-                        result = tracker._hf_tqdm_original_update(tqdm_self, n)
-                        
-                        # Track this progress
-                        with tracker._lock:
-                            desc = getattr(tqdm_self, 'desc', '') or ''
-                            current = getattr(tqdm_self, 'n', 0)
-                            total = getattr(tqdm_self, 'total', 0) or 0
-                            
-                            # Skip non-byte progress bars
-                            if 'fetching' in desc.lower():
-                                return result
-                            
-                            # Skip until we have a meaningful total (at least 1MB)
-                            # This avoids the "100% at 0MB" issue when small config
-                            # files are counted before the real model files
-                            MIN_TOTAL_BYTES = 1_000_000  # 1MB
-                            if total >= MIN_TOTAL_BYTES:
-                                tracker._total_downloaded = current
-                                tracker._total_size = total
-                                
-                                if tracker.progress_callback:
-                                    tracker.progress_callback(current, total, desc)
-                        
-                        return result
-                    
-                    hf_tqdm_class.update = patched_update
-                    patched_count += 1
-                    print(f"[HFProgressTracker] Monkey-patched huggingface_hub.utils.tqdm.tqdm.update")
-            except (ImportError, AttributeError) as e:
-                print(f"[HFProgressTracker] Could not monkey-patch hf_tqdm: {e}")
-            
-            print(f"[HFProgressTracker] Patched {patched_count} tqdm references")
+                from huggingface_hub.utils import tqdm as hf_tqdm_class
+                self._hf_tqdm_original_update = hf_tqdm_class.update
+
+                # Create a wrapper that calls our tracking
+                tracker = self  # Reference to HFProgressTracker instance
+                def patched_update(tqdm_self, n=1):
+                    result = tracker._hf_tqdm_original_update(tqdm_self, n)
+
+                    # Track this progress
+                    with tracker._lock:
+                        desc = getattr(tqdm_self, 'desc', '') or ''
+                        current = getattr(tqdm_self, 'n', 0)
+                        total = getattr(tqdm_self, 'total', 0) or 0
+
+                        # Skip non-byte progress bars
+                        if 'fetching' in desc.lower():
+                            return result
+
+                        # Skip until we have a meaningful total (at least 1MB)
+                        MIN_TOTAL_BYTES = 1_000_000  # 1MB
+                        if total >= MIN_TOTAL_BYTES:
+                            tracker._total_downloaded = current
+                            tracker._total_size = total
+
+                            if tracker.progress_callback:
+                                tracker.progress_callback(current, total, desc)
+
+                    return result
+
+                hf_tqdm_class.update = patched_update
+                patched_count += 1
+            except (ImportError, AttributeError):
+                pass
+
+            if not self.filter_non_downloads:
+                logger.debug(f"[HFProgressTracker] Patched {patched_count} tqdm references")
             
             yield
             
@@ -309,9 +292,8 @@ def patched_update(tqdm_self, n=1):
                     # Restore hf_tqdm's original update method
                     if self._hf_tqdm_original_update:
                         try:
-                            from huggingface_hub.utils import tqdm as hf_tqdm_module
-                            if hasattr(hf_tqdm_module, 'tqdm'):
-                                hf_tqdm_module.tqdm.update = self._hf_tqdm_original_update
+                            from huggingface_hub.utils import tqdm as hf_tqdm_class
+                            hf_tqdm_class.update = self._hf_tqdm_original_update
                         except (ImportError, AttributeError):
                             pass
                         self._hf_tqdm_original_update = None
@@ -320,6 +302,37 @@ def patched_update(tqdm_self, n=1):
                     pass
 
 
+@contextmanager
+def hf_offline_for_cached(is_cached: bool):
+    """Temporarily force huggingface_hub into offline mode while a cached model loads.
+
+    Skips the remote validation pass ('Fetching N files'). huggingface_hub reads
+    HF_HUB_OFFLINE into constants.py at import time, so changing the env var alone
+    has no effect afterwards; the module-level constant is patched as well. Both are
+    restored on exit. When ``is_cached`` is false nothing is changed.
+    """
+    if is_cached:
+        import os
+        from huggingface_hub import constants as hf_constants
+
+        env_key = "HF_HUB_OFFLINE"
+        saved_env, saved_const = os.environ.get(env_key), hf_constants.HF_HUB_OFFLINE
+        os.environ[env_key] = "1"
+        hf_constants.HF_HUB_OFFLINE = True
+        try:
+            yield
+        finally:
+            # Put the environment back exactly as found (absent vs. previous value)
+            if saved_env is not None:
+                os.environ[env_key] = saved_env
+            else:
+                os.environ.pop(env_key, None)
+            hf_constants.HF_HUB_OFFLINE = saved_const
+    else:
+        # Not cached: run the body with the env var and constant untouched
+        yield
+
+
 def create_hf_progress_callback(model_name: str, progress_manager):
     """Create a progress callback for HuggingFace downloads."""
     def callback(downloaded: int, total: int, filename: str = ""):
diff --git a/backend/utils/hf_sizes.py b/backend/utils/hf_sizes.py
new file mode 100644
index 0000000..349b3ba
--- /dev/null
+++ b/backend/utils/hf_sizes.py
@@ -0,0 +1,107 @@
+"""
+Query HuggingFace API for model repository sizes.
+Caches results on disk with a 1-day TTL plus in-memory for the server lifetime.
+"""
+
+import json
+import logging
+import time
+from pathlib import Path
+from typing import Dict, Optional
+
+logger = logging.getLogger(__name__)
+
+# In-memory cache: repo_id -> size_mb
+_size_cache: Dict[str, Optional[float]] = {}
+
+# Disk cache: ~/.cache/voicebox/hf_sizes.json
+_CACHE_FILE = Path.home() / ".cache" / "voicebox" / "hf_sizes.json"
+_CACHE_TTL = 86400  # 1 day in seconds
+_disk_cache_loaded = False
+
+
+def _load_disk_cache() -> Dict:
+    """Load the disk cache as {repo_id: {size_mb: float|null, ts: epoch}}; {} if missing or unreadable."""
+    global _disk_cache_loaded
+    _disk_cache_loaded = True  # set on every call, even when the file is absent or invalid
+    try:
+        if _CACHE_FILE.exists():
+            data = json.loads(_CACHE_FILE.read_text())
+            if isinstance(data, dict):
+                return data
+    except Exception as e:
+        logger.debug(f"Failed to load HF size cache: {e}")  # best-effort: fall back to an empty cache
+    return {}
+
+
+def _save_disk_cache(cache: Dict) -> None:
+    """Write ``cache`` to _CACHE_FILE as indented JSON; any failure is logged at debug level and ignored."""
+    try:
+        _CACHE_FILE.parent.mkdir(parents=True, exist_ok=True)
+        _CACHE_FILE.write_text(json.dumps(cache, indent=2))
+    except Exception as e:
+        logger.debug(f"Failed to save HF size cache: {e}")
+
+
+async def get_repo_size_mb(repo_id: str) -> Optional[float]:
+    """
+    Get the total size of a HuggingFace model repo in MB.
+
+    Lookup order: in-memory cache, then the 1-day disk cache, then the HF API.
+    Returns None if the query fails (network error, repo not found, etc).
+    """
+    # Check in-memory cache first (includes None for lookups that failed this session)
+    if repo_id in _size_cache:
+        return _size_cache[repo_id]
+
+    # Check disk cache (1-day TTL). The file is plain JSON that may be hand-edited or
+    # corrupted, so malformed entries are skipped instead of raising AttributeError/TypeError.
+    disk_cache = _load_disk_cache()
+    entry = disk_cache.get(repo_id)
+    if isinstance(entry, dict) and isinstance(entry.get("ts"), (int, float)):
+        size_mb = entry.get("size_mb")
+        if isinstance(size_mb, (int, float)) and time.time() - entry["ts"] < _CACHE_TTL:
+            _size_cache[repo_id] = size_mb
+            return size_mb
+
+    # Fetch from HuggingFace API
+    size_mb = await _fetch_repo_size(repo_id)
+
+    # Always cache in memory (avoids repeated API calls within same session)
+    _size_cache[repo_id] = size_mb
+    # Only persist non-None results to disk (don't cache failures for a day)
+    if size_mb is not None:
+        disk_cache[repo_id] = {"size_mb": size_mb, "ts": time.time()}
+        _save_disk_cache(disk_cache)
+
+    return size_mb
+
+
+async def _fetch_repo_size(repo_id: str) -> Optional[float]:
+    """Return the summed size in MB of all files in a HuggingFace model repo, or None on any failure."""
+    try:
+        import httpx
+        async with httpx.AsyncClient(timeout=10.0) as client:
+            # blobs=true asks the API to include a byte `size` for every sibling file
+            resp = await client.get(f"https://huggingface.co/api/models/{repo_id}?blobs=true")
+            if resp.status_code != 200:
+                logger.warning(f"HF API returned {resp.status_code} for {repo_id}")
+                return None
+
+            data = resp.json()
+            if not isinstance(data, dict):
+                return None
+            # `safetensors.total` is a PARAMETER count, not bytes, so it is not used
+            # as a size; only real per-file byte sizes are summed.
+            siblings = data.get("siblings") or []
+            total_bytes = sum(
+                s["size"] for s in siblings
+                if isinstance(s, dict) and isinstance(s.get("size"), (int, float))
+            )
+            if total_bytes > 0:
+                return total_bytes / (1024 * 1024)
+            return None
+
+    except Exception as e:
+        logger.warning(f"Failed to query HF for {repo_id}: {e}")
+        return None
diff --git a/backend/utils/idle_timer.py b/backend/utils/idle_timer.py
new file mode 100644
index 0000000..42c451d
--- /dev/null
+++ b/backend/utils/idle_timer.py
@@ -0,0 +1,67 @@
+"""
+Idle timer for auto-unloading models after inactivity.
+
+Usage:
+    timer = IdleTimer(timeout=180, on_timeout=backend.unload_model, label="TTS")
+    timer.touch()  # Reset timer on each model use
+    timer.cancel()  # Cancel before explicit unload
+"""
+
+import asyncio
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+class IdleTimer:
+    """Invoke ``on_timeout`` once ``timeout`` seconds pass without a ``touch()``.
+
+    Scheduling is done with ``call_later`` on the configured asyncio event loop.
+    Every ``touch()`` cancels the pending timer and starts a new countdown; a
+    non-positive timeout or a missing loop means nothing is scheduled.
+    """
+
+    def __init__(
+        self,
+        timeout: float,
+        on_timeout: callable,
+        label: str = "model",
+        loop: asyncio.AbstractEventLoop | None = None,
+    ):
+        self._handle: asyncio.TimerHandle | None = None  # pending call_later handle, if any
+        self._loop = loop
+        self.label = label
+        self.on_timeout = on_timeout
+        self.timeout = timeout
+
+    def set_loop(self, loop: asyncio.AbstractEventLoop):
+        """Attach or replace the event loop that later ``touch()`` calls schedule on."""
+        self._loop = loop
+
+    def touch(self):
+        """Restart the idle countdown, replacing any timer that is already pending.
+        NOTE(review): scheduling uses loop.call_later, which asyncio documents as not
+        thread-safe — confirm callers only touch() from the loop's own thread.
+        """
+        self.cancel()
+        if self.timeout <= 0 or self._loop is None:
+            return  # Disabled (serverless mode) or no loop yet — skip scheduling
+        try:
+            self._handle = self._loop.call_later(self.timeout, self._fire)
+        except RuntimeError:
+            return  # Loop is closed — nothing to schedule on
+
+    def cancel(self):
+        """Cancel the pending timeout, if one is scheduled."""
+        pending, self._handle = self._handle, None
+        if pending is not None:
+            pending.cancel()
+
+    def _fire(self):
+        """Timer callback: clear the handle, log, and run ``on_timeout`` (errors are logged, not raised)."""
+        self._handle = None
+        logger.info(f"[IdleTimer] {self.label} idle for {self.timeout}s — unloading")
+        try:
+            self.on_timeout()
+        except Exception:
+            logger.exception(f"[IdleTimer] Error unloading {self.label}")
diff --git a/backend/utils/progress.py b/backend/utils/progress.py
index 418a88c..a58c31b 100644
--- a/backend/utils/progress.py
+++ b/backend/utils/progress.py
@@ -143,6 +143,13 @@ def update_progress(
         else:
             logger.debug(f"No listeners for {model_name}, progress update stored: {progress_pct:.1f}%")
     
+    def clear_progress(self, model_name: str):
+        """Drop the stored progress for ``model_name`` without notifying listeners (e.g. after a cached load)."""
+        with self._lock:
+            # Forget the progress entry and both last-notification records.
+            for store in (self._progress, self._last_notify_time, self._last_notify_progress):
+                store.pop(model_name, None)
+
     def get_progress(self, model_name: str) -> Optional[Dict]:
         """Get current progress for a model. Thread-safe."""
         with self._lock:
@@ -220,13 +227,13 @@ async def subscribe(self, model_name: str):
             
             if initial_progress:
                 status = initial_progress.get('status')
-                # Only send initial progress if download is actually in progress
-                # Don't send old 'complete' or 'error' status from previous downloads
-                if status in ('downloading', 'extracting'):
-                    logger.info(f"Sending initial progress for {model_name}: {status}")
-                    yield f"data: {json.dumps(initial_progress)}\n\n"
-                else:
-                    logger.info(f"Skipping initial progress for {model_name} (status: {status})")
+                logger.info(f"Sending initial progress for {model_name}: {status}")
+                yield f"data: {json.dumps(initial_progress)}\n\n"
+                # If already complete or error, close immediately so late
+                # subscribers still receive the terminal status
+                if status in ('complete', 'error'):
+                    logger.info(f"Download already {status} for {model_name}, closing SSE")
+                    return
             else:
                 logger.info(f"No initial progress available for {model_name}")
 
@@ -271,7 +278,14 @@ def mark_complete(self, model_name: str):
         logger.info(f"Marked {model_name} as complete")
         # Notify listeners (thread-safe)
         self._notify_listeners_threadsafe(model_name, progress_data)
-    
+
+        # Remove the entry after notifying so future subscribers don't see stale
+        # "complete" status from a previous download and close immediately.
+        with self._lock:
+            self._progress.pop(model_name, None)
+            self._last_notify_time.pop(model_name, None)
+            self._last_notify_progress.pop(model_name, None)
+
     def mark_error(self, model_name: str, error: str):
         """Mark a model download as failed. Thread-safe."""
         import logging
diff --git a/backend/utils/tasks.py b/backend/utils/tasks.py
index 05b8e01..914cc44 100644
--- a/backend/utils/tasks.py
+++ b/backend/utils/tasks.py
@@ -2,6 +2,7 @@
 Task tracking for active downloads and generations.
 """
 
+import asyncio
 from typing import Optional, Dict, List
 from datetime import datetime
 from dataclasses import dataclass, field
@@ -14,6 +15,7 @@ class DownloadTask:
     status: str = "downloading"  # downloading, extracting, complete, error
     started_at: datetime = field(default_factory=datetime.utcnow)
     error: Optional[str] = None
+    asyncio_task: Optional[asyncio.Task] = field(default=None, repr=False)
 
 
 @dataclass
@@ -23,33 +25,53 @@ class GenerationTask:
     profile_id: str
     text_preview: str  # First 50 chars of text
     started_at: datetime = field(default_factory=datetime.utcnow)
+    progress: float = 0.0  # 0-100
 
 
 class TaskManager:
     """Manages active downloads and generations."""
-    
+
     def __init__(self):
         self._active_downloads: Dict[str, DownloadTask] = {}
         self._active_generations: Dict[str, GenerationTask] = {}
-    
-    def start_download(self, model_name: str) -> None:
-        """Mark a download as started."""
+
+    def start_download(self, model_name: str, asyncio_task: Optional[asyncio.Task] = None) -> None:
+        """Mark a download as started.
+
+        Does not overwrite an existing entry, so an ``asyncio_task`` already
+        attached to ``model_name`` is preserved.
+
+        Args:
+            model_name: Key under which the download is tracked.
+            asyncio_task: Optional task to store on the newly created entry.
+        """
+        if model_name in self._active_downloads:
+            return
         self._active_downloads[model_name] = DownloadTask(
             model_name=model_name,
             status="downloading",
+            asyncio_task=asyncio_task,
         )
-    
+
+    def set_download_task(self, model_name: str, asyncio_task: asyncio.Task) -> None:
+        """Record ``asyncio_task`` on the tracked download for ``model_name``; unknown names are ignored."""
+        if (entry := self._active_downloads.get(model_name)) is not None:
+            entry.asyncio_task = asyncio_task
+
+    def cancel_download(self, model_name: str) -> bool:
+        """Stop tracking a download, cancelling its asyncio task when one is attached.
+
+        Returns True if ``model_name`` was an active download, False otherwise.
+        """
+        entry = self._active_downloads.pop(model_name, None)
+        if entry is None:
+            return False
+        if entry.asyncio_task:
+            entry.asyncio_task.cancel()
+        return True
+
     def complete_download(self, model_name: str) -> None:
         """Mark a download as complete."""
         if model_name in self._active_downloads:
             del self._active_downloads[model_name]
-    
+
     def error_download(self, model_name: str, error: str) -> None:
-        """Mark a download as failed."""
+        """Mark a download as failed and remove from active downloads.
+
+        ``error`` is accepted but not stored: the entry is simply deleted, so
+        the failure message is not retained here.
+        """
         if model_name in self._active_downloads:
-            self._active_downloads[model_name].status = "error"
-            self._active_downloads[model_name].error = error
-    
+            del self._active_downloads[model_name]
+
     def start_generation(self, task_id: str, profile_id: str, text: str) -> None:
         """Mark a generation as started."""
         text_preview = text[:50] + "..." if len(text) > 50 else text
@@ -58,24 +80,29 @@ def start_generation(self, task_id: str, profile_id: str, text: str) -> None:
             profile_id=profile_id,
             text_preview=text_preview,
         )
-    
+
+    def update_generation_progress(self, task_id: str, progress: float) -> None:
+        """Store ``progress`` on the active generation ``task_id``; unknown ids are ignored."""
+        if (generation := self._active_generations.get(task_id)) is not None:
+            generation.progress = progress
+
     def complete_generation(self, task_id: str) -> None:
         """Mark a generation as complete."""
         if task_id in self._active_generations:
             del self._active_generations[task_id]
-    
+
     def get_active_downloads(self) -> List[DownloadTask]:
         """Get all active downloads."""
         return list(self._active_downloads.values())
-    
+
     def get_active_generations(self) -> List[GenerationTask]:
         """Get all active generations."""
         return list(self._active_generations.values())
-    
+
     def is_download_active(self, model_name: str) -> bool:
         """Check if a download is active."""
         return model_name in self._active_downloads
-    
+
     def is_generation_active(self, task_id: str) -> bool:
         """Check if a generation is active."""
         return task_id in self._active_generations
diff --git a/data/cache/combined_795ba722-0c1a-4574-8169-405025752360_8a2218374f55.wav b/data/cache/combined_795ba722-0c1a-4574-8169-405025752360_8a2218374f55.wav
new file mode 100644
index 0000000..bbd8d92
Binary files /dev/null and b/data/cache/combined_795ba722-0c1a-4574-8169-405025752360_8a2218374f55.wav differ
diff --git a/landing/src/components/PlatformIcons.tsx b/landing/src/components/PlatformIcons.tsx
index 5fa37a1..3a020f5 100644
--- a/landing/src/components/PlatformIcons.tsx
+++ b/landing/src/components/PlatformIcons.tsx
@@ -1,6 +1,7 @@
 export function AppleIcon({ className }: { className?: string }) {
   return (
-    <svg className={className} viewBox="0 0 24 24" fill="currentColor">
+    <svg className={className} viewBox="0 0 24 24" fill="currentColor" aria-label="Apple logo">
+      <title>Apple</title>
       <path d="M17.05 20.28c-.98.95-2.05.88-3.08.4-1.09-.5-2.08-.48-3.24 0-1.44.62-2.2.44-3.06-.4C2.79 15.25 3.51 7.59 9.05 7.31c1.35.07 2.29.74 3.08.8 1.18-.24 2.31-.93 3.57-.84 1.51.12 2.65.72 3.4 1.8-3.12 1.87-2.38 5.98.48 7.13-.57 1.5-1.31 2.99-2.54 4.09l.01-.01zM12.03 7.25c-.15-2.23 1.66-4.07 3.74-4.25.29 2.58-2.34 4.5-3.74 4.25z" />
     </svg>
   );
@@ -8,7 +9,8 @@ export function AppleIcon({ className }: { className?: string }) {
 
 export function WindowsIcon({ className }: { className?: string }) {
   return (
-    <svg className={className} viewBox="0 0 24 24" fill="currentColor">
+    <svg className={className} viewBox="0 0 24 24" fill="currentColor" aria-label="Windows logo">
+      <title>Windows</title>
       <path d="M3 12V6.75l6-1.32v6.48L3 12zm17-9v8.75l-10 .15V5.21L20 3zM3 13l6 .09v7.81l-6-1.15V13zm17 .25V22l-10-1.8v-7.15l10 .15z" />
     </svg>
   );
@@ -16,7 +18,8 @@ export function WindowsIcon({ className }: { className?: string }) {
 
 export function LinuxIcon({ className }: { className?: string }) {
   return (
-    <svg className={className} viewBox="0 0 24 24" fill="currentColor">
+    <svg className={className} viewBox="0 0 24 24" fill="currentColor" aria-label="Linux logo">
+      <title>Linux</title>
       <path d="M12.504 0c-.155 0-.315.008-.48.021-4.226.333-3.105 4.807-3.17 6.298-.076 1.092-.3 1.953-1.05 3.02-.885 1.051-2.127 2.75-2.716 4.521-.278.832-.41 1.684-.287 2.489a.424.424 0 00-.11.135c-.26.26-.195.69-.133 1.001.054.27.112.553.077.784-.12.794-.3 1.593-.3 2.406 0 .599.18 1.193.3 1.791.12.599.3 1.193.3 1.792 0 .812.18 1.611.3 2.405.035.23-.023.514-.077.783-.062.312-.127.742.133 1.002a.424.424 0 00.11.135c-.123.805.01 1.657.287 2.489.589 1.771 1.831 3.47 2.716 4.521.75 1.067 0.974 1.928 1.05 3.02.065 1.491-1.056 5.965 3.17 6.298.165.013.325.021.48.021.155 0 .315-.008.48-.021 4.226-.333 3.105-4.807 3.17-6.298.076-1.092.3-1.953 1.05-3.02.885-1.051 2.127-2.75 2.716-4.521.278-.832.41-1.684.287-2.489a.424.424 0 00.11-.135c.26-.26.195-.69.133-1.001-.054-.27-.112-.553-.077-.784.12-.794.3-1.593.3-2.406 0-.599-.18-1.193-.3-1.791-.12-.599-.3-1.193-.3-1.792 0-.812-.18-1.611-.3-2.405-.035-.23.023-.514.077-.783.062-.312.127-.742-.133-1.002a.424.424 0 00-.11-.135c.123-.805-.01-1.657-.287-2.489-.589-1.771-1.831-3.47-2.716-4.521-.75-1.067-.974-1.928-1.05-3.02-.065-1.491 1.056-5.965-3.17-6.298C12.819.008 12.659 0 12.504 0z" />
     </svg>
   );
diff --git a/package.json b/package.json
index c0f3c21..6847a57 100644
--- a/package.json
+++ b/package.json
@@ -12,7 +12,8 @@
     "dev": "bun run setup:dev && cd tauri && bun run tauri dev",
     "dev:web": "cd web && bun run dev",
     "dev:landing": "cd landing && bun run dev",
-    "dev:server": "uvicorn backend.main:app --reload --port 17493",
+    "dev:server": "bash ./scripts/dev-backend-watch.sh",
+    "dev:server:watch": "bash ./scripts/dev-backend-watch.sh",
     "setup:dev": "bun run scripts/setup-dev-sidecar.js",
     "build": "./scripts/build-server.sh && cd tauri && bun run tauri build",
     "build:web": "cd web && bun run build",
diff --git a/scripts/dev-backend-watch.sh b/scripts/dev-backend-watch.sh
new file mode 100755
index 0000000..d6382fd
--- /dev/null
+++ b/scripts/dev-backend-watch.sh
@@ -0,0 +1,40 @@
+#!/usr/bin/env bash
+# Run the backend under uvicorn with auto-reload, always using the project venv.
+set -euo pipefail
+
+log() { printf '[dev-backend-watch] %s\n' "$1"; }
+
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+EXPECTED_VENV="$ROOT_DIR/backend/venv"
+EXPECTED_PY="$EXPECTED_VENV/bin/python"
+PORT="${PORT:-17493}"
+
+# Source .env from $ROOT_DIR, then from its parent.  Assignments are exported
+# (set -a), and the parent file wins on conflicts because it is loaded last.
+for env_file in "$ROOT_DIR/.env" "$ROOT_DIR/../.env"; do
+  [[ -f "$env_file" ]] || continue
+  log "Loading env: $env_file"
+  set -a
+  # shellcheck disable=SC1090
+  source "$env_file"
+  set +a
+done
+
+if [[ ! -x "$EXPECTED_PY" ]]; then
+  log "Missing virtualenv python: $EXPECTED_PY" >&2
+  log "Run: make -C $ROOT_DIR setup-python" >&2
+  exit 1
+fi
+
+# A different activated venv only triggers a warning; the project one is used.
+if [[ -n "${VIRTUAL_ENV:-}" && "$VIRTUAL_ENV" != "$EXPECTED_VENV" ]]; then
+  log "Warning: active VIRTUAL_ENV is '$VIRTUAL_ENV'" >&2
+  log "Using project venv instead: '$EXPECTED_VENV'" >&2
+fi
+
+log "Using python: $EXPECTED_PY"
+"$EXPECTED_PY" -c 'import sys; print(f"[dev-backend-watch] Python {sys.version.split()[0]}")'
+
+cd "$ROOT_DIR"
+uvicorn_args=(--host 127.0.0.1 --port "$PORT" --reload --reload-dir backend)
+exec "$EXPECTED_PY" -m uvicorn backend.main:app "${uvicorn_args[@]}"
diff --git a/scripts/serverless-build.sh b/scripts/serverless-build.sh
new file mode 100755
index 0000000..9649834
--- /dev/null
+++ b/scripts/serverless-build.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+# Build and push the voicebox serverless Docker image for RunPod.
+#
+# Usage:
+#   ./scripts/serverless-build.sh                        # build only
+#   ./scripts/serverless-build.sh --push                 # build + push
+#   ./scripts/serverless-build.sh --push --tag ghcr.io/you/voicebox-serverless:latest
+#
+# Environment:
+#   RUNPOD_API_KEY  — set in .env at the project root (not used by the image itself)
+
+# Abort on command failure, on use of an unset variable, and on a failed
+# pipeline stage.
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+VOICEBOX_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
+
+# Defaults
+IMAGE_TAG="voicebox-serverless:latest"
+PUSH=0
+
+# Print the usage line on stderr (only shown alongside an error).
+usage() {
+    echo "Usage: $0 [--push] [--tag IMAGE:TAG]" >&2
+}
+
+# Parse args
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --push)
+            PUSH=1
+            shift
+            ;;
+        --tag)
+            # ${2:-} keeps a missing value from tripping `set -u`.
+            if [[ -z "${2:-}" ]]; then
+                echo "Error: --tag requires a value" >&2
+                usage
+                exit 1
+            fi
+            IMAGE_TAG="$2"
+            shift 2
+            ;;
+        *)
+            # Diagnostics go to stderr, matching the --tag error above.
+            echo "Unknown argument: $1" >&2
+            usage
+            exit 1
+            ;;
+    esac
+done
+
+echo "Building voicebox serverless image: $IMAGE_TAG"
+echo "Context: $VOICEBOX_DIR"
+
+DOCKER_BUILDKIT=1 docker build \
+    --build-arg CUDA=1 \
+    --build-arg SERVERLESS=1 \
+    -t "$IMAGE_TAG" \
+    "$VOICEBOX_DIR"
+
+echo "Build complete: $IMAGE_TAG"
+
+if [ "$PUSH" = "1" ]; then
+    echo "Pushing $IMAGE_TAG..."
+    docker push "$IMAGE_TAG"
+    echo "Pushed: $IMAGE_TAG"
+    echo ""
+    echo "Use this image URL when creating your RunPod endpoint."
+fi
diff --git a/tauri/src-tauri/build.rs b/tauri/src-tauri/build.rs
index ea61259..2d3c952 100644
--- a/tauri/src-tauri/build.rs
+++ b/tauri/src-tauri/build.rs
@@ -38,6 +38,37 @@ fn main() {
             println!("cargo:rerun-if-changed={}/icon.json", icon_source);
             println!("cargo:rerun-if-changed={}/Assets", icon_source);
 
+            // Save original xcode-select path
+            let original_path = Command::new("xcode-select")
+                .arg("-p")
+                .output()
+                .ok()
+                .and_then(|out| String::from_utf8(out.stdout).ok())
+                .map(|s| s.trim().to_string());
+
+            // Check if we need to switch to Xcode
+            let needs_switch = original_path
+                .as_ref()
+                .map(|path| !path.contains("Xcode.app"))
+                .unwrap_or(false);
+
+            if needs_switch {
+                // Switch to Xcode temporarily
+                let switch_result = Command::new("sudo")
+                    .args([
+                        "xcode-select",
+                        "--switch",
+                        "/Applications/Xcode.app/Contents/Developer",
+                    ])
+                    .status();
+
+                if switch_result.is_err() || !switch_result.unwrap().success() {
+                    println!("cargo:warning=Failed to switch to Xcode - skipping icon compilation");
+                    println!("cargo:warning=Install full Xcode or run manually: sudo xcode-select --switch /Applications/Xcode.app/Contents/Developer");
+                    return;
+                }
+            }
+
             let partial_plist = format!("{}/partial.plist", gen_dir);
             let output = Command::new("xcrun")
                 .args([
@@ -61,19 +92,27 @@ fn main() {
                 ])
                 .output();
 
+            // Switch back to original path only if we changed it
+            if needs_switch && original_path.is_some() {
+                let orig_path = original_path.unwrap();
+                let _ = Command::new("sudo")
+                    .args(["xcode-select", "--switch", &orig_path])
+                    .status();
+            }
+
             match output {
                 Ok(output) => {
                     if !output.status.success() {
                         eprintln!("actool stderr: {}", String::from_utf8_lossy(&output.stderr));
                         eprintln!("actool stdout: {}", String::from_utf8_lossy(&output.stdout));
-                        panic!("actool failed to compile icon");
+                        println!("cargo:warning=actool failed to compile icon - continuing without custom icon");
+                    } else {
+                        println!("Successfully compiled icon to {}", gen_dir);
                     }
-                    println!("Successfully compiled icon to {}", gen_dir);
                 }
                 Err(e) => {
                     eprintln!("Failed to execute xcrun actool: {}", e);
-                    eprintln!("Make sure you have Xcode Command Line Tools installed");
-                    panic!("Icon compilation failed");
+                    println!("cargo:warning=Icon compilation skipped - continuing without custom icon");
                 }
             }
         } else {
diff --git a/tauri/src-tauri/gen/Assets.car b/tauri/src-tauri/gen/Assets.car
deleted file mode 100644
index de0e9a0..0000000
Binary files a/tauri/src-tauri/gen/Assets.car and /dev/null differ
diff --git a/tauri/src-tauri/gen/voicebox.icns b/tauri/src-tauri/gen/voicebox.icns
index 59661d9..94bc0c9 100644
Binary files a/tauri/src-tauri/gen/voicebox.icns and b/tauri/src-tauri/gen/voicebox.icns differ
diff --git a/tauri/src-tauri/src/audio_output.rs b/tauri/src-tauri/src/audio_output.rs
index d2b2c63..dc47447 100644
--- a/tauri/src-tauri/src/audio_output.rs
+++ b/tauri/src-tauri/src/audio_output.rs
@@ -3,6 +3,174 @@ use cpal::{Device, Host, SampleFormat, StreamConfig};
 use std::sync::{Arc, Mutex};
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
 
+// ── CoreAudio direct enumeration (macOS only) ────────────────────────────────
+// cpal's output_devices() misses HDMI/DisplayPort and some other transports on
+// macOS because it only walks kAudioObjectPropertyScopeOutput devices that
+// happen to have streams.  Querying AudioObjectGetPropertyData directly with
+// kAudioHardwarePropertyDevices gives the full list that Audio MIDI Setup shows.
+#[cfg(target_os = "macos")]
+mod coreaudio_enum {
+    //! Device enumeration through the CoreAudio HAL property API.
+    //!
+    //! Every query goes through `AudioObjectGetPropertyData[Size]`; a non-zero
+    //! status is treated as "no data" rather than surfaced as an error.
+    use coreaudio_sys::{
+        kAudioHardwarePropertyDefaultInputDevice,
+        kAudioHardwarePropertyDefaultOutputDevice,
+        kAudioHardwarePropertyDevices,
+        kAudioObjectPropertyScopeGlobal,
+        kAudioObjectPropertyElementMaster,
+        kAudioObjectSystemObject,
+        kAudioDevicePropertyDeviceNameCFString,
+        kAudioDevicePropertyStreams,
+        kAudioObjectPropertyScopeOutput,
+        kAudioObjectPropertyScopeInput,
+        AudioDeviceID,
+        AudioObjectGetPropertyData,
+        AudioObjectGetPropertyDataSize,
+        AudioObjectPropertyAddress,
+    };
+    use core_foundation_sys::base::{CFRelease, CFTypeRef};
+    use core_foundation_sys::string::{
+        CFStringGetCString, CFStringGetCStringPtr, CFStringRef, kCFStringEncodingUTF8,
+    };
+    use std::ffi::CStr;
+    use std::mem;
+    use std::os::raw::c_char;
+
+    /// Returns the IDs of every audio device known to the system object, or an
+    /// empty vector if either CoreAudio call fails.
+    fn get_all_device_ids() -> Vec<AudioDeviceID> {
+        unsafe {
+            let addr = AudioObjectPropertyAddress {
+                mSelector: kAudioHardwarePropertyDevices,
+                mScope: kAudioObjectPropertyScopeGlobal,
+                mElement: kAudioObjectPropertyElementMaster,
+            };
+            // First ask how many bytes the device list occupies...
+            let mut size: u32 = 0;
+            if AudioObjectGetPropertyDataSize(
+                kAudioObjectSystemObject,
+                &addr,
+                0,
+                std::ptr::null(),
+                &mut size,
+            ) != 0 { return vec![]; }
+
+            // ...then fetch the list into a buffer of exactly that size.
+            let count = size as usize / mem::size_of::<AudioDeviceID>();
+            let mut ids: Vec<AudioDeviceID> = vec![0u32; count];
+            if AudioObjectGetPropertyData(
+                kAudioObjectSystemObject,
+                &addr,
+                0,
+                std::ptr::null(),
+                &mut size,
+                ids.as_mut_ptr() as *mut _,
+            ) != 0 { return vec![]; }
+            // `size` now holds the bytes actually written.  If a device went
+            // away between the two calls, drop the unfilled (zero) tail.
+            ids.truncate(size as usize / mem::size_of::<AudioDeviceID>());
+            ids
+        }
+    }
+
+    /// Returns the device's name, or `None` if the property cannot be read or
+    /// converted to UTF-8.
+    fn device_name(id: AudioDeviceID) -> Option<String> {
+        unsafe {
+            let addr = AudioObjectPropertyAddress {
+                mSelector: kAudioDevicePropertyDeviceNameCFString,
+                mScope: kAudioObjectPropertyScopeGlobal,
+                mElement: kAudioObjectPropertyElementMaster,
+            };
+            let mut cf_str: CFStringRef = std::ptr::null();
+            let mut size = mem::size_of::<CFStringRef>() as u32;
+            if AudioObjectGetPropertyData(
+                id,
+                &addr,
+                0,
+                std::ptr::null(),
+                &mut size,
+                &mut cf_str as *mut _ as *mut _,
+            ) != 0 { return None; }
+            if cf_str.is_null() { return None; }
+
+            // CFStringGetCStringPtr may return NULL even for valid strings (e.g. when
+            // the internal storage uses a non-UTF-8 encoding).  Fall back to
+            // CFStringGetCString with a stack buffer in that case.
+            let ptr = CFStringGetCStringPtr(cf_str, kCFStringEncodingUTF8);
+            let name = if !ptr.is_null() {
+                Some(CStr::from_ptr(ptr).to_string_lossy().into_owned())
+            } else {
+                let mut buf = [0 as c_char; 512];
+                if CFStringGetCString(cf_str, buf.as_mut_ptr(), buf.len() as _, kCFStringEncodingUTF8) != 0 {
+                    Some(CStr::from_ptr(buf.as_ptr()).to_string_lossy().into_owned())
+                } else {
+                    None
+                }
+            };
+
+            // The caller owns the CFString returned for this property.  The
+            // name has been copied into an owned String above, so release it
+            // here; otherwise every enumeration leaks one string per device.
+            CFRelease(cf_str as CFTypeRef);
+            name
+        }
+    }
+
+    /// Returns true if the device has at least one stream in `scope`
+    /// (kAudioObjectPropertyScopeOutput or kAudioObjectPropertyScopeInput).
+    fn has_streams(id: AudioDeviceID, scope: u32) -> bool {
+        unsafe {
+            let addr = AudioObjectPropertyAddress {
+                mSelector: kAudioDevicePropertyStreams,
+                mScope: scope,
+                mElement: kAudioObjectPropertyElementMaster,
+            };
+            // Only the size is needed: a non-empty stream list means "has streams".
+            let mut size: u32 = 0;
+            AudioObjectGetPropertyDataSize(
+                id,
+                &addr,
+                0,
+                std::ptr::null(),
+                &mut size,
+            ) == 0 && size > 0
+        }
+    }
+
+    /// Reads a system-wide default-device property (`selector` is one of the
+    /// `kAudioHardwarePropertyDefault*Device` selectors).
+    fn default_device_id(selector: u32) -> Option<AudioDeviceID> {
+        unsafe {
+            let addr = AudioObjectPropertyAddress {
+                mSelector: selector,
+                mScope: kAudioObjectPropertyScopeGlobal,
+                mElement: kAudioObjectPropertyElementMaster,
+            };
+            let mut id: AudioDeviceID = 0;
+            let mut size = mem::size_of::<AudioDeviceID>() as u32;
+            if AudioObjectGetPropertyData(
+                kAudioObjectSystemObject,
+                &addr,
+                0,
+                std::ptr::null(),
+                &mut size,
+                &mut id as *mut _ as *mut _,
+            ) != 0 { return None; }
+            Some(id)
+        }
+    }
+
+    /// One enumerated device.
+    pub struct CoreAudioDevice {
+        /// Name-derived identifier: `<prefix>_` + lower-cased name with spaces
+        /// replaced by underscores.
+        pub id: String,
+        /// Device name as reported by CoreAudio.
+        pub name: String,
+        /// Whether this is the system default device for its direction.
+        pub is_default: bool,
+    }
+
+    /// Shared implementation of the two public listing functions: keeps the
+    /// devices that have streams in `scope`, skips those whose name cannot be
+    /// read, and marks the one matching the `default_selector` property.
+    fn list_devices(scope: u32, default_selector: u32, id_prefix: &str) -> Vec<CoreAudioDevice> {
+        let default_id = default_device_id(default_selector);
+        get_all_device_ids()
+            .into_iter()
+            .filter(|&id| has_streams(id, scope))
+            .filter_map(|id| {
+                let name = device_name(id)?;
+                let dev_id = format!("{}_{}", id_prefix, name.replace(' ', "_").to_lowercase());
+                let is_default = default_id == Some(id);
+                Some(CoreAudioDevice { id: dev_id, name, is_default })
+            })
+            .collect()
+    }
+
+    /// Lists devices with output streams; IDs are prefixed with `device_`.
+    pub fn list_output_devices() -> Vec<CoreAudioDevice> {
+        list_devices(
+            kAudioObjectPropertyScopeOutput,
+            kAudioHardwarePropertyDefaultOutputDevice,
+            "device",
+        )
+    }
+
+    /// Lists devices with input streams; IDs are prefixed with `input_`.
+    pub fn list_input_devices() -> Vec<CoreAudioDevice> {
+        list_devices(
+            kAudioObjectPropertyScopeInput,
+            kAudioHardwarePropertyDefaultInputDevice,
+            "input",
+        )
+    }
+}
+
 #[derive(Debug, Clone, serde::Serialize)]
 pub struct AudioOutputDevice {
     pub id: String,
@@ -10,19 +178,30 @@ pub struct AudioOutputDevice {
     pub is_default: bool,
 }
 
+/// An audio input device as returned by `AudioOutputState::list_input_devices`.
+#[derive(Debug, Clone, serde::Serialize)]
+pub struct AudioInputDevice {
+    /// Name-derived identifier: `input_` + lower-cased name with spaces replaced by `_`.
+    pub id: String,
+    /// Device name as reported by the audio backend.
+    pub name: String,
+    /// Whether this device is the current default input device.
+    pub is_default: bool,
+}
+
 pub struct AudioOutputState {
-    host: Host,
     stop_flag: Arc<AtomicBool>,
 }
 
 impl AudioOutputState {
     pub fn new() -> Self {
         Self {
-            host: cpal::default_host(),
             stop_flag: Arc::new(AtomicBool::new(false)),
         }
     }
 
+    /// Returns a newly obtained default cpal host (nothing is cached on `self`).
+    fn host() -> Host {
+        // Create a fresh host each time so newly connected devices are visible.
+        // NOTE(review): this assumes cpal's default_host() re-queries CoreAudio
+        // on each call on macOS — confirm against the cpal version in use.
+        cpal::default_host()
+    }
+
     pub fn stop_all_playback(&self) -> Result<(), String> {
         eprintln!("stop_all_playback: Setting stop flag");
         self.stop_flag.store(true, Ordering::Relaxed);
@@ -31,35 +210,73 @@ impl AudioOutputState {
     }
 
     pub fn list_output_devices(&self) -> Result<Vec<AudioOutputDevice>, String> {
-        let devices = self
-            .host
-            .output_devices()
-            .map_err(|e| format!("Failed to enumerate output devices: {}", e))?;
-
-        let default_device = self.host.default_output_device();
-
-        let mut result = Vec::new();
-        for device in devices {
-            let name = device
-                .name()
-                .map_err(|e| format!("Failed to get device name: {}", e))?;
-
-            // Generate a stable ID from the device name (cpal doesn't provide stable IDs)
-            let id = format!("device_{}", name.replace(' ', "_").to_lowercase());
+        // On macOS use CoreAudio directly — cpal misses HDMI/DisplayPort devices.
+        #[cfg(target_os = "macos")]
+        {
+            let devices = coreaudio_enum::list_output_devices();
+            return Ok(devices
+                .into_iter()
+                .map(|d| AudioOutputDevice { id: d.id, name: d.name, is_default: d.is_default })
+                .collect());
+        }
 
-            let is_default = default_device
-                .as_ref()
-                .map(|d| d.name().unwrap_or_default() == name)
-                .unwrap_or(false);
+        // Fallback: cpal (Windows / Linux)
+        #[cfg(not(target_os = "macos"))]
+        {
+            let host = Self::host();
+            let devices = host
+                .output_devices()
+                .map_err(|e| format!("Failed to enumerate output devices: {}", e))?;
+            let default_device = host.default_output_device();
+            let mut result = Vec::new();
+            for device in devices {
+                let name = device
+                    .name()
+                    .map_err(|e| format!("Failed to get device name: {}", e))?;
+                let id = format!("device_{}", name.replace(' ', "_").to_lowercase());
+                let is_default = default_device
+                    .as_ref()
+                    .map(|d| d.name().unwrap_or_default() == name)
+                    .unwrap_or(false);
+                result.push(AudioOutputDevice { id, name, is_default });
+            }
+            return Ok(result);
+        }
+    }
 
-            result.push(AudioOutputDevice {
-                id,
-                name,
-                is_default,
-            });
+    pub fn list_input_devices(&self) -> Result<Vec<AudioInputDevice>, String> {
+        // On macOS use CoreAudio directly for full device list.
+        #[cfg(target_os = "macos")]
+        {
+            let devices = coreaudio_enum::list_input_devices();
+            return Ok(devices
+                .into_iter()
+                .map(|d| AudioInputDevice { id: d.id, name: d.name, is_default: d.is_default })
+                .collect());
         }
 
-        Ok(result)
+        // Fallback: cpal (Windows / Linux)
+        #[cfg(not(target_os = "macos"))]
+        {
+            let host = Self::host();
+            let devices = host
+                .input_devices()
+                .map_err(|e| format!("Failed to enumerate input devices: {}", e))?;
+            let default_device = host.default_input_device();
+            let mut result = Vec::new();
+            for device in devices {
+                let name = device
+                    .name()
+                    .map_err(|e| format!("Failed to get device name: {}", e))?;
+                let id = format!("input_{}", name.replace(' ', "_").to_lowercase());
+                let is_default = default_device
+                    .as_ref()
+                    .map(|d| d.name().unwrap_or_default() == name)
+                    .unwrap_or(false);
+                result.push(AudioInputDevice { id, name, is_default });
+            }
+            return Ok(result);
+        }
     }
 
     pub async fn play_audio_to_devices(
@@ -75,10 +292,15 @@ impl AudioOutputState {
         let (samples, sample_rate, channels) = self.decode_wav(&audio_data)?;
         eprintln!("Audio decoded: {} samples, {}Hz, {} channels", samples.len(), sample_rate, channels);
 
-        // Find devices by ID
+        // Find devices by ID.
+        // On macOS, list_output_devices() uses CoreAudio which may include devices
+        // (e.g. HDMI/DisplayPort) that cpal cannot open.  We match by the same
+        // name-derived ID scheme used during enumeration so that any device cpal
+        // *can* see is routed correctly.  CoreAudio-only devices will simply not
+        // match and will produce a clear error rather than silent failure.
         eprintln!("Enumerating output devices...");
-        let devices: Vec<Device> = self
-            .host
+        let host = Self::host();
+        let devices: Vec<Device> = host
             .output_devices()
             .map_err(|e| format!("Failed to enumerate devices: {}", e))?
             .filter_map(|device| {
diff --git a/tauri/src-tauri/src/main.rs b/tauri/src-tauri/src/main.rs
index 255655a..06940f8 100644
--- a/tauri/src-tauri/src/main.rs
+++ b/tauri/src-tauri/src/main.rs
@@ -581,6 +581,13 @@ fn list_audio_output_devices(
     state.list_output_devices()
 }
 
+/// Command handler that lists audio input devices by delegating to
+/// `AudioOutputState::list_input_devices`; errors are returned as strings.
+#[command]
+fn list_audio_input_devices(
+    state: State<'_, audio_output::AudioOutputState>,
+) -> Result<Vec<audio_output::AudioInputDevice>, String> {
+    state.list_input_devices()
+}
+
 #[command]
 async fn play_audio_to_devices(
     state: State<'_, audio_output::AudioOutputState>,
@@ -645,6 +652,7 @@ pub fn run() {
             stop_system_audio_capture,
             is_system_audio_supported,
             list_audio_output_devices,
+            list_audio_input_devices,
             play_audio_to_devices,
             stop_audio_playback
         ])
diff --git a/tauri/src/platform/audio.ts b/tauri/src/platform/audio.ts
index bdcf0b2..5e90e2f 100644
--- a/tauri/src/platform/audio.ts
+++ b/tauri/src/platform/audio.ts
@@ -30,6 +30,10 @@ export const tauriAudio: PlatformAudio = {
     return await invoke<AudioDevice[]>('list_audio_output_devices');
   },
 
+  async listInputDevices(): Promise<AudioDevice[]> {
+    return await invoke<AudioDevice[]>('list_audio_input_devices');
+  },
+
   async playToDevices(audioData: Uint8Array, deviceIds: string[]): Promise<void> {
     await invoke('play_audio_to_devices', {
       audioData: Array.from(audioData),
diff --git a/tauri/src/platform/lifecycle.ts b/tauri/src/platform/lifecycle.ts
index 562c75a..d8908de 100644
--- a/tauri/src/platform/lifecycle.ts
+++ b/tauri/src/platform/lifecycle.ts
@@ -44,7 +44,6 @@ class TauriLifecycle implements PlatformLifecycle {
         const keepRunning = useServerStore.getState().keepServerRunningOnClose;
 
         // Check if server was started by this app instance
-        // @ts-expect-error - accessing module-level variable from another module
         const serverStartedByApp = window.__voiceboxServerStartedByApp ?? false;
 
         if (!keepRunning && serverStartedByApp) {
diff --git a/tauri/src/platform/metadata.ts b/tauri/src/platform/metadata.ts
index cb12ac9..11e8358 100644
--- a/tauri/src/platform/metadata.ts
+++ b/tauri/src/platform/metadata.ts
@@ -10,5 +10,17 @@ export const tauriMetadata: PlatformMetadata = {
       return '0.1.0';
     }
   },
+  getBuildInfo(): string {
+    // __GIT_HASH__ / __GIT_COMMIT_COUNT__ are compile-time globals; both are read
+    // up front so a missing definition of either throws here and yields ''.
+    try {
+      const gitHash = __GIT_HASH__;
+      const commitCount = __GIT_COMMIT_COUNT__;
+      // Dev builds show only the hash; other builds append the commit count.
+      return import.meta.env.DEV ? `dev-${gitHash}` : `${gitHash} #${commitCount}`;
+    } catch {
+      return '';
+    }
+  },
   isTauri: true,
 };
diff --git a/tauri/src/platform/updater.ts b/tauri/src/platform/updater.ts
index 24a1b6b..5bc446f 100644
--- a/tauri/src/platform/updater.ts
+++ b/tauri/src/platform/updater.ts
@@ -20,7 +20,9 @@ class TauriUpdater implements PlatformUpdater {
   private subscribers: Set<(status: UpdateStatus) => void> = new Set();
 
   private notifySubscribers() {
-    this.subscribers.forEach((callback) => callback(this.status));
+    for (const callback of this.subscribers) {
+      callback(this.status);
+    }
   }
 
   subscribe(callback: (status: UpdateStatus) => void): () => void {
diff --git a/tauri/tsconfig.json b/tauri/tsconfig.json
index 1c14806..47351b9 100644
--- a/tauri/tsconfig.json
+++ b/tauri/tsconfig.json
@@ -21,6 +21,6 @@
       "@/*": ["../app/src/*"]
     }
   },
-  "include": ["src"],
+  "include": ["src", "../app/src/global.d.ts"],
   "references": [{ "path": "./tsconfig.node.json" }]
 }
diff --git a/tauri/vite.config.ts b/tauri/vite.config.ts
index 71caf5f..542f972 100644
--- a/tauri/vite.config.ts
+++ b/tauri/vite.config.ts
@@ -1,9 +1,30 @@
 import path from 'node:path';
+import { execSync } from 'node:child_process';
 import react from '@vitejs/plugin-react';
 import tailwindcss from '@tailwindcss/vite';
 import { defineConfig } from 'vite';
 
+function getGitHash(): string {
+  try { // short HEAD hash; git's stderr is discarded so a handled failure stays quiet
+    return execSync('git rev-parse --short HEAD', { encoding: 'utf8', stdio: ['ignore', 'pipe', 'ignore'] }).trim();
+  } catch { // git missing or not inside a checkout
+    return 'unknown';
+  }
+}
+
+function getGitCommitCount(): number {
+  try { // commit count of HEAD; stderr discarded, and unparsable output (NaN) maps to 0
+    return Number.parseInt(execSync('git rev-list --count HEAD', { encoding: 'utf8', stdio: ['ignore', 'pipe', 'ignore'] }).trim(), 10) || 0;
+  } catch { // git missing or not inside a checkout
+    return 0;
+  }
+}
+
 export default defineConfig({
+  define: {
+    __GIT_HASH__: JSON.stringify(getGitHash()),
+    __GIT_COMMIT_COUNT__: getGitCommitCount(),
+  },
   plugins: [react(), tailwindcss()],
   resolve: {
     alias: {
diff --git a/voicebox-cli b/voicebox-cli
new file mode 100755
index 0000000..53482a3
--- /dev/null
+++ b/voicebox-cli
@@ -0,0 +1,1122 @@
+#!/usr/bin/env python3
+"""
+voicebox — self-contained CLI for talking to a Voicebox backend server.
+
+No dependencies beyond the Python 3 standard library.
+
+Configuration (env vars):
+  VOICEBOX_URL   Full server URL, e.g. http://10.0.0.5:17493
+  VOICEBOX_HOST  Server host/IP (default: 127.0.0.1). Ignored if VOICEBOX_URL is set.
+  VOICEBOX_PORT  Server port (default: 17493). Ignored if VOICEBOX_URL is set.
+"""
+
+import argparse
+import json
+import mimetypes
+import os
+import shutil
+import signal
+import socket
+import subprocess
+import sys
+import tempfile
+import time
+import urllib.error
+import urllib.request
+from pathlib import Path
+
+DEFAULT_HOST = "127.0.0.1"
+DEFAULT_PORT = 17493
+SERVER_BIN = "/Applications/Voicebox.app/Contents/MacOS/voicebox-server"
+DEFAULT_DATA_DIR = Path.home() / "Library/Application Support/sh.voicebox.app"
+PID_FILE = Path.home() / ".voicebox.pid"
+LOG_FILE = Path.home() / ".voicebox.log"
+CONFIG_DIR = Path.home() / ".config" / "voicebox"
+CONFIG_FILE = CONFIG_DIR / "config.json"
+LEGACY_CONFIG_FILE = Path.home() / ".voicebox.json"
+TRANSCRIPT_TEXT_KEYS = ("text", "transcript", "transcription")
+TRANSCRIPT_MODEL_KEYS = ("model", "transcribe_model", "whisper_model", "model_name")
+
+
+def resolve_base_url(args):
+    """Build the server base URL: VOICEBOX_URL wins, then a non-default args.port, then env host/port."""
+    explicit_url = os.environ.get("VOICEBOX_URL")
+    if explicit_url:
+        return explicit_url.rstrip("/")
+    cli_port = getattr(args, "port", DEFAULT_PORT)
+    env_port = os.environ.get("VOICEBOX_PORT", str(DEFAULT_PORT))
+    chosen_port = env_port if cli_port == DEFAULT_PORT else str(cli_port)
+    return f"http://{os.environ.get('VOICEBOX_HOST', DEFAULT_HOST)}:{chosen_port}"
+
+
+# --- HTTP helpers (stdlib only, no requests) ---
+
+
+def parse_error_detail(raw_text, fallback_limit=200):
+    """Return the "detail" field of a JSON error body, else the first fallback_limit chars of raw_text."""
+    try:
+        return json.loads(raw_text).get("detail", raw_text[:fallback_limit])
+    except Exception:
+        return raw_text[:fallback_limit]
+
+
+def print_request_error(err):
+    """Report a request error on stderr; a status_code of None also prints the start-server hint."""
+    if err.status_code is not None:
+        print(f"Error: {err.status_code} — {err.detail}", file=sys.stderr)
+        return
+    print(f"Error: {err.detail}", file=sys.stderr)
+    print("Start it with: voicebox server", file=sys.stderr)
+
+
+def require_file(path_str, label="file"):
+    """Return path_str as a Path when it names an existing file; otherwise report it and exit(1)."""
+    candidate = Path(path_str)
+    if candidate.exists() and candidate.is_file():
+        return candidate
+    print(f"Error: {label} not found: {candidate}", file=sys.stderr)
+    sys.exit(1)
+
+
+def resolve_transcribe_model(override_model=None):
+    """Return override_model if truthy, else the saved "default_transcribe_model" config value (may be None)."""
+    return override_model or load_cli_config().get("default_transcribe_model")
+
+
+def extract_transcription_payload(result):
+    """Reduce a transcription response (bytes, dict, or anything else) to (text, used_model)."""
+    if isinstance(result, bytes):
+        return result.decode(errors="replace").strip(), None
+    if not isinstance(result, dict):
+        return str(result).strip(), None
+    # The model name is the first truthy value among the known model keys, else None.
+    model_name = next((result[k] for k in TRANSCRIPT_MODEL_KEYS if result.get(k)), None)
+    # Likewise the text is the first truthy value among the known text keys.
+    spoken = next((result[k] for k in TRANSCRIPT_TEXT_KEYS if result.get(k)), None)
+    if spoken is not None:
+        return str(spoken).strip(), model_name
+    # No recognized text field: fall back to a pretty-printed dump of the
+    # whole payload so the caller still has something to show.
+    return json.dumps(result, ensure_ascii=False, indent=2).strip(), model_name
+
+def api(method, base_url, path, json_body=None, file_data=None, timeout=30):
+    """Make an API call. Returns (status, parsed_json_or_bytes); on any failure prints to stderr and exits 1."""
+    url = f"{base_url}{path}"
+
+    if file_data:
+        # multipart upload: file_data is a (filename, bytes, content_type) tuple sent as field "file"
+        boundary = f"----voiceboxboundary{int(time.time())}"
+        filename, file_bytes, content_type = file_data
+        body = (
+            f"--{boundary}\r\n"
+            f'Content-Disposition: form-data; name="file"; filename="{filename}"\r\n'
+            f"Content-Type: {content_type}\r\n\r\n"
+        ).encode() + file_bytes + f"\r\n--{boundary}--\r\n".encode()
+        req = urllib.request.Request(url, data=body, method=method.upper())
+        req.add_header("Content-Type", f"multipart/form-data; boundary={boundary}")
+    elif json_body is not None:
+        body = json.dumps(json_body).encode()
+        req = urllib.request.Request(url, data=body, method=method.upper())
+        req.add_header("Content-Type", "application/json")
+    else:
+        req = urllib.request.Request(url, method=method.upper())
+
+    try:
+        resp = urllib.request.urlopen(req, timeout=timeout)
+        data = resp.read()
+        ct = resp.headers.get("Content-Type", "")
+        if "application/json" in ct:
+            return resp.status, json.loads(data)
+        return resp.status, data
+    except urllib.error.HTTPError as e:
+        body = e.read().decode(errors="replace")
+        detail = parse_error_detail(body)
+        print(f"Error: {e.code} — {detail}", file=sys.stderr)
+        sys.exit(1)
+    except urllib.error.URLError as e:
+        print(f"Error: cannot connect to server at {base_url}", file=sys.stderr)
+        if hasattr(e, "reason"):
+            print(f"  Reason: {e.reason}", file=sys.stderr)
+        print("Start it with: voicebox server", file=sys.stderr)
+        sys.exit(1)
+    except TimeoutError:
+        print(f"Error: request to {base_url}{path} timed out after {timeout}s", file=sys.stderr)
+        sys.exit(1)
+    except Exception as e:
+        # Catch-all for everything not handled above (e.g. RemoteDisconnected, ConnectionResetError).
+        print(f"Error: server closed connection unexpectedly ({type(e).__name__}: {e})", file=sys.stderr)
+        print("Check server logs for details.", file=sys.stderr)
+        sys.exit(1)
+
+
+def api_get_json(base_url, path, timeout=30):
+    """GET base_url + path through api() and return only the response body (the status is dropped)."""
+    return api("GET", base_url, path, timeout=timeout)[1]
+
+
+def load_cli_config():
+    """Load the CLI config dict from CONFIG_FILE (or LEGACY_CONFIG_FILE if absent); {} when missing or invalid."""
+    source = CONFIG_FILE if CONFIG_FILE.exists() else LEGACY_CONFIG_FILE
+    if not source.exists():
+        return {}
+    try:
+        loaded = json.loads(source.read_text())
+    except Exception:
+        return {}
+    return loaded if isinstance(loaded, dict) else {}
+
+
+def save_cli_config(config):
+    """Write config to CONFIG_FILE as indented, key-sorted JSON, creating CONFIG_DIR first if needed."""
+    CONFIG_DIR.mkdir(parents=True, exist_ok=True)
+    CONFIG_FILE.write_text(json.dumps(config, indent=2, sort_keys=True) + "\n")
+
+
+class ApiRequestError(Exception):
+    """API failure carrying status_code (HTTP status, or None as passed by the raiser) and detail."""
+
+    def __init__(self, status_code, detail):
+        self.status_code = status_code
+        self.detail = detail
+        super().__init__(detail)
+
+
+def api_multipart_or_error(base_url, path, fields, file_field_name, file_path, timeout=120):
+    """POST multipart/form-data; raise ApiRequestError on failure (status_code None = no HTTP response)."""
+    url = f"{base_url}{path}"
+    boundary = f"----voiceboxboundary{int(time.time())}"
+    file_name = file_path.name
+    content_type = mimetypes.guess_type(file_name)[0] or "application/octet-stream"
+
+    body = bytearray()
+    for key, value in fields.items():
+        if value is None:
+            continue
+        body.extend(
+            (
+                f"--{boundary}\r\n"
+                f'Content-Disposition: form-data; name="{key}"\r\n\r\n'
+                f"{value}\r\n"
+            ).encode()
+        )
+
+    body.extend(
+        (
+            f"--{boundary}\r\n"
+            f'Content-Disposition: form-data; name="{file_field_name}"; filename="{file_name}"\r\n'
+            f"Content-Type: {content_type}\r\n\r\n"
+        ).encode()
+    )
+    # File payload followed by the closing boundary.
+    body.extend(file_path.read_bytes() + f"\r\n--{boundary}--\r\n".encode())
+
+    req = urllib.request.Request(url, data=bytes(body), method="POST")
+    req.add_header("Content-Type", f"multipart/form-data; boundary={boundary}")
+    try:
+        with urllib.request.urlopen(req, timeout=timeout) as resp:
+            data = resp.read()
+            ct = resp.headers.get("Content-Type", "")
+            if "application/json" in ct:
+                return resp.status, json.loads(data)
+            return resp.status, data
+    except urllib.error.HTTPError as e:  # must precede URLError/OSError: HTTPError subclasses both
+        raise ApiRequestError(e.code, parse_error_detail(e.read().decode(errors="replace"))) from e
+    except urllib.error.URLError as e:
+        raise ApiRequestError(None, f"cannot connect to server at {base_url}") from e
+    except OSError as e:  # read timeouts / dropped connections are not wrapped in URLError
+        raise ApiRequestError(None, f"request to {url} failed ({type(e).__name__}: {e})") from e
+
+
+def normalize_audio_sample(input_path, gain_db=0.0):
+    """Create a normalized WAV copy of input_path. Returns temp Path; raises RuntimeError if all strategies fail.
+
+    Uses pyloudnorm (EBU R128 LUFS) if available, falls back to ffmpeg,
+    then falls back to a simple stdlib peak-normalization.
+    """
+    fd, temp_name = tempfile.mkstemp(prefix="voicebox-normalized-", suffix=".wav")
+    os.close(fd)
+    out_path = Path(temp_name)
+
+    # --- Try pyloudnorm (pure Python, best quality) ---
+    try:
+        import numpy as np
+        import soundfile as _sf
+        import pyloudnorm as pyln
+
+        audio, sr = _sf.read(str(input_path))
+        # Convert to mono if stereo
+        if audio.ndim > 1:
+            audio = audio.mean(axis=1)
+        audio = audio.astype(np.float32)
+
+        # Resample to 24kHz if needed
+        if sr != 24000:
+            try:
+                import librosa
+                audio = librosa.resample(audio, orig_sr=sr, target_sr=24000)
+                sr = 24000
+            except ImportError:
+                pass  # Keep original sample rate
+
+        # LUFS normalization (matching ffmpeg loudnorm=I=-16:TP=-2)
+        meter = pyln.Meter(sr)
+        current_lufs = meter.integrated_loudness(audio)
+        if np.isfinite(current_lufs) and current_lufs > -70:
+            target_lufs = -16.0 + gain_db
+            audio = pyln.normalize.loudness(audio, current_lufs, target_lufs)
+
+        # True-peak limiting at -2 dBTP
+        peak_limit = 10 ** (-2.0 / 20)
+        audio = np.clip(audio, -peak_limit, peak_limit)
+
+        _sf.write(str(out_path), audio, sr)
+        return out_path
+    except Exception:
+        pass  # libraries missing OR they could not process this input: try the next strategy
+
+    # --- Fallback: ffmpeg ---
+    if shutil.which("ffmpeg"):
+        filter_chain = "loudnorm=I=-16:TP=-2:LRA=11,alimiter=limit=-2dB"
+        if gain_db != 0:
+            sign = "+" if gain_db > 0 else ""
+            filter_chain = f"{filter_chain},volume={sign}{gain_db}dB"
+
+        cmd = [
+            "ffmpeg", "-hide_banner", "-loglevel", "error", "-y",
+            "-i", str(input_path), "-af", filter_chain,
+            "-ac", "1", "-ar", "24000", str(out_path),
+        ]
+        proc = subprocess.run(cmd, capture_output=True, text=True)
+        if proc.returncode == 0:
+            return out_path
+        out_path.unlink(missing_ok=True)
+
+    # --- Last resort: simple WAV peak normalization (stdlib only, 16-bit input only) ---
+    try:
+        import wave
+        import struct
+
+        with wave.open(str(input_path), "rb") as wf:
+            params = wf.getparams()
+            frames = wf.readframes(params.nframes)
+
+        if params.sampwidth == 2:
+            fmt = f"<{params.nframes * params.nchannels}h"
+            samples = list(struct.unpack(fmt, frames))
+            peak = max(abs(s) for s in samples) or 1
+            target = int(32767 * 0.79)  # ~-2 dBTP
+            gain = target / peak
+            samples = [max(-32768, min(32767, int(s * gain))) for s in samples]
+
+            with wave.open(str(out_path), "wb") as wf_out:
+                wf_out.setparams(params)
+                wf_out.writeframes(struct.pack(fmt, *samples))
+            return out_path
+    except Exception:
+        pass
+
+    out_path.unlink(missing_ok=True)
+    raise RuntimeError("Cannot normalize audio: install pyloudnorm or ffmpeg")
+
+
+def print_model_used(label, used_model):
+    """Print "<label>: <model>" to stderr, substituting a placeholder when used_model is falsy."""
+    # Same truthiness test as an if/else on used_model.
+    shown = used_model if used_model else "(not reported by server)"
+    print(f"{label}: {shown}", file=sys.stderr)
+
+
+def upload_profile_sample_with_retry(base_url, profile_id, sample_path, reference_text, timeout):
+    """Upload the original sample; on a clipping rejection, normalize locally and retry once.
+
+    The backend now normalizes on its own, so this is belt-and-suspenders.
+    If the initial upload fails with a 400 "clipping" error (older servers),
+    retries once with a locally normalized copy, which is deleted afterwards.
+    """
+    normalized_path = None
+    try:
+        # First attempt: upload original
+        try:
+            _, sample = api_multipart_or_error(
+                base_url,
+                f"/profiles/{profile_id}/samples",
+                fields={"reference_text": reference_text},
+                file_field_name="file",
+                file_path=sample_path,
+                timeout=timeout,
+            )
+            return sample
+        except ApiRequestError as e:
+            # Keep our own reference: Python unbinds `e` when this except block ends.
+            clipping_error = e
+            if e.status_code != 400 or "clipping" not in str(e.detail).lower():
+                raise
+
+        # Retry with normalized audio
+        print("Server reported clipping; normalizing and retrying...", file=sys.stderr)
+        try:
+            normalized_path = normalize_audio_sample(sample_path)
+        except RuntimeError as norm_err:
+            print(f"Warning: {norm_err}", file=sys.stderr)
+            raise clipping_error  # Re-raise original clipping error
+
+        _, sample = api_multipart_or_error(
+            base_url,
+            f"/profiles/{profile_id}/samples",
+            fields={"reference_text": reference_text},
+            file_field_name="file",
+            file_path=normalized_path,
+            timeout=timeout,
+        )
+        print("Retry succeeded with normalized audio.", file=sys.stderr)
+        return sample
+    finally:
+        if normalized_path:
+            normalized_path.unlink(missing_ok=True)
+
+
+# --- Subcommands ---
+
+def _file_starts_with_python_shebang(path):
+    """True when the file's first line (at most 120 bytes) starts with "#!" and contains "python"."""
+    try:
+        with open(path, "rb") as handle:
+            head = handle.readline(120)
+    except Exception:
+        return False
+    return head[:2] == b"#!" and b"python" in head
+
+
+def _find_venv_python(project_dir):
+    """Find the venv Python interpreter for a project directory.
+
+    Checks common venv locations (backend/venv, .venv, venv) and returns the first
+    Python path that exists and runs `import sys` successfully, else sys.executable.
+    """
+    candidates = [
+        Path(project_dir) / "backend" / "venv" / "bin" / "python",
+        Path(project_dir) / ".venv" / "bin" / "python",
+        Path(project_dir) / "venv" / "bin" / "python",
+    ]
+    for candidate in candidates:
+        if candidate.exists():
+            try:
+                subprocess.run(
+                    [str(candidate), "-c", "import sys"],
+                    capture_output=True, timeout=5, check=True,  # non-zero exit => not working
+                )
+                return str(candidate)
+            except Exception:
+                continue
+    return sys.executable
+
+
+def _check_port_in_use(port):
+    """Check if something is already listening on the given port (on 127.0.0.1).
+
+    Returns:
+        "voicebox"  — a voicebox server is responding on that port
+        "occupied"  — something else is listening on that port
+        None        — port is free
+    """
+    port = int(port)
+    # Quick socket probe. The context manager closes the socket on both the
+    # success and the failure path.
+    try:
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
+            sock.settimeout(1)
+            sock.connect(("127.0.0.1", port))
+    except OSError:  # ConnectionRefusedError and connect timeouts are OSError subclasses
+        return None  # Port is free
+
+    # Port is occupied — a 200 from /health is taken to mean a voicebox server
+    try:
+        with urllib.request.urlopen(f"http://127.0.0.1:{port}/health", timeout=2) as r:
+            if r.status == 200:
+                return "voicebox"
+    except Exception:
+        pass
+    return "occupied"
+
+
+def cmd_server(args):
+    """Start the backend server (no frontend): foreground by default, background with args.detach."""
+    data_dir = args.data_dir or str(DEFAULT_DATA_DIR)
+    port = str(args.port)
+
+    if args.stop:
+        _stop_server()
+        return
+
+    # Handle --server-path: save to config if provided
+    config = load_cli_config()
+    if args.server_path:
+        resolved = str(Path(args.server_path).resolve())
+        config["server_path"] = resolved
+        save_cli_config(config)
+        print(f"Saved server path: {resolved}")
+
+    # Check if something is already running on the target port
+    port_status = _check_port_in_use(port)
+    if port_status == "voicebox":
+        print(f"Voicebox server already running on port {port}.")
+        return
+    elif port_status == "occupied":
+        print(f"Error: port {port} is already in use by another process.", file=sys.stderr)
+        print(f"Either stop the process or use a different port: voicebox server --port <port>", file=sys.stderr)
+        sys.exit(1)
+
+    # Resolve server binary (in priority order):
+    #   1. --server-path flag or saved config
+    #   2. ./backend/main.py in cwd (local dev checkout)
+    #   3. /Applications/Voicebox.app bundle
+    #   4. voicebox-server on PATH
+    #   5. python -m backend.main from CLI's own directory
+    custom_server = args.server_path or config.get("server_path")
+    local_backend = Path.cwd() / "backend" / "main.py"
+
+    if custom_server:
+        custom_path = Path(custom_server).resolve()
+        if not custom_path.exists():
+            print(f"Error: server path not found: {custom_path}", file=sys.stderr)
+            print("Clear it with: voicebox config --clear-server-path", file=sys.stderr)
+            sys.exit(1)
+        if custom_path.is_dir():
+            # Directory: run `python -m backend.main` from that directory
+            # Use the project's venv Python if available
+            python = _find_venv_python(str(custom_path))
+            bin_path = None
+            cwd = str(custom_path)
+            cmd = [python, "-m", "backend.main", "--data-dir", data_dir, "--port", port]
+        elif custom_path.suffix == ".py" or _file_starts_with_python_shebang(custom_path):
+            # Python script: run with interpreter from the project's venv
+            python = _find_venv_python(str(custom_path.parent))
+            bin_path = str(custom_path)
+            cwd = str(custom_path.parent)
+            cmd = [python, bin_path, "--data-dir", data_dir, "--port", port]
+        else:
+            # Binary executable
+            bin_path = str(custom_path)
+            cwd = None
+            cmd = [bin_path, "--data-dir", data_dir, "--port", port]
+    elif local_backend.exists():
+        # Running from a project checkout — use local backend with venv Python
+        python = _find_venv_python(str(Path.cwd()))
+        bin_path = None
+        cwd = str(Path.cwd())
+        cmd = [python, "-m", "backend.main", "--data-dir", data_dir, "--port", port]
+    elif Path(SERVER_BIN).exists():
+        bin_path = SERVER_BIN
+        cwd = None
+        cmd = [bin_path, "--data-dir", data_dir, "--port", port]
+    elif shutil.which("voicebox-server"):
+        bin_path = "voicebox-server"
+        cwd = None
+        cmd = [bin_path, "--data-dir", data_dir, "--port", port]
+    else:
+        # Fallback: run from CLI's own directory with its venv
+        cli_dir = str(Path(__file__).resolve().parent)
+        python = _find_venv_python(cli_dir)
+        bin_path = None
+        cwd = cli_dir
+        cmd = [python, "-m", "backend.main", "--data-dir", data_dir, "--port", port]
+
+    if args.detach:
+        if PID_FILE.exists():
+            try:
+                pid = int(PID_FILE.read_text().strip())
+                os.kill(pid, 0)
+                print(f"Server already running (pid {pid})")
+                return
+            except (ProcessLookupError, ValueError):  # dead process or corrupt pid file: stale
+                PID_FILE.unlink(missing_ok=True)
+
+        with open(LOG_FILE, "w") as log:  # child inherits the fd; the parent's copy closes here
+            proc = subprocess.Popen(
+                cmd,
+                stdout=log,
+                stderr=log,
+                start_new_session=True,
+                cwd=cwd,
+            )
+        PID_FILE.write_text(str(proc.pid))
+        print(f"Waiting for server (pid {proc.pid}, port {port})...", end="", flush=True)
+        health_url = f"http://127.0.0.1:{port}/health"
+        for _ in range(60):
+            time.sleep(0.5)
+            if proc.poll() is not None:
+                print(" failed.")
+                print(f"Server exited. Check log: {LOG_FILE}", file=sys.stderr)
+                PID_FILE.unlink(missing_ok=True)
+                sys.exit(1)
+            try:
+                with urllib.request.urlopen(health_url, timeout=2) as r:
+                    if r.status == 200:
+                        print(" ready.")
+                        print("Stop with: voicebox server --stop")
+                        return
+            except Exception:
+                print(".", end="", flush=True)
+        print(" timed out.")
+        print(f"Server didn't respond within 30s. Check log: {LOG_FILE}", file=sys.stderr)
+    else:
+        print(f"Starting voicebox server on port {port}...")
+        if custom_server:
+            print(f"Server: {custom_server}")
+        elif local_backend.exists() and cwd == str(Path.cwd()):
+            print(f"Server: local backend ({cwd})")
+        # Show Python path if using a venv (helps debug wrong-python issues)
+        if cmd[0] != sys.executable:
+            print(f"Python: {cmd[0]}")
+        print(f"Data dir: {data_dir}")
+        print("Press Ctrl+C to stop.\n")
+        try:
+            subprocess.run(cmd, cwd=cwd)
+        except KeyboardInterrupt:
+            print("\nServer stopped.")
+
+
+def _stop_server():
+    """Stop a detached server via SIGTERM to the pid in PID_FILE; an unparsable pid counts as stale."""
+    if not PID_FILE.exists():
+        print("No server running (no pid file).")
+        return
+    raw_pid = PID_FILE.read_text().strip()
+    try:
+        os.kill(int(raw_pid), signal.SIGTERM)
+        print(f"Stopped server (pid {raw_pid})")
+    except (ProcessLookupError, ValueError):
+        print(f"Server not running (stale pid {raw_pid})")
+    PID_FILE.unlink(missing_ok=True)
+
+
+def cmd_voices(args):
+    """List voice profiles, or with args.delete remove one matched by ID, then exact name, then name substring."""
+    if args.delete:
+        profiles = api_get_json(args.url, "/profiles")
+        if not profiles:
+            print("No voice profiles found.")
+            return
+
+        target = args.delete.strip().lower()
+        match = [p for p in profiles if p["id"].lower() == target]
+        if not match:
+            exact_name = [p for p in profiles if p["name"].lower() == target]
+            match = exact_name or [p for p in profiles if target in p["name"].lower()]
+
+        if not match:
+            print(f"Error: no voice matching '{args.delete}'.", file=sys.stderr)
+            sys.exit(1)
+        if len(match) > 1:
+            print(f"Error: '{args.delete}' matches multiple voices. Use exact ID.", file=sys.stderr)
+            for p in match:
+                print(f"  - {p['name']} ({p['id']})", file=sys.stderr)
+            sys.exit(1)
+
+        victim = match[0]
+        if not args.yes:
+            confirm = input(f"Delete voice '{victim['name']}' ({victim['id']})? [y/N]: ").strip().lower()
+            if confirm not in ("y", "yes"):
+                print("Cancelled.")
+                return
+
+        api("DELETE", args.url, f"/profiles/{victim['id']}", timeout=30)
+        print(f"Deleted voice: {victim['name']} ({victim['id']})")
+        return
+
+    profiles = api_get_json(args.url, "/profiles")
+    if not profiles:
+        print("No voice profiles found. Import one with: voicebox import <file.zip>")
+        return
+    print(f"{'Name':<30} {'Language':<10} {'ID'}")
+    print("-" * 75)
+    for p in profiles:
+        print(f"{p['name']:<30} {p['language']:<10} {p['id']}")
+
+
+def cmd_import(args):
+    """Upload the ZIP at args.file to /profiles/import and print the new profile's name and id."""
+    # require_file exits with an error message if the path is not an existing file.
+    archive = require_file(args.file)
+
+    print(f"Importing {archive.name}...")
+    # api() takes the upload as a (filename, bytes, content_type) tuple.
+    upload = (archive.name, archive.read_bytes(), "application/zip")
+    _, created = api("POST", args.url, "/profiles/import", file_data=upload, timeout=60)
+    print(f"Imported: {created['name']} ({created['id']})")
+
+
+def _sse_stream(base_url, path, timeout=300):
+    """Read a Server-Sent Events stream (http or https), yielding each JSON "data: " payload; exits on error."""
+    import http.client, urllib.parse
+    parsed = urllib.parse.urlparse(f"{base_url}{path}")
+    conn_cls = http.client.HTTPSConnection if parsed.scheme == "https" else http.client.HTTPConnection
+    conn = conn_cls(parsed.hostname, parsed.port, timeout=timeout)  # port None -> scheme default
+    try:
+        conn.request("GET", parsed.path + (f"?{parsed.query}" if parsed.query else ""))
+        resp = conn.getresponse()
+        if resp.status != 200:
+            body = resp.read().decode(errors="replace")
+            print(f"Error: SSE stream returned {resp.status} — {parse_error_detail(body)}", file=sys.stderr)
+            sys.exit(1)
+        buf = b""
+        while True:
+            chunk = resp.read(1024)
+            if not chunk:
+                break
+            buf += chunk
+            while b"\n\n" in buf:  # a blank line terminates one event
+                msg, buf = buf.split(b"\n\n", 1)
+                for line in msg.decode(errors="replace").splitlines():
+                    if line.startswith("data: "):
+                        try:
+                            yield json.loads(line[6:])
+                        except json.JSONDecodeError:
+                            pass  # non-JSON data lines are skipped
+    except Exception as e:
+        print(f"Error: SSE connection failed ({type(e).__name__}: {e})\nCheck server logs for details.", file=sys.stderr)
+        sys.exit(1)
+    finally:
+        conn.close()
+
+
+def cmd_generate(args):
+    """Generate speech from text."""
+    if args.text:
+        text = args.text
+    elif args.file:
+        text = Path(args.file).read_text().strip()
+    elif not sys.stdin.isatty():
+        text = sys.stdin.read().strip()
+    else:
+        # No text provided — show usage
+        # We can't easily get the subparser reference here, so print usage manually
+        print("Generate speech from text.\n", file=sys.stderr)
+        print("Usage: voicebox say --text 'Hello world'", file=sys.stderr)
+        print("       voicebox say --file input.txt", file=sys.stderr)
+        print("       echo 'Hello world' | voicebox say\n", file=sys.stderr)
+        print("Options:", file=sys.stderr)
+        print("  --text, -t TEXT       Text to speak", file=sys.stderr)
+        print("  --file, -f FILE       Read text from a file", file=sys.stderr)
+        print("  --voice, -v VOICE     Voice name (interactive picker if omitted)", file=sys.stderr)
+        print("  --output, -o OUTPUT   Output path (default: output_<epoch>.m4a)", file=sys.stderr)
+        print("  --instruct INSTRUCT   Style instruction (e.g. 'speak slowly')", file=sys.stderr)
+        print("  --language, -l LANG   Language code", file=sys.stderr)
+        print("  --seed, -s SEED       Random seed", file=sys.stderr)
+        print("  --no-open             Don't open file after generating", file=sys.stderr)
+        print("  -h, --help            Show this help message", file=sys.stderr)
+        sys.exit(1)
+
+    if not text:
+        print("Error: text is empty.", file=sys.stderr)
+        sys.exit(1)
+
+    profile = resolve_profile(args.url, args.voice)
+
+    payload = {
+        "profile_id": profile["id"],
+        "text": text,
+        "language": args.language or profile.get("language", "en"),
+    }
+    if args.seed is not None:
+        payload["seed"] = args.seed
+    if args.instruct:
+        payload["instruct"] = args.instruct
+
+    print(f"Generating with voice '{profile['name']}'...")
+    start = time.time()
+
+    # Queue the job asynchronously
+    _, start_data = api("POST", args.url, "/generate?stream=true", json_body=payload, timeout=30)
+    generation_id = start_data.get("generation_id")
+    if not generation_id:
+        print(f"Error: server did not return a generation_id", file=sys.stderr)
+        sys.exit(1)
+
+    # Stream progress via SSE
+    final_data = None
+    for event in _sse_stream(args.url, f"/generate/progress/{generation_id}", timeout=300):
+        status = event.get("status", "")
+        pct = event.get("progress", 0)
+        bar_len = 20
+        filled = int(pct / 100 * bar_len)
+        bar = "#" * filled + "-" * (bar_len - filled)
+        print(f"\r[{bar}] {pct:.0f}%  ", end="", flush=True)
+        if status in ("complete", "error"):
+            print()  # newline after progress bar
+            final_data = event
+            break
+
+    elapsed = time.time() - start
+
+    if final_data and final_data.get("status") == "error":
+        error_msg = final_data.get("error", "Unknown error")
+        print(f"Error: generation failed — {error_msg}", file=sys.stderr)
+        sys.exit(1)
+
+    # Fetch the generation record for duration/id
+    _, history = api("GET", args.url, "/history", timeout=10)
+    items = history.get("items", []) if isinstance(history, dict) else []
+    result = next((i for i in items if i.get("id") == generation_id), None)
+    if result is None and items:
+        result = items[0]  # fall back to latest
+
+    if result:
+        print(f"Done in {elapsed:.1f}s (audio duration: {result['duration']:.1f}s)")
+    else:
+        print(f"Done in {elapsed:.1f}s")
+
+    audio_id = (result or {}).get("id", generation_id)
+    tag = str(int(time.time()))[-5:]
+    wav_path = f"output_{tag}.wav"
+    _, wav_data = api("GET", args.url, f"/audio/{audio_id}", timeout=60)
+    Path(wav_path).write_bytes(wav_data)
+
+    if not shutil.which("ffmpeg"):
+        print("Warning: ffmpeg not found, keeping .wav", file=sys.stderr)
+        output = wav_path
+    else:
+        output = args.output or f"output_{tag}.m4a"
+        r = subprocess.run(
+            ["ffmpeg", "-y", "-i", wav_path, "-c:a", "aac", "-b:a", "128k", output],
+            capture_output=True,
+        )
+        if r.returncode != 0:
+            print("Warning: ffmpeg failed, keeping .wav", file=sys.stderr)
+            output = wav_path
+        else:
+            Path(wav_path).unlink()
+            print(f"Saved: {output}")
+
+    if not args.no_open:
+        subprocess.run(["open", output])
+
+
+def cmd_health(args):
+    """Check server health."""
+    h = api_get_json(args.url, "/health")
+    print(f"Status:       {h['status']}")
+    print(f"Model loaded: {h['model_loaded']}")
+    print(f"Backend:      {h.get('backend_type', '?')}")
+    print(f"GPU:          {h.get('gpu_type', 'none')}")
+    if h.get("vram_used_mb"):
+        print(f"VRAM used:    {h['vram_used_mb']:.0f} MB")
+
+
+def run_transcription(base_url, input_path, language=None, model=None, timeout=600):
+    """Transcribe media using backend Whisper endpoint."""
+    fields = {}
+    if language:
+        fields["language"] = language
+    if model:
+        # Send both names for backend compatibility while API settles.
+        fields["model"] = model
+        fields["transcribe_model"] = model
+
+    try:
+        _, result = api_multipart_or_error(
+            base_url,
+            "/transcribe",
+            fields=fields,
+            file_field_name="file",
+            file_path=input_path,
+            timeout=timeout,
+        )
+    except ApiRequestError as e:
+        print_request_error(e)
+        sys.exit(1)
+
+    return extract_transcription_payload(result)
+
+
+def cmd_config(args):
+    """Manage CLI defaults."""
+    config = load_cli_config()
+
+    if args.set_transcribe_model:
+        config["default_transcribe_model"] = args.set_transcribe_model
+        save_cli_config(config)
+        print(f"Set default transcribe model: {args.set_transcribe_model}")
+        return
+
+    if args.clear_transcribe_model:
+        if "default_transcribe_model" in config:
+            del config["default_transcribe_model"]
+            save_cli_config(config)
+            print("Cleared default transcribe model.")
+        else:
+            print("Default transcribe model was not set.")
+        return
+
+    if args.set_server_path:
+        resolved = str(Path(args.set_server_path).resolve())
+        if not Path(resolved).exists():
+            print(f"Warning: path does not exist: {resolved}", file=sys.stderr)
+        config["server_path"] = resolved
+        save_cli_config(config)
+        print(f"Set server path: {resolved}")
+        return
+
+    if args.clear_server_path:
+        if "server_path" in config:
+            del config["server_path"]
+            save_cli_config(config)
+            print("Cleared custom server path. Will use default resolution.")
+        else:
+            print("Custom server path was not set.")
+        return
+
+    # Show all config
+    model = config.get("default_transcribe_model")
+    server_path = config.get("server_path")
+    print(f"default_transcribe_model={model or '(not set)'}")
+    print(f"server_path={server_path or '(not set)'}")
+
+
+def cmd_transcribe(args):
+    """Transcribe an audio/video file with backend Whisper."""
+    input_path = require_file(args.file)
+
+    model = resolve_transcribe_model(args.model)
+    print(f"Transcribing {input_path.name}...")
+    start = time.time()
+    text, used_model = run_transcription(
+        args.url,
+        input_path,
+        language=args.language,
+        model=model,
+        timeout=args.timeout,
+    )
+    elapsed = time.time() - start
+
+    if args.output:
+        Path(args.output).write_text(text + "\n")
+        print(f"Saved transcript: {args.output}")
+    else:
+        print(text)
+
+    print_model_used("Model used", used_model)
+
+    print(f"Done in {elapsed:.1f}s")
+
+
+def cmd_create_voice(args):
+    """Create a new voice profile from an audio sample."""
+    sample_path = require_file(args.file, label="sample file")
+
+    profile_name = args.name or sample_path.stem
+    language = args.language
+    reference_text = args.reference_text
+
+    transcribe_model = resolve_transcribe_model(args.transcribe_model)
+
+    if not reference_text:
+        print(f"Transcribing sample for reference text: {sample_path.name}...")
+        tx_start = time.time()
+        reference_text, used_model = run_transcription(
+            args.url,
+            sample_path,
+            language=language,
+            model=transcribe_model,
+            timeout=args.timeout,
+        )
+        tx_elapsed = time.time() - tx_start
+        if not reference_text:
+            print("Error: transcription returned empty text. Pass --reference-text manually.", file=sys.stderr)
+            sys.exit(1)
+        if args.print_transcript:
+            print("\n--- Transcript ---")
+            print(reference_text)
+            print("--- End Transcript ---\n")
+        print_model_used("Transcription model used", used_model)
+        print(f"Transcription done in {tx_elapsed:.1f}s")
+
+    print(f"Creating profile '{profile_name}' ({language})...")
+    _, profile = api(
+        "POST",
+        args.url,
+        "/profiles",
+        json_body={
+            "name": profile_name,
+            "description": args.description or "",
+            "language": language,
+        },
+        timeout=60,
+    )
+
+    profile_id = profile["id"]
+    print(f"Profile created: {profile_name} ({profile_id})")
+    print("Uploading sample...")
+    try:
+        sample = upload_profile_sample_with_retry(
+            args.url,
+            profile_id,
+            sample_path=sample_path,
+            reference_text=reference_text,
+            timeout=args.timeout,
+        )
+    except ApiRequestError as e:
+        print_request_error(e)
+        sys.exit(1)
+
+    sample_id = sample["id"] if isinstance(sample, dict) and "id" in sample else "(unknown)"
+    print(f"Sample added: {sample_id}")
+    print(f"Done. Voice ready: {profile_name} ({profile_id})")
+
+
+# --- Profile resolution ---
+
+def resolve_profile(base_url, voice_name):
+    profiles = api_get_json(base_url, "/profiles")
+    if not profiles:
+        print("Error: no voice profiles found.", file=sys.stderr)
+        print("Import one with: voicebox import <file.zip>", file=sys.stderr)
+        sys.exit(1)
+
+    # Fall back to last-used voice when none specified
+    if not voice_name:
+        voice_name = load_cli_config().get("last_voice")
+        if voice_name:
+            print(f"Using last voice: {voice_name}")
+
+    if voice_name:
+        match = [p for p in profiles if p["name"].lower() == voice_name.lower()]
+        if not match:
+            match = [p for p in profiles if voice_name.lower() in p["name"].lower()]
+        if not match:
+            print(f"Error: no voice matching '{voice_name}'. Available:", file=sys.stderr)
+            for p in profiles:
+                print(f"  - {p['name']}", file=sys.stderr)
+            sys.exit(1)
+        if len(match) > 1:
+            print(f"Multiple voices match '{voice_name}':", file=sys.stderr)
+            for p in match:
+                print(f"  - {p['name']}", file=sys.stderr)
+            sys.exit(1)
+        profile = match[0]
+        config = load_cli_config()
+        config["last_voice"] = profile["name"]
+        save_cli_config(config)
+        return profile
+    else:
+        print("Available voices:")
+        for i, p in enumerate(profiles, 1):
+            print(f"  {i}. {p['name']} ({p['language']})")
+        print()
+        while True:
+            choice = input(f"Choose a voice [1-{len(profiles)}]: ").strip()
+            try:
+                idx = int(choice) - 1
+                if 0 <= idx < len(profiles):
+                    profile = profiles[idx]
+                    config = load_cli_config()
+                    config["last_voice"] = profile["name"]
+                    save_cli_config(config)
+                    return profile
+            except ValueError:
+                pass
+            print("Invalid choice, try again.")
+
+
+# --- Main ---
+
+def main():
+    parser = argparse.ArgumentParser(
+        prog="voicebox",
+        description="Voicebox CLI — headless TTS",
+        epilog="Environment: VOICEBOX_URL, VOICEBOX_HOST, VOICEBOX_PORT",
+    )
+    parser.add_argument("--url", default=None, help="Server URL (overrides env vars)")
+    sub = parser.add_subparsers(dest="command")
+
+    # server
+    p_server = sub.add_parser("server", help="Start the backend server")
+    p_server.add_argument("--port", type=int, default=DEFAULT_PORT, help=f"Port (default: {DEFAULT_PORT})")
+    p_server.add_argument("--data-dir", help="Data directory")
+    p_server.add_argument("-d", "--detach", action="store_true", help="Run in background")
+    p_server.add_argument("--stop", action="store_true", help="Stop a detached server")
+    p_server.add_argument("--server-path", help="Path to server binary or Python entry point (saved to config)")
+
+    # voices
+    p_voices = sub.add_parser("voices", help="List voice profiles (or delete one)")
+    p_voices.add_argument("--delete", "-d", metavar="VOICE", help="Delete voice by ID or name")
+    p_voices.add_argument("--yes", "-y", action="store_true", help="Skip delete confirmation")
+
+    # import
+    p_import = sub.add_parser("import", help="Import a voice profile from ZIP")
+    p_import.add_argument("file", help="Path to .zip file")
+
+    # generate
+    p_gen = sub.add_parser("generate", aliases=["gen", "say"], help="Generate speech")
+    p_gen.add_argument("--voice", "-v", help="Voice name (interactive picker if omitted)")
+    p_gen.add_argument("--text", "-t", help="Text to speak")
+    p_gen.add_argument("--file", "-f", help="Read text from a file")
+    p_gen.add_argument("--output", "-o", help="Output path (default: output_<epoch>.m4a)")
+    p_gen.add_argument("--language", "-l", help="Language code")
+    p_gen.add_argument("--seed", "-s", type=int, help="Random seed")
+    p_gen.add_argument("--instruct", help="Style instruction (e.g. 'speak slowly')")
+    p_gen.add_argument("--no-open", action="store_true", help="Don't open file after generating")
+
+    # health
+    sub.add_parser("health", help="Check server status")
+
+    # config
+    p_config = sub.add_parser("config", help="Show or set CLI defaults")
+    g_cfg = p_config.add_mutually_exclusive_group()
+    g_cfg.add_argument("--set-transcribe-model", metavar="MODEL", help="Set default transcribe model")
+    g_cfg.add_argument("--clear-transcribe-model", action="store_true", help="Clear default transcribe model")
+    g_cfg.add_argument("--set-server-path", metavar="PATH", help="Set custom server binary/script path")
+    g_cfg.add_argument("--clear-server-path", action="store_true", help="Clear custom server path (use default)")
+
+    # transcribe
+    p_transcribe = sub.add_parser("transcribe", aliases=["stt"], help="Transcribe an audio/video file")
+    p_transcribe.add_argument("file", help="Path to input media (mp3, wav, m4a, mp4, ...)")
+    p_transcribe.add_argument("--language", "-l", help="Language hint (e.g. en)")
+    p_transcribe.add_argument("--model", "-m", help="Transcription model (overrides CLI default)")
+    p_transcribe.add_argument("--output", "-o", help="Save transcript to file instead of stdout")
+    p_transcribe.add_argument("--timeout", type=int, default=600, help="Request timeout seconds (default: 600)")
+
+    # create voice
+    p_create_voice = sub.add_parser(
+        "create-voice",
+        aliases=["clone", "new-voice"],
+        help="Create a voice profile from a sample (auto-transcribes by default)",
+    )
+    p_create_voice.add_argument("file", help="Path to sample audio/video (mp3, wav, m4a, mp4, ...)")
+    p_create_voice.add_argument("--name", "-n", help="Profile name (default: sample filename stem)")
+    p_create_voice.add_argument("--description", "-d", help="Profile description")
+    p_create_voice.add_argument("--language", "-l", default="en", help="Language code (default: en)")
+    p_create_voice.add_argument("--reference-text", "-t", help="Exact spoken text; skips auto-transcribe when set")
+    p_create_voice.add_argument("--transcribe-model", "-m", help="Model used for auto-transcription")
+    p_create_voice.add_argument("--print-transcript", action="store_true", help="Print auto transcript before creating profile")
+    p_create_voice.add_argument("--timeout", type=int, default=600, help="Request timeout seconds (default: 600)")
+
+    args = parser.parse_args()
+
+    # Resolve URL: --url flag > VOICEBOX_URL > VOICEBOX_HOST:PORT > default
+    if args.url:
+        args.url = args.url.rstrip("/")
+    else:
+        args.url = resolve_base_url(args)
+
+    if args.command == "server":
+        cmd_server(args)
+    elif args.command == "voices":
+        cmd_voices(args)
+    elif args.command == "import":
+        cmd_import(args)
+    elif args.command in ("generate", "gen", "say"):
+        cmd_generate(args)
+    elif args.command == "health":
+        cmd_health(args)
+    elif args.command == "config":
+        cmd_config(args)
+    elif args.command in ("transcribe", "stt"):
+        cmd_transcribe(args)
+    elif args.command in ("create-voice", "clone", "new-voice"):
+        cmd_create_voice(args)
+    else:
+        parser.print_help()
+
+
+if __name__ == "__main__":  # run the CLI only when executed as a script, not on import
+    main()
diff --git a/web/src/platform/audio.ts b/web/src/platform/audio.ts
index 220a5ad..f001fac 100644
--- a/web/src/platform/audio.ts
+++ b/web/src/platform/audio.ts
@@ -17,6 +17,10 @@ export const webAudio: PlatformAudio = {
     return []; // No native device routing in web
   },
 
+  async listInputDevices(): Promise<AudioDevice[]> {
+    return []; // No native device routing in web
+  },
+
   async playToDevices(_audioData: Uint8Array, _deviceIds: string[]): Promise<void> {
     throw new Error('Native audio device routing is only available in the desktop app.');
   },
diff --git a/web/src/platform/metadata.ts b/web/src/platform/metadata.ts
index d8a73f8..99f8a45 100644
--- a/web/src/platform/metadata.ts
+++ b/web/src/platform/metadata.ts
@@ -5,5 +5,9 @@ export const webMetadata: PlatformMetadata = {
     // Return version from env var or package.json
     return import.meta.env.VITE_APP_VERSION || '0.1.0';
   },
+  getBuildInfo(): string {
+    // Web builds don't inject git hash by default
+    return '';
+  },
   isTauri: false,
 };
diff --git a/web/src/platform/updater.ts b/web/src/platform/updater.ts
index 32ed014..0ccc237 100644
--- a/web/src/platform/updater.ts
+++ b/web/src/platform/updater.ts
@@ -12,7 +12,9 @@ class WebUpdater implements PlatformUpdater {
   private subscribers: Set<(status: UpdateStatus) => void> = new Set();
 
   private notifySubscribers() {
-    this.subscribers.forEach((callback) => callback(this.status));
+    for (const callback of this.subscribers) {
+      callback(this.status);
+    }
   }
 
   subscribe(callback: (status: UpdateStatus) => void): () => void {