audiohacking · lmangani · Feb 8, 2026 · Feb 8, 2026 · Feb 8, 2026 · Feb 9, 2026
diff --git a/api/ace_step_models.py b/api/ace_step_models.py
@@ -5,6 +5,7 @@
 """
 
 from pathlib import Path
+import shutil
 import subprocess
 import sys
 import threading
@@ -24,22 +25,22 @@ def _bundled_downloader_available() -> bool:
 
 bp = Blueprint("api_ace_step_models", __name__)
 
-# DiT variants from Tutorial (DiT Selection Summary)
+# DiT variants from Tutorial (DiT Selection Summary). size_gb: approximate for download confirmation.
 DIT_MODELS = [
-    {"id": "turbo", "label": "Turbo (default)", "description": "Best balance, 8 steps", "steps": 8, "cfg": False},
-    {"id": "turbo-shift1", "label": "Turbo shift=1", "description": "Richer details", "steps": 8, "cfg": False},
-    {"id": "turbo-shift3", "label": "Turbo shift=3", "description": "Clearer timbre", "steps": 8, "cfg": False},
-    {"id": "turbo-continuous", "label": "Turbo continuous", "description": "Flexible shift 1–5", "steps": 8, "cfg": False},
-    {"id": "sft", "label": "SFT", "description": "50 steps, CFG", "steps": 50, "cfg": True},
-    {"id": "base", "label": "Base", "description": "50 steps, CFG; lego/extract/complete", "steps": 50, "cfg": True, "exclusive_tasks": ["lego", "extract", "complete"]},
+    {"id": "turbo", "label": "Turbo (default)", "description": "Best balance, 8 steps", "steps": 8, "cfg": False, "size_gb": 8},
+    {"id": "turbo-shift1", "label": "Turbo shift=1", "description": "Richer details", "steps": 8, "cfg": False, "size_gb": 0.5},
+    {"id": "turbo-shift3", "label": "Turbo shift=3", "description": "Clearer timbre", "steps": 8, "cfg": False, "size_gb": 0.5},
+    {"id": "turbo-continuous", "label": "Turbo continuous", "description": "Flexible shift 1–5", "steps": 8, "cfg": False, "size_gb": 0.5},
+    {"id": "sft", "label": "SFT", "description": "50 steps, CFG", "steps": 50, "cfg": True, "size_gb": 8},
+    {"id": "base", "label": "Base", "description": "50 steps, CFG; lego/extract/complete", "steps": 50, "cfg": True, "exclusive_tasks": ["lego", "extract", "complete"], "size_gb": 8},
 ]
 
-# LM planner options from Tutorial
+# LM planner options from Tutorial. size_gb: approximate for download confirmation.
 LM_MODELS = [
-    {"id": "none", "label": "No LM"},
-    {"id": "0.6B", "label": "0.6B"},
-    {"id": "1.7B", "label": "1.7B (default)"},
-    {"id": "4B", "label": "4B"},
+    {"id": "none", "label": "No LM", "size_gb": 0},
+    {"id": "0.6B", "label": "0.6B", "size_gb": 2},
+    {"id": "1.7B", "label": "1.7B (default)", "size_gb": 4},
+    {"id": "4B", "label": "4B", "size_gb": 10},
 ]
 
 # ACE-Step 1.5 CLI model ids (for acestep-download --model)
@@ -333,6 +334,23 @@ def _do_download_worker(model: str, root: Path) -> None:
         _download_cancel_requested = False
 
 
+@bp.route("/models/disk-space", methods=["GET"])
+def disk_space():
+    """
+    GET /api/ace-step/models/disk-space
+    Returns JSON {free_gb, total_gb, path} for the models/checkpoints path (for download confirmation); on any error, HTTP 500 with "error" set and null sizes.
+    """
+    try:
+        root = _checkpoint_root()
+        root.mkdir(parents=True, exist_ok=True)  # NOTE(review): presumably so disk_usage() below is given an existing path; this creates the directory on a GET
+        usage = shutil.disk_usage(str(root))
+        free_gb = round(usage.free / (1024 ** 3), 2)  # bytes -> GiB (divides by 1024**3), rounded to 2 decimals
+        total_gb = round(usage.total / (1024 ** 3), 2)  # same unit and rounding as free_gb
+        return jsonify({"free_gb": free_gb, "total_gb": total_gb, "path": str(root)})
+    except Exception as e:  # keep the same response keys on failure, with the exception text under "error"
+        return jsonify({"error": str(e), "free_gb": None, "total_gb": None, "path": ""}), 500
+
+
 @bp.route("/models/download", methods=["POST"])
 def download_model():
     """

diff --git a/api/generate.py b/api/generate.py
diff --git a/cdmf_pipeline_ace_step.py b/cdmf_pipeline_ace_step.py
@@ -1195,7 +1195,8 @@ def add_latents_noise(
                 sigma_max=sigma_max
             )
 
-        infer_steps = int(sigma_max * infer_steps)
+        # Ensure enough steps for cover/audio2audio so reference is audible (INFERENCE.md: base 32-64 recommended).
+        infer_steps = max(16, int(sigma_max * infer_steps))
         timesteps, num_inference_steps = retrieve_timesteps(
             scheduler,
             num_inference_steps=infer_steps,
@@ -1295,6 +1296,17 @@ def text2music_diffusion_process(
 
         if ref_latents is not None:
             frame_length = ref_latents.shape[-1]
+            # Cap ref length for cover/audio2audio so each diffusion step stays fast (avoids 80s+ per step on long refs)
+            max_cover_sec = float(os.environ.get("ACE_COVER_MAX_REF_SECONDS", "90"))
+            max_cover_frames = int(max_cover_sec * 44100 / 512 / 8)
+            if frame_length > max_cover_frames:
+                ref_latents = ref_latents[:, :, :, :max_cover_frames].contiguous()
+                frame_length = max_cover_frames
+                logger.info(
+                    "Capped ref_latents to %d frames (~%.0fs) for faster cover/audio2audio generation (set ACE_COVER_MAX_REF_SECONDS to override).",
+                    max_cover_frames,
+                    max_cover_sec,
+                )
 
         if len(oss_steps) > 0:
             infer_steps = max(oss_steps)
@@ -2087,6 +2099,7 @@ def __call__(
 
         ref_latents = None
         if ref_audio_input is not None and audio2audio_enable:
+            # For cover mode: ref_audio_input = source song (song to cover), per docs/ACE-Step-INFERENCE.md.
             assert ref_audio_input is not None, "ref_audio_input is required for audio2audio task"
             assert os.path.exists(
                 ref_audio_input

diff --git a/cdmf_state.py b/cdmf_state.py
@@ -4,7 +4,7 @@
 
 import threading
 import time
-from typing import Optional, Dict, Any
+from typing import Optional, Dict, Any, Callable
 
 from ace_model_setup import ace_models_present
 
@@ -26,6 +26,24 @@ def get_current_generation_job_id() -> Optional[str]:
     return getattr(_current_job_id_holder, "job_id", None)
 
 
+# ---------------------------------------------------------------------------
+# Progress updater (called from log handler when tqdm progress is parsed)
+# ---------------------------------------------------------------------------
+
+_progress_updater: Optional[Callable[[int, int, int, Optional[float]], None]] = None
+
+
+def set_progress_updater(cb: Optional[Callable[[int, int, int, Optional[float]], None]]) -> None:
+    """Set the callback(percent, current, total, eta_seconds) used to update the API job from parsed log progress; pass None to clear it."""
+    global _progress_updater
+    _progress_updater = cb  # replaces whatever callback (or None) was stored before
+
+
+def get_progress_updater() -> Optional[Callable[[int, int, int, Optional[float]], None]]:
+    """Return the callback last stored by set_progress_updater(), or None if none is set."""
+    return _progress_updater
+
+
 # ---------------------------------------------------------------------------
 # Generation progress (shared with /progress endpoint and model downloads)
 # ---------------------------------------------------------------------------
@@ -57,6 +75,15 @@ def get_current_generation_job_id() -> Optional[str]:
     "message": "",
 }
 
+# ---------------------------------------------------------------------------
+# Generation pipeline loading (may trigger HuggingFace model download on first use)
+# ---------------------------------------------------------------------------
+
+GENERATION_MODEL_LOADING: Dict[str, Any] = {
+    "in_progress": False,
+    "message": "Preparing model (downloading if needed)...",
+}
+
 # ---------------------------------------------------------------------------
 # MuFun-ACEStep analysis model availability
 # ---------------------------------------------------------------------------

diff --git a/docs/LEGO-MODE.md b/docs/LEGO-MODE.md
@@ -0,0 +1,32 @@
+# Lego Mode (ACE-Step 1.5)
+
+Lego mode adds a new instrument track on top of backing audio (e.g. add guitar to a beat). It requires the **Base** DiT model.
+
+## Known limitations and workarounds
+
+We align with findings from [ACE-Step-1.5 issue #117](https://github.com/ace-step/ACE-Step-1.5/issues/117) (BPM/timing drift and MPS crashes):
+
+1. **Timing drift**  
+   Generated tracks are not strictly BPM-locked to the source; onsets can drift (20–80 ms). Workarounds that help:
+   - Match **duration** to the source (e.g. 4 bars at 135 BPM → duration ≈ 7.1 s: `4 * (60/135) * 4` for 4/4).
+   - Use **shorter segments** (e.g. 4 bars) and duplicate them if needed; a shorter clip leaves less time for drift to accumulate.
+   - We use **shift=3.0** for lego (recommended in the issue for better timing vs shift=1.0).
+
+2. **Apple Silicon (MPS)**  
+   On Mac:
+   - **`ref_audio_strength` (backing influence) < 1.0** can crash with a batch dimension mismatch at the cover→text2music transition. We **default to 1.0** for lego so Apple Silicon users don’t hit this. Lower values (0.2–0.5) can improve “new instrument” feel on non-MPS.
+   - **Thinking (LM)** is **disabled for lego** so the backing drives context; with thinking on, LLM-generated codes can override the source and hurt timing/context.
+
+3. **Caption and BPM**  
+   Include style, key, and BPM in the caption (e.g. “electric guitar, C major, 135 BPM, 4 bars”) and set **BPM** in the API so metadata matches the backing.
+
+## AceForge defaults (lego)
+
+| Parameter              | Default | Note |
+|------------------------|--------|------|
+| `ref_audio_strength`   | 1.0    | Avoids MPS crash; UI “Backing influence” |
+| `thinking`             | false  | Forced off for lego so src_audio drives context |
+| `shift`                | 3.0    | Better timing than 1.0/6.0 for lego |
+| `use_cot_caption`      | false  | Keep instruction verbatim (“Generate the X track…”) |
+
+Users can still lower backing influence if they want more of a “new instrument” feel: on non-Apple Silicon hardware this means accepting the risk of timing drift, and on Apple Silicon (MPS) it additionally risks a crash.
diff --git a/docs/ace-step-skills/README.md b/docs/ace-step-skills/README.md
@@ -0,0 +1,17 @@
+# ACE-Step Skills (reference knowledge)
+
+This folder contains reference material from the official **ACE-Step Skills** repository, used as knowledge for AceForge development and for aligning with ACE-Step concepts (caption, lyrics, task types, API parameters).
+
+**Source:** [ace-step/ace-step-skills](https://github.com/ace-step/ace-step-skills) — `skills/acestep/`  
+**License:** See the upstream repository.
+
+## Contents
+
+| File | Description |
+|------|-------------|
+| [SKILL.md](./SKILL.md) | ACE-Step skill definition: API usage, generation modes, parameters, config. |
+| [music-creation-guide.md](./music-creation-guide.md) | Music creation guide: caption, lyrics, structure tags, metadata, duration. |
+
+## Note for AceForge
+
+AceForge runs its own backend and API (Flask, `api/generate.py`, etc.), not the standalone ACE-Step API server on port 8001. The *concepts* (caption vs lyrics, task types, parameters, music-creation practices) still apply and are referenced when implementing or documenting AceForge features.