Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 30 additions & 12 deletions api/ace_step_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
"""

from pathlib import Path
import shutil
import subprocess
import sys
import threading
Expand All @@ -24,22 +25,22 @@ def _bundled_downloader_available() -> bool:

bp = Blueprint("api_ace_step_models", __name__)

# DiT variants from Tutorial (DiT Selection Summary)
# DiT variants from Tutorial (DiT Selection Summary). size_gb: approximate for download confirmation.
# Catalogue of DiT variants. Each entry is a dict with "id", "label",
# "description", "steps" and "cfg"; some entries also carry
# "exclusive_tasks" and "size_gb".
# NOTE(review): every id appears twice below, once without and once with
# "size_gb" — this looks like the before/after sides of a diff rendered
# together; confirm which set is intended before relying on this list.
DIT_MODELS = [
{"id": "turbo", "label": "Turbo (default)", "description": "Best balance, 8 steps", "steps": 8, "cfg": False},
{"id": "turbo-shift1", "label": "Turbo shift=1", "description": "Richer details", "steps": 8, "cfg": False},
{"id": "turbo-shift3", "label": "Turbo shift=3", "description": "Clearer timbre", "steps": 8, "cfg": False},
{"id": "turbo-continuous", "label": "Turbo continuous", "description": "Flexible shift 1–5", "steps": 8, "cfg": False},
{"id": "sft", "label": "SFT", "description": "50 steps, CFG", "steps": 50, "cfg": True},
{"id": "base", "label": "Base", "description": "50 steps, CFG; lego/extract/complete", "steps": 50, "cfg": True, "exclusive_tasks": ["lego", "extract", "complete"]},
{"id": "turbo", "label": "Turbo (default)", "description": "Best balance, 8 steps", "steps": 8, "cfg": False, "size_gb": 8},
{"id": "turbo-shift1", "label": "Turbo shift=1", "description": "Richer details", "steps": 8, "cfg": False, "size_gb": 0.5},
{"id": "turbo-shift3", "label": "Turbo shift=3", "description": "Clearer timbre", "steps": 8, "cfg": False, "size_gb": 0.5},
{"id": "turbo-continuous", "label": "Turbo continuous", "description": "Flexible shift 1–5", "steps": 8, "cfg": False, "size_gb": 0.5},
{"id": "sft", "label": "SFT", "description": "50 steps, CFG", "steps": 50, "cfg": True, "size_gb": 8},
{"id": "base", "label": "Base", "description": "50 steps, CFG; lego/extract/complete", "steps": 50, "cfg": True, "exclusive_tasks": ["lego", "extract", "complete"], "size_gb": 8},
]

# LM planner options from Tutorial
# LM planner options from Tutorial. size_gb: approximate for download confirmation.
# Catalogue of LM planner choices. Each entry is a dict with "id" and
# "label"; some entries also carry "size_gb".
# NOTE(review): every id appears twice below, once without and once with
# "size_gb" — this looks like the before/after sides of a diff rendered
# together; confirm which set is intended before relying on this list.
LM_MODELS = [
{"id": "none", "label": "No LM"},
{"id": "0.6B", "label": "0.6B"},
{"id": "1.7B", "label": "1.7B (default)"},
{"id": "4B", "label": "4B"},
{"id": "none", "label": "No LM", "size_gb": 0},
{"id": "0.6B", "label": "0.6B", "size_gb": 2},
{"id": "1.7B", "label": "1.7B (default)", "size_gb": 4},
{"id": "4B", "label": "4B", "size_gb": 10},
]

# ACE-Step 1.5 CLI model ids (for acestep-download --model)
Expand Down Expand Up @@ -333,6 +334,23 @@ def _do_download_worker(model: str, root: Path) -> None:
_download_cancel_requested = False


@bp.route("/models/disk-space", methods=["GET"])
def disk_space():
    """Report free and total disk space for the checkpoint directory.

    GET /api/ace-step/models/disk-space

    The checkpoint directory is created first if it does not exist, then its
    filesystem usage is read with ``shutil.disk_usage``.

    Returns:
        On success, a JSON object with ``free_gb`` and ``total_gb`` (GiB,
        rounded to two decimals) and ``path`` (the checkpoint directory).
        On any exception, a JSON object with ``error`` set to the exception
        text, ``free_gb``/``total_gb`` set to null and an empty ``path``,
        together with HTTP status 500.
    """
    bytes_per_gib = 1024 ** 3
    try:
        checkpoint_dir = _checkpoint_root()
        # Make sure the directory exists before asking for its usage.
        checkpoint_dir.mkdir(parents=True, exist_ok=True)
        stats = shutil.disk_usage(str(checkpoint_dir))
        payload = {
            "free_gb": round(stats.free / bytes_per_gib, 2),
            "total_gb": round(stats.total / bytes_per_gib, 2),
            "path": str(checkpoint_dir),
        }
        return jsonify(payload)
    except Exception as exc:
        failure = {"error": str(exc), "free_gb": None, "total_gb": None, "path": ""}
        return jsonify(failure), 500


@bp.route("/models/download", methods=["POST"])
def download_model():
"""
Expand Down
207 changes: 191 additions & 16 deletions api/generate.py

Large diffs are not rendered by default.

15 changes: 14 additions & 1 deletion cdmf_pipeline_ace_step.py
Original file line number Diff line number Diff line change
Expand Up @@ -1195,7 +1195,8 @@ def add_latents_noise(
sigma_max=sigma_max
)

infer_steps = int(sigma_max * infer_steps)
# Ensure enough steps for cover/audio2audio so reference is audible (INFERENCE.md: base 32-64 recommended).
infer_steps = max(16, int(sigma_max * infer_steps))
timesteps, num_inference_steps = retrieve_timesteps(
scheduler,
num_inference_steps=infer_steps,
Expand Down Expand Up @@ -1295,6 +1296,17 @@ def text2music_diffusion_process(

if ref_latents is not None:
frame_length = ref_latents.shape[-1]
# Cap ref length for cover/audio2audio so each diffusion step stays fast (avoids 80s+ per step on long refs)
max_cover_sec = float(os.environ.get("ACE_COVER_MAX_REF_SECONDS", "90"))
max_cover_frames = int(max_cover_sec * 44100 / 512 / 8)
if frame_length > max_cover_frames:
ref_latents = ref_latents[:, :, :, :max_cover_frames].contiguous()
frame_length = max_cover_frames
logger.info(
"Capped ref_latents to %d frames (~%.0fs) for faster cover/audio2audio generation (set ACE_COVER_MAX_REF_SECONDS to override).",
max_cover_frames,
max_cover_sec,
)

if len(oss_steps) > 0:
infer_steps = max(oss_steps)
Expand Down Expand Up @@ -2087,6 +2099,7 @@ def __call__(

ref_latents = None
if ref_audio_input is not None and audio2audio_enable:
# For cover mode: ref_audio_input = source song (song to cover), per docs/ACE-Step-INFERENCE.md.
assert ref_audio_input is not None, "ref_audio_input is required for audio2audio task"
assert os.path.exists(
ref_audio_input
Expand Down
29 changes: 28 additions & 1 deletion cdmf_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import threading
import time
from typing import Optional, Dict, Any
from typing import Optional, Dict, Any, Callable

from ace_model_setup import ace_models_present

Expand All @@ -26,6 +26,24 @@ def get_current_generation_job_id() -> Optional[str]:
return getattr(_current_job_id_holder, "job_id", None)


# ---------------------------------------------------------------------------
# Progress updater (called from log handler when tqdm progress is parsed)
# ---------------------------------------------------------------------------

# Currently registered progress callback; starts as None (no updater registered).
_progress_updater: Optional[Callable[[int, int, int, Optional[float]], None]] = None


def set_progress_updater(
    cb: Optional[Callable[[int, int, int, Optional[float]], None]],
) -> None:
    """Register (or clear) the hook that forwards parsed log progress to the API job.

    Args:
        cb: Callable invoked as ``cb(percent, current, total, eta_seconds)``,
            or ``None`` to remove the current hook.
    """
    global _progress_updater
    _progress_updater = cb


def get_progress_updater() -> Optional[Callable[[int, int, int, Optional[float]], None]]:
    """Fetch the progress callback held in the module-level ``_progress_updater``.

    Returns:
        The stored callback, or ``None`` when no updater is set.
    """
    registered = _progress_updater
    return registered


# ---------------------------------------------------------------------------
# Generation progress (shared with /progress endpoint and model downloads)
# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -57,6 +75,15 @@ def get_current_generation_job_id() -> Optional[str]:
"message": "",
}

# ---------------------------------------------------------------------------
# Generation pipeline loading (may trigger HuggingFace model download on first use)
# ---------------------------------------------------------------------------

# Shared status record for generation-pipeline loading:
#   in_progress - boolean flag, initially False
#   message     - status text, initialised to the default below
GENERATION_MODEL_LOADING: Dict[str, Any] = dict(
    in_progress=False,
    message="Preparing model (downloading if needed)...",
)

# ---------------------------------------------------------------------------
# MuFun-ACEStep analysis model availability
# ---------------------------------------------------------------------------
Expand Down
32 changes: 32 additions & 0 deletions docs/LEGO-MODE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Lego Mode (ACE-Step 1.5)

Lego mode adds a new instrument track on top of backing audio (e.g. add guitar to a beat). It requires the **Base** DiT model.

## Known limitations and workarounds

We align with findings from [ACE-Step-1.5 issue #117](https://github.com/ace-step/ACE-Step-1.5/issues/117) (BPM/timing drift and MPS crashes):

1. **Timing drift**
Generated tracks are not strictly BPM-locked to the source; onsets can drift (20–80 ms). Workarounds that help:
- Match **duration** to the source (e.g. 4 bars at 135 BPM → duration ≈ 7.1 s: `4 * (60/135) * 4` for 4/4).
- Use **shorter segments** (e.g. 4 bars) and duplicate them if needed; a shorter clip leaves less time for drift to accumulate.
- We use **shift=3.0** for lego (recommended in the issue for better timing vs shift=1.0).

2. **Apple Silicon (MPS)**
On Mac:
- **`ref_audio_strength` (backing influence) < 1.0** can crash with a batch dimension mismatch at the cover→text2music transition. We **default to 1.0** for lego so Apple Silicon users don’t hit this. Lower values (0.2–0.5) can improve “new instrument” feel on non-MPS.
- **Thinking (LM)** is **disabled for lego** so the backing drives context; with thinking on, LLM-generated codes can override the source and hurt timing/context.

3. **Caption and BPM**
Include style, key, and BPM in the caption (e.g. “electric guitar, C major, 135 BPM, 4 bars”) and set **BPM** in the API so metadata matches the backing.

## AceForge defaults (lego)

| Parameter | Default | Note |
|------------------------|--------|------|
| `ref_audio_strength` | 1.0 | Avoids MPS crash; UI “Backing influence” |
| `thinking` | false | Forced off for lego so src_audio drives context |
| `shift` | 3.0 | Better timing than 1.0/6.0 for lego |
| `use_cot_caption` | false | Keep instruction verbatim (“Generate the X track…”) |

Users can still lower backing influence if they want more of a “new instrument” feel, but they accept the risk of timing drift and, on Apple Silicon (MPS), of a crash.
17 changes: 17 additions & 0 deletions docs/ace-step-skills/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# ACE-Step Skills (reference knowledge)

This folder contains reference material from the official **ACE-Step Skills** repository, used as knowledge for AceForge development and for aligning with ACE-Step concepts (caption, lyrics, task types, API parameters).

**Source:** [ace-step/ace-step-skills](https://github.com/ace-step/ace-step-skills) — `skills/acestep/`
**License:** See the upstream repository.

## Contents

| File | Description |
|------|-------------|
| [SKILL.md](./SKILL.md) | ACE-Step skill definition: API usage, generation modes, parameters, config. |
| [music-creation-guide.md](./music-creation-guide.md) | Music creation guide: caption, lyrics, structure tags, metadata, duration. |

## Note for AceForge

AceForge runs its own backend and API (Flask, `api/generate.py`, etc.), not the standalone ACE-Step API server on port 8001. The *concepts* (caption vs lyrics, task types, parameters, music-creation practices) still apply and are referenced when implementing or documenting AceForge features.
Loading