From 57ec4e32c7ba854546b5c2b53c00d44b58788f8a Mon Sep 17 00:00:00 2001 From: E Date: Sun, 8 Feb 2026 15:45:13 +0100 Subject: [PATCH 1/3] Queue jobs until models available; fix progress tracking for UI - Backend: jobs wait in pending_model until required DiT is installed; promote to queued when model available - API: get_status returns pendingReason; cancel supports pending_model; POST /api/generate/retry-pending to promote and start after model download - Frontend: SongList shows 'Waiting for model' + reason and Open Settings; poll sets generationStatus/generationPendingReason; Settings onDownloadComplete calls retryPending; cancel and pending_model handling in poll - Progress: broaden tqdm regex so all progress lines match (INFO); add progress updater from log parser so job progress/ETA updates for UI even when callback path fails Co-authored-by: Cursor --- api/ace_step_models.py | 42 +++++--- api/generate.py | 179 ++++++++++++++++++++++++++++++-- cdmf_pipeline_ace_step.py | 15 ++- cdmf_state.py | 29 +++++- generate_ace.py | 123 ++++++++++++++++------ music_forge_ui.py | 73 ++++++++----- ui/App.tsx | 18 +++- ui/components/CreatePanel.tsx | 140 +++++++++++++++++++++++-- ui/components/SettingsModal.tsx | 101 ++++++++++++------ ui/components/SongList.tsx | 35 +++++-- ui/services/api.ts | 16 ++- ui/types.ts | 5 + 12 files changed, 643 insertions(+), 133 deletions(-) diff --git a/api/ace_step_models.py b/api/ace_step_models.py index 843a5b2..df4e045 100644 --- a/api/ace_step_models.py +++ b/api/ace_step_models.py @@ -5,6 +5,7 @@ """ from pathlib import Path +import shutil import subprocess import sys import threading @@ -24,22 +25,22 @@ def _bundled_downloader_available() -> bool: bp = Blueprint("api_ace_step_models", __name__) -# DiT variants from Tutorial (DiT Selection Summary) +# DiT variants from Tutorial (DiT Selection Summary). size_gb: approximate for download confirmation. 
DIT_MODELS = [ - {"id": "turbo", "label": "Turbo (default)", "description": "Best balance, 8 steps", "steps": 8, "cfg": False}, - {"id": "turbo-shift1", "label": "Turbo shift=1", "description": "Richer details", "steps": 8, "cfg": False}, - {"id": "turbo-shift3", "label": "Turbo shift=3", "description": "Clearer timbre", "steps": 8, "cfg": False}, - {"id": "turbo-continuous", "label": "Turbo continuous", "description": "Flexible shift 1–5", "steps": 8, "cfg": False}, - {"id": "sft", "label": "SFT", "description": "50 steps, CFG", "steps": 50, "cfg": True}, - {"id": "base", "label": "Base", "description": "50 steps, CFG; lego/extract/complete", "steps": 50, "cfg": True, "exclusive_tasks": ["lego", "extract", "complete"]}, + {"id": "turbo", "label": "Turbo (default)", "description": "Best balance, 8 steps", "steps": 8, "cfg": False, "size_gb": 8}, + {"id": "turbo-shift1", "label": "Turbo shift=1", "description": "Richer details", "steps": 8, "cfg": False, "size_gb": 0.5}, + {"id": "turbo-shift3", "label": "Turbo shift=3", "description": "Clearer timbre", "steps": 8, "cfg": False, "size_gb": 0.5}, + {"id": "turbo-continuous", "label": "Turbo continuous", "description": "Flexible shift 1–5", "steps": 8, "cfg": False, "size_gb": 0.5}, + {"id": "sft", "label": "SFT", "description": "50 steps, CFG", "steps": 50, "cfg": True, "size_gb": 8}, + {"id": "base", "label": "Base", "description": "50 steps, CFG; lego/extract/complete", "steps": 50, "cfg": True, "exclusive_tasks": ["lego", "extract", "complete"], "size_gb": 8}, ] -# LM planner options from Tutorial +# LM planner options from Tutorial. size_gb: approximate for download confirmation. 
LM_MODELS = [ - {"id": "none", "label": "No LM"}, - {"id": "0.6B", "label": "0.6B"}, - {"id": "1.7B", "label": "1.7B (default)"}, - {"id": "4B", "label": "4B"}, + {"id": "none", "label": "No LM", "size_gb": 0}, + {"id": "0.6B", "label": "0.6B", "size_gb": 2}, + {"id": "1.7B", "label": "1.7B (default)", "size_gb": 4}, + {"id": "4B", "label": "4B", "size_gb": 10}, ] # ACE-Step 1.5 CLI model ids (for acestep-download --model) @@ -333,6 +334,23 @@ def _do_download_worker(model: str, root: Path) -> None: _download_cancel_requested = False +@bp.route("/models/disk-space", methods=["GET"]) +def disk_space(): + """ + GET /api/ace-step/models/disk-space + Returns free and total disk space for the models/checkpoints path (for download confirmation). + """ + try: + root = _checkpoint_root() + root.mkdir(parents=True, exist_ok=True) + usage = shutil.disk_usage(str(root)) + free_gb = round(usage.free / (1024 ** 3), 2) + total_gb = round(usage.total / (1024 ** 3), 2) + return jsonify({"free_gb": free_gb, "total_gb": total_gb, "path": str(root)}) + except Exception as e: + return jsonify({"error": str(e), "free_gb": None, "total_gb": None, "path": ""}), 500 + + @bp.route("/models/download", methods=["POST"]) def download_model(): """ diff --git a/api/generate.py b/api/generate.py index 814da55..3f8d7cc 100644 --- a/api/generate.py +++ b/api/generate.py @@ -21,7 +21,7 @@ def _uppercase_track_in_instruction(instruction): return instruction[: m.start(2)] + m.group(2).upper() + instruction[m.end(2) :] return instruction -from cdmf_paths import get_output_dir, get_user_data_dir, load_config +from cdmf_paths import get_output_dir, get_user_data_dir, get_models_folder, load_config, save_config from cdmf_tracks import get_audio_duration, list_lora_adapters, load_track_meta, save_track_meta from cdmf_generation_job import GenerationCancelled import cdmf_state @@ -59,6 +59,38 @@ def _is_cancel_requested(job_id: str) -> bool: return job_id in _cancel_requested +def 
_is_model_available(dit_tag: str) -> bool: + """Return True if the given DiT model is installed and ready (no download needed). Used to promote pending_model jobs.""" + if not dit_tag or not isinstance(dit_tag, str): + return False + dit = dit_tag.strip().lower() + DIT_15_FOLDERS = { + "turbo": "acestep-v15-turbo", + "base": "acestep-v15-base", + "sft": "acestep-v15-sft", + "turbo-shift1": "acestep-v15-turbo-shift1", + "turbo-shift3": "acestep-v15-turbo-shift3", + "turbo-continuous": "acestep-v15-turbo-continuous", + } + REQUIRED_SUBDIRS = ("music_dcae_f8c8", "music_vocoder", "ace_step_transformer", "umt5-base") + folder = DIT_15_FOLDERS.get(dit) + models_root = Path(get_models_folder()) / "checkpoints" + if folder: + candidate = models_root / folder + if not candidate.exists(): + return False + for sub in REQUIRED_SUBDIRS: + if not (candidate / sub).exists(): + return False + return True + # Legacy v1 + try: + from ace_model_setup import ace_models_present + return ace_models_present() + except Exception: + return False + + def _refs_dir() -> Path: d = get_user_data_dir() / "references" d.mkdir(parents=True, exist_ok=True) @@ -115,10 +147,45 @@ def _on_job_progress( register_job_progress_callback(_on_job_progress) +def _update_job_progress_from_log( + percent: int, current: int, total: int, eta_seconds: float | None +) -> None: + """Update current job progress from parsed tqdm log line (log handler runs in same thread as worker).""" + with _jobs_lock: + jid = cdmf_state.get_current_generation_job_id() + if not jid: + return + job = _jobs.get(jid) + if not job: + return + job["progressPercent"] = round(float(percent), 1) + job["progressSteps"] = f"{current}/{total}" + if eta_seconds is not None: + job["progressEta"] = round(float(eta_seconds), 1) + + def _run_generation(job_id: str) -> None: """Background: run generate_track_ace and update job.""" global _generation_busy, _current_job_id + prev_config = None + config_switched = False try: + with _jobs_lock: + job = 
_jobs.get(job_id) + if not job or job.get("status") != "queued": + return + job_dit = job.get("dit_model") or "turbo" + # If required model is not installed, leave job as pending_model so it runs after user installs it. + if not _is_model_available(job_dit): + logging.info("[API generate] Job %s waiting for model %s (not installed)", job_id, job_dit) + with _jobs_lock: + job = _jobs.get(job_id) + if job and job.get("status") == "queued": + job["status"] = "pending_model" + job["pendingReason"] = ( + f"Model '{job_dit}' is not installed. Install it from Settings → Models to run this job." + ) + return with _jobs_lock: job = _jobs.get(job_id) if not job or job.get("status") != "queued": @@ -131,8 +198,20 @@ def _run_generation(job_id: str) -> None: _current_job_id = job_id cdmf_state.set_current_generation_job_id(job_id) + cdmf_state.set_progress_updater(_update_job_progress_from_log) cancel_check = lambda: _is_cancel_requested(job_id) - from generate_ace import generate_track_ace + from generate_ace import generate_track_ace, clear_ace_pipeline + + # Use job's dit_model (e.g. base for cover). Temporarily switch config so pipeline loads the right model. 
+ with _jobs_lock: + j = _jobs.get(job_id) + job_dit = (j.get("dit_model") or "turbo") if j else "turbo" + prev_config = load_config() or {} + prev_config = dict(prev_config) + config_switched = job_dit != (prev_config.get("ace_step_dit_model") or "turbo") + if config_switched: + save_config({**prev_config, "ace_step_dit_model": job_dit}) + clear_ace_pipeline() params = job.get("params") or {} if not isinstance(params, dict): @@ -451,20 +530,43 @@ def _run_generation(job_id: str) -> None: job["status"] = "cancelled" job["error"] = "Cancelled by user" except Exception as e: - logging.exception("Generation job %s failed", job_id) - with _jobs_lock: - job = _jobs.get(job_id) - if job: - job["status"] = "failed" - job["error"] = str(e) + err_msg = str(e) + # Keep job queued as pending_model when model is missing so it can run after user installs it. + if "not installed" in err_msg.lower() or "Settings → Models" in err_msg: + logging.info("[API generate] Job %s waiting for model: %s", job_id, err_msg[:120]) + with _jobs_lock: + job = _jobs.get(job_id) + if job: + job["status"] = "pending_model" + job["pendingReason"] = err_msg + job["error"] = None + else: + logging.exception("Generation job %s failed", job_id) + with _jobs_lock: + job = _jobs.get(job_id) + if job: + job["status"] = "failed" + job["error"] = err_msg finally: + cdmf_state.set_progress_updater(None) + if config_switched and prev_config: + save_config(prev_config) + clear_ace_pipeline() cdmf_state.set_current_generation_job_id(None) _generation_busy = False with _jobs_lock: _current_job_id = None _cancel_requested.discard(job_id) - # Start next queued job (skips cancelled: they are no longer "queued") + # Promote pending_model jobs to queued when their model is now available; then start first queued job. 
with _jobs_lock: + for jid in _job_order: + j = _jobs.get(jid) + if j and j.get("status") == "pending_model": + dit = j.get("dit_model") or "turbo" + if _is_model_available(dit): + j["status"] = "queued" + j["pendingReason"] = None + logging.info("[API generate] Job %s promoted to queued (model %s now available)", jid, dit) for jid in _job_order: j = _jobs.get(jid) if j and j.get("status") == "queued": @@ -472,6 +574,20 @@ def _run_generation(job_id: str) -> None: break +@bp.route("/model-download-status", methods=["GET"]) +def get_model_download_status(): + """GET /api/generate/model-download-status — whether pipeline is loading (may be downloading model files).""" + try: + st = getattr(cdmf_state, "GENERATION_MODEL_LOADING", {}) + return jsonify({ + "in_progress": bool(st.get("in_progress")), + "message": st.get("message") or "Preparing model (downloading if needed)...", + }) + except Exception as e: + logging.warning("[API generate] model-download-status failed: %s", e) + return jsonify({"in_progress": False, "message": ""}) + + @bp.route("/lora_adapters", methods=["GET"]) def get_lora_adapters(): """GET /api/generate/lora_adapters — list LoRA adapters (e.g. from Training or custom_lora).""" @@ -554,7 +670,15 @@ def _str(v): except (TypeError, ValueError): params_copy = {} config = load_config() - dit_tag = config.get("ace_step_dit_model") or params_copy.get("aceStepDitModel") or "turbo" + # User override from Generation tab model selector takes precedence; else auto base for cover (per docs). 
+ task_for_dit = (params_copy.get("task_type") or params_copy.get("taskType") or "text2music").strip().lower() + explicit_dit = (params_copy.get("aceStepDitModel") or params_copy.get("ace_step_dit_model") or "").strip() + if explicit_dit: + dit_tag = explicit_dit + elif task_for_dit == "cover": + dit_tag = "base" + else: + dit_tag = config.get("ace_step_dit_model") or "turbo" lm_tag = config.get("ace_step_lm") or params_copy.get("aceStepLm") or "1.7B" with _jobs_lock: _jobs[job_id] = { @@ -608,10 +732,42 @@ def get_status(job_id: str): "progressStage": job.get("progressStage"), "result": job.get("result"), "error": job.get("error"), + "pendingReason": job.get("pendingReason") if status == "pending_model" else None, } return jsonify(out) +@bp.route("/retry-pending", methods=["POST"]) +def retry_pending(): + """POST /api/generate/retry-pending — promote pending_model jobs to queued if model is now available, start first queued job. Call after model download completes.""" + global _generation_busy + promoted = 0 + started = None + with _jobs_lock: + for jid in _job_order: + j = _jobs.get(jid) + if j and j.get("status") == "pending_model": + dit = j.get("dit_model") or "turbo" + if _is_model_available(dit): + j["status"] = "queued" + j["pendingReason"] = None + promoted += 1 + logging.info("[API generate] Job %s promoted to queued (model %s now available)", jid, dit) + if not _generation_busy: + for jid in _job_order: + j = _jobs.get(jid) + if j and j.get("status") == "queued": + _generation_busy = True + threading.Thread(target=_run_generation, args=(jid,), daemon=True).start() + started = jid + break + return jsonify({ + "ok": True, + "promoted": promoted, + "startedJobId": started, + }) + + @bp.route("/unstick", methods=["POST"]) def unstick_queue(): """POST /api/generate/unstick — clear stuck worker state and start the next queued job (if any).""" @@ -641,9 +797,10 @@ def cancel_job(job_id: str): if not job: return jsonify({"error": "Job not found"}), 404 status = 
job.get("status", "unknown") - if status == "queued": + if status in ("queued", "pending_model"): job["status"] = "cancelled" job["error"] = "Cancelled by user" + job["pendingReason"] = None return jsonify({"cancelled": True, "jobId": job_id, "message": "Job removed from queue."}) if status == "running": _cancel_requested.add(job_id) diff --git a/cdmf_pipeline_ace_step.py b/cdmf_pipeline_ace_step.py index 5780a74..0e3aa57 100644 --- a/cdmf_pipeline_ace_step.py +++ b/cdmf_pipeline_ace_step.py @@ -1195,7 +1195,8 @@ def add_latents_noise( sigma_max=sigma_max ) - infer_steps = int(sigma_max * infer_steps) + # Ensure enough steps for cover/audio2audio so reference is audible (INFERENCE.md: base 32-64 recommended). + infer_steps = max(16, int(sigma_max * infer_steps)) timesteps, num_inference_steps = retrieve_timesteps( scheduler, num_inference_steps=infer_steps, @@ -1295,6 +1296,17 @@ def text2music_diffusion_process( if ref_latents is not None: frame_length = ref_latents.shape[-1] + # Cap ref length for cover/audio2audio so each diffusion step stays fast (avoids 80s+ per step on long refs) + max_cover_sec = float(os.environ.get("ACE_COVER_MAX_REF_SECONDS", "90")) + max_cover_frames = int(max_cover_sec * 44100 / 512 / 8) + if frame_length > max_cover_frames: + ref_latents = ref_latents[:, :, :, :max_cover_frames].contiguous() + frame_length = max_cover_frames + logger.info( + "Capped ref_latents to %d frames (~%.0fs) for faster cover/audio2audio generation (set ACE_COVER_MAX_REF_SECONDS to override).", + max_cover_frames, + max_cover_sec, + ) if len(oss_steps) > 0: infer_steps = max(oss_steps) @@ -2087,6 +2099,7 @@ def __call__( ref_latents = None if ref_audio_input is not None and audio2audio_enable: + # For cover mode: ref_audio_input = source song (song to cover), per docs/ACE-Step-INFERENCE.md. 
assert ref_audio_input is not None, "ref_audio_input is required for audio2audio task" assert os.path.exists( ref_audio_input diff --git a/cdmf_state.py b/cdmf_state.py index 4517d3d..3776cc3 100644 --- a/cdmf_state.py +++ b/cdmf_state.py @@ -4,7 +4,7 @@ import threading import time -from typing import Optional, Dict, Any +from typing import Optional, Dict, Any, Callable from ace_model_setup import ace_models_present @@ -26,6 +26,24 @@ def get_current_generation_job_id() -> Optional[str]: return getattr(_current_job_id_holder, "job_id", None) +# --------------------------------------------------------------------------- +# Progress updater (called from log handler when tqdm progress is parsed) +# --------------------------------------------------------------------------- + +_progress_updater: Optional[Callable[[int, int, int, Optional[float]], None]] = None + + +def set_progress_updater(cb: Optional[Callable[[int, int, int, Optional[float]], None]]) -> None: + """Set a callback(percent, current, total, eta_seconds) used to update API job from parsed log progress.""" + global _progress_updater + _progress_updater = cb + + +def get_progress_updater() -> Optional[Callable[[int, int, int, Optional[float]], None]]: + """Return the current progress updater, or None.""" + return _progress_updater + + # --------------------------------------------------------------------------- # Generation progress (shared with /progress endpoint and model downloads) # --------------------------------------------------------------------------- @@ -57,6 +75,15 @@ def get_current_generation_job_id() -> Optional[str]: "message": "", } +# --------------------------------------------------------------------------- +# Generation pipeline loading (may trigger HuggingFace model download on first use) +# --------------------------------------------------------------------------- + +GENERATION_MODEL_LOADING: Dict[str, Any] = { + "in_progress": False, + "message": "Preparing model (downloading if 
needed)...", +} + # --------------------------------------------------------------------------- # MuFun-ACEStep analysis model availability # --------------------------------------------------------------------------- diff --git a/generate_ace.py b/generate_ace.py index 73f0c24..3420ab3 100644 --- a/generate_ace.py +++ b/generate_ace.py @@ -180,6 +180,7 @@ def _candy_torchaudio_load( # ----------------------------------------------------------------------------- import cdmf_paths +import cdmf_state # Default target length + fades (UI can override) DEFAULT_TARGET_SECONDS = 150.0 @@ -437,32 +438,91 @@ def _get_ace_pipeline() -> "ACEStepPipeline": if _ACE_PIPELINE is not None: return _ACE_PIPELINE - print( - "[ACE] Initializing ACEStepPipeline (first time will download/load checkpoints)...", - flush=True, - ) - _report_progress(0.05, "ace_load") - - # Make sure our dedicated ACE cache under ace_models/checkpoints is ready. + # Notify UI that model may be downloading (pipeline load can trigger HuggingFace fetch). try: - checkpoint_root = ensure_ace_models() - except Exception as exc: - raise RuntimeError( - "Failed to prepare ACE-Step checkpoints. " - "See the console logs above for details." - ) from exc + cdmf_state.GENERATION_MODEL_LOADING["in_progress"] = True + cdmf_state.GENERATION_MODEL_LOADING["message"] = "Preparing model (downloading if needed)..." + except Exception: + pass - # Wire ACE's internal progress bars into our callback before heavy work starts. - _monkeypatch_ace_tqdm() + try: + print( + "[ACE] Initializing ACEStepPipeline (first time will download/load checkpoints)...", + flush=True, + ) + _report_progress(0.05, "ace_load") + + # Resolve checkpoint path: use ACE-Step 1.5 model folder from config when present (e.g. base for cover). + # Never trigger downloads from generation; require model to be installed via Settings → Models. 
+ DIT_15_FOLDERS = { + "turbo": "acestep-v15-turbo", + "base": "acestep-v15-base", + "sft": "acestep-v15-sft", + "turbo-shift1": "acestep-v15-turbo-shift1", + "turbo-shift3": "acestep-v15-turbo-shift3", + "turbo-continuous": "acestep-v15-turbo-continuous", + } + REQUIRED_SUBDIRS = ("music_dcae_f8c8", "music_vocoder", "ace_step_transformer", "umt5-base") + config = cdmf_paths.load_config() + dit = (config.get("ace_step_dit_model") or "turbo").strip().lower() + folder = DIT_15_FOLDERS.get(dit) + models_root = Path(cdmf_paths.get_models_folder()) / "checkpoints" + checkpoint_root = None + if folder: + candidate = models_root / folder + if not candidate.exists(): + raise RuntimeError( + f"Model '{dit}' is not installed. Please install it from Settings → Models " + "(do not start generation to trigger downloads)." + ) + for sub in REQUIRED_SUBDIRS: + if not (candidate / sub).exists(): + raise RuntimeError( + f"Model '{dit}' is not fully installed (missing {sub}). " + "Please install or re-download it from Settings → Models." + ) + checkpoint_root = candidate + print(f"[ACE] Using DiT model '{dit}' at {checkpoint_root}", flush=True) + if checkpoint_root is None: + # Legacy v1 path: only use if already present; never download from here. + from ace_model_setup import ace_models_present + if not ace_models_present(): + raise RuntimeError( + "ACE-Step model is not installed. Please install it from Settings → Models " + "(do not start generation to trigger downloads)." + ) + try: + checkpoint_root = ensure_ace_models() + except Exception as exc: + raise RuntimeError( + "Failed to prepare ACE-Step checkpoints. " + "See the console logs above for details." + ) from exc + + # Wire ACE's internal progress bars into our callback before heavy work starts. + _monkeypatch_ace_tqdm() + + # Tell ACE-Step to use our cache root as its checkpoint_dir so it + # doesn't try to re-download into ~/.cache/ace-step/checkpoints. 
+ pipeline = ACEStepPipeline(checkpoint_dir=str(checkpoint_root)) + _ACE_PIPELINE = pipeline + + print("[ACE] ACEStepPipeline ready.", flush=True) + return _ACE_PIPELINE + finally: + try: + cdmf_state.GENERATION_MODEL_LOADING["in_progress"] = False + except Exception: + pass - # Tell ACE-Step to use our cache root as its checkpoint_dir so it - # doesn't try to re-download into ~/.cache/ace-step/checkpoints. - pipeline = ACEStepPipeline(checkpoint_dir=str(checkpoint_root)) - _ACE_PIPELINE = pipeline + return _ACE_PIPELINE - print("[ACE] ACEStepPipeline ready.", flush=True) - return _ACE_PIPELINE +def clear_ace_pipeline() -> None: + """Clear the cached pipeline so the next call to _get_ace_pipeline() loads fresh (e.g. after switching DiT model).""" + global _ACE_PIPELINE + with _ACE_PIPELINE_LOCK: + _ACE_PIPELINE = None # ----------------------------------------------------------------------------- @@ -569,11 +629,10 @@ def _prepare_reference_audio( """ Normalise the ACE-Step edit / audio2audio mode (task_type, reference_audio, src_audio per Tutorial/INFERENCE): - - Task (task_type) is clamped to one of: text2music / retake / repaint / extend. - - UI tasks "cover" and "audio2audio" are mapped to "retake" (ACE-Step - then uses ref_audio_input and sets task to "audio2audio" internally). - - If Audio2Audio is enabled while task is still 'text2music', we - internally flip it to 'retake' (this is how ACE-Step expects edits). + - Task (task_type) is clamped to one of: text2music / audio2audio / retake / repaint / extend. + - UI tasks "cover" and "audio2audio" are passed as task="audio2audio" (INFERENCE.md: + cover uses src_audio = song to cover; we pass it as ref_audio_input, no retake path). + - If Audio2Audio is enabled while task is still 'text2music', we set task to 'audio2audio'. - For any edit mode (retake/repaint/extend) we prefer to have a reference audio file and make sure ACE-Step sees a .wav path. 
If no reference is provided, we *gracefully* fall back to @@ -582,19 +641,19 @@ def _prepare_reference_audio( task_norm = (task or "text2music").strip().lower() if task_norm not in ("text2music", "retake", "repaint", "extend", "cover", "audio2audio", "lego", "extract", "complete"): task_norm = "text2music" - # Map UI task names to pipeline task: cover and audio2audio both run as retake - # (pipeline will set task to "audio2audio" when ref_audio_input is passed). + # Per docs/ACE-Step-INFERENCE.md: cover uses src_audio (song to cover) + caption (target style). + # Pass task="audio2audio" so the pipeline uses ref_audio_input only (no retake/repaint path). if task_norm in ("cover", "audio2audio"): - task_norm = "retake" + task_norm = "audio2audio" # Audio2Audio is effectively an edit of an existing clip. If the user - # left the task on "Text → music", run it as a retake under the hood. + # left the task on "Text → music" but provided ref audio, run as audio2audio. if audio2audio_enable and task_norm == "text2music": - task_norm = "retake" + task_norm = "audio2audio" # Any of the edit-style tasks imply some form of Audio2Audio or source-backed (lego/extract/complete). audio2audio_flag = bool( - audio2audio_enable or task_norm in ("retake", "repaint", "extend") + audio2audio_enable or task_norm in ("audio2audio", "retake", "repaint", "extend") ) needs_src_path = audio2audio_flag or task_norm in ("lego", "extract", "complete") diff --git a/music_forge_ui.py b/music_forge_ui.py index 369af35..d3722c7 100644 --- a/music_forge_ui.py +++ b/music_forge_ui.py @@ -265,22 +265,30 @@ def _should_filter(self, line): return False + def _parse_eta_seconds(self, time_info): + """Parse tqdm time_info like '01:34<23:38, 94.54s/it' -> remaining seconds (23*60+38).""" + if not time_info: + return None + # Match prev.map(s => { if (s.id === tempId) { return { ...s, - queuePosition: status.status === 'queued' ? 
status.queuePosition : undefined, + queuePosition: (status.status === 'queued' || status.status === 'pending_model') ? status.queuePosition : undefined, generationPercent: status.status === 'running' ? status.progressPercent : undefined, generationSteps: status.status === 'running' ? status.progressSteps : undefined, generationEtaSeconds: status.status === 'running' && status.etaSeconds != null ? status.etaSeconds : undefined, + generationStatus: status.status as Song['generationStatus'], + generationPendingReason: status.pendingReason ?? undefined, }; } return s; @@ -709,7 +712,14 @@ export default function App() { cleanupJob(job.jobId, tempId); console.error(`Job ${job.jobId} failed:`, status.error); showToast(`Generation failed: ${status.error || 'Unknown error'}`, 'error'); + } else if (status.status === 'cancelled') { + cleanupJob(job.jobId, tempId); + setSongs(prev => prev.filter(song => song.id !== tempId)); + } else if (status.status === 'pending_model') { + // Promote to queued when model becomes available (e.g. 
user installed from Settings) + generateApi.retryPending().catch(() => {}); } + // pending_model and queued: keep polling; job stays in list } catch (pollError) { console.error(`Polling error for job ${job.jobId}:`, pollError); cleanupJob(job.jobId, tempId); @@ -1057,6 +1067,7 @@ export default function App() { onNavigateToProfile={handleNavigateToProfile} onReusePrompt={handleReuse} onDelete={handleDeleteSong} + onOpenSettings={() => setShowSettingsModal(true)} /> {showRightSidebar && ( @@ -1104,6 +1115,7 @@ export default function App() { isGenerating={isGenerating} initialData={reuseData} onOpenSettings={() => setShowSettingsModal(true)} + onOpenConsoleLogs={() => setShowConsole(true)} /> @@ -1130,6 +1142,7 @@ export default function App() { onNavigateToProfile={handleNavigateToProfile} onReusePrompt={handleReuse} onDelete={handleDeleteSong} + onOpenSettings={() => setShowSettingsModal(true)} /> @@ -1265,6 +1278,7 @@ export default function App() { theme={theme} onToggleTheme={toggleTheme} onNavigateToProfile={handleNavigateToProfile} + onDownloadComplete={() => generateApi.retryPending().catch(() => {})} /> {/* Mobile Details Modal */} diff --git a/ui/components/CreatePanel.tsx b/ui/components/CreatePanel.tsx index b902906..c7ec20d 100644 --- a/ui/components/CreatePanel.tsx +++ b/ui/components/CreatePanel.tsx @@ -5,7 +5,7 @@ import { useAuth } from '../context/AuthContext'; import { generateApi, preferencesApi, aceStepModelsApi, type LoraAdapter } from '../services/api'; /** Tasks that require ACE-Step Base model only (see docs/ACE-Step-Tutorial.md). 
*/ -const TASKS_REQUIRING_BASE = ['lego', 'extract', 'complete'] as const; +const TASKS_REQUIRING_BASE = ['cover', 'lego', 'extract', 'complete'] as const; function taskRequiresBase(taskType: string): boolean { return TASKS_REQUIRING_BASE.includes(taskType as typeof TASKS_REQUIRING_BASE[number]); } @@ -31,6 +31,8 @@ interface CreatePanelProps { initialData?: { song: Song, timestamp: number } | null; /** Open Settings modal (e.g. to download required model). */ onOpenSettings?: () => void; + /** Open Console logs panel (e.g. when model is downloading in background). */ + onOpenConsoleLogs?: () => void; } /** Visible tooltip on hover (native title has delay and is unreliable). */ @@ -155,13 +157,18 @@ const VOCAL_LANGUAGES = [ // Create panel mode: Simple (description), Custom (full controls), Cover (pure cover: source + caption), Lego (add-instrument tracks) type CreateMode = 'simple' | 'custom' | 'cover' | 'lego'; -export const CreatePanel: React.FC = ({ onGenerate, isGenerating, initialData, onOpenSettings }) => { +export const CreatePanel: React.FC = ({ onGenerate, isGenerating, initialData, onOpenSettings, onOpenConsoleLogs }) => { const { isAuthenticated, token } = useAuth(); // Mode: simple | custom | cover | lego const [createMode, setCreateMode] = useState('custom'); const customMode = createMode === 'custom'; + // ACE-Step model for this generation (only installed models). Updated by workflow (e.g. base for cover) or user override. 
+ const [generationDitModel, setGenerationDitModel] = useState('turbo'); + const [installedDitModels, setInstalledDitModels] = useState>([]); + const [modelDownloadInProgress, setModelDownloadInProgress] = useState(false); + // Cover tab: pure cover (source + caption) or blend (source + style audio) const [coverCaption, setCoverCaption] = useState(''); const [coverStrength, setCoverStrength] = useState(0.8); @@ -356,6 +363,42 @@ export const CreatePanel: React.FC = ({ onGenerate, isGenerati // Fetch LoRA adapters on mount (Training output + custom_lora) useEffect(() => { fetchLoraAdapters(); }, [fetchLoraAdapters]); + // Load installed ACE-Step DiT models and sync generation model from preferences (or default base for cover/lego). + useEffect(() => { + aceStepModelsApi.list().then((list) => { + const installed = (list.dit_models || []).filter((m) => m.installed).map((m) => ({ id: m.id, label: m.label, description: m.description })); + setInstalledDitModels(installed); + preferencesApi.get().then((prefs) => { + const prefDit = (prefs.ace_step_dit_model || 'turbo').trim(); + const valid = installed.some((m) => m.id === prefDit); + setGenerationDitModel(valid ? prefDit : (installed[0]?.id || 'turbo')); + }).catch(() => { + if (installed.length) setGenerationDitModel(installed[0].id); + }); + }).catch(() => setInstalledDitModels([])); + }, []); + + // When switching to Cover or Lego, default model to base (user can override via selector). + useEffect(() => { + if ((createMode === 'cover' || createMode === 'lego') && installedDitModels.some((m) => m.id === 'base')) { + setGenerationDitModel((prev) => (prev === 'turbo' ? 'base' : prev)); + } + }, [createMode, installedDitModels]); + + // Poll model-download status when generating so we can show banner + link to console. 
+ useEffect(() => { + if (!isGenerating) { + setModelDownloadInProgress(false); + return; + } + const poll = () => { + generateApi.modelDownloadStatus().then((st) => setModelDownloadInProgress(st.in_progress)).catch(() => setModelDownloadInProgress(false)); + }; + poll(); + const t = setInterval(poll, 2000); + return () => clearInterval(t); + }, [isGenerating]); + useEffect(() => { const handleMouseMove = (e: MouseEvent) => { if (!isResizing) return; @@ -704,16 +747,23 @@ export const CreatePanel: React.FC = ({ onGenerate, isGenerati const effectiveTaskType = createMode === 'lego' ? 'lego' : createMode === 'cover' ? 'cover' : (customMode ? taskType : (sourceAudioUrl?.trim() ? 'cover' : 'text2music')); if (taskRequiresBase(effectiveTaskType)) { setLegoValidationError(''); + setCoverValidationError(''); try { const list = await aceStepModelsApi.list(); const baseInstalled = list.dit_models.some((m) => m.id === 'base' && m.installed); if (!baseInstalled) { - setLegoValidationError('Lego (and Extract/Complete) require the Base model. Open Settings to download it, then try again.'); + const msg = effectiveTaskType === 'cover' + ? 'Cover requires the Base model. Open Settings to download it, then try again.' + : 'Lego (and Extract/Complete) require the Base model. Open Settings to download it, then try again.'; + if (effectiveTaskType === 'cover') setCoverValidationError(msg); + else setLegoValidationError(msg); onOpenSettings?.(); return; } } catch (e) { - setLegoValidationError('Could not check models. Open Settings to ensure the Base model is installed.'); + const msg = 'Could not check models. 
Open Settings to ensure the Base model is installed.'; + if (effectiveTaskType === 'cover') setCoverValidationError(msg); + else setLegoValidationError(msg); onOpenSettings?.(); return; } @@ -784,6 +834,7 @@ export const CreatePanel: React.FC = ({ onGenerate, isGenerati lmBatchChunkSize, negativePrompt: negativePrompt.trim() || undefined, isFormatCaption, + aceStepDitModel: generationDitModel, }); return; } @@ -852,6 +903,7 @@ export const CreatePanel: React.FC = ({ onGenerate, isGenerati lmBatchChunkSize, negativePrompt: negativePrompt.trim() || undefined, isFormatCaption, + aceStepDitModel: generationDitModel, }); return; } @@ -927,6 +979,7 @@ export const CreatePanel: React.FC = ({ onGenerate, isGenerati lmBatchChunkSize, negativePrompt: negativePrompt.trim() || undefined, isFormatCaption, + aceStepDitModel: generationDitModel, }); } @@ -958,6 +1011,46 @@ export const CreatePanel: React.FC = ({ onGenerate, isGenerati onLoadedMetadata={(e) => setSourceDuration(e.currentTarget.duration || 0)} /> + {/* Model selector: only installed models; workflow (e.g. Cover→base) updates this; user can override. */} +
+
+ + {installedDitModels.length > 0 ? ( + + ) : ( + No models installed + )} + {(createMode === 'cover' || createMode === 'lego') && generationDitModel === 'base' && ( + (recommended for this mode) + )} +
+ {installedDitModels.length === 0 && ( + + )} +
+ + {/* Notify when pipeline is loading (may be downloading model files in background). */} + {modelDownloadInProgress && ( +
+ Model files are being downloaded or prepared. This may take a while. Do not close the app. + {onOpenConsoleLogs && ( + + )} +
+ )} + {/* Header - Mode Toggle */}
@@ -974,7 +1067,12 @@ export const CreatePanel: React.FC = ({ onGenerate, isGenerati Custom
+ {/* Inference steps (cover): up to 80 for base model (docs) */} +
+
+ Inference steps + +
+
+
+ { setInferenceSteps(Number(e.target.value)); setQualityPreset('custom'); }} + className="flex-1 h-2 bg-zinc-200 dark:bg-zinc-700 rounded-lg appearance-none cursor-pointer accent-pink-500" + /> + {inferenceSteps} +
+

1–80 (Base model used for Cover; 32–64 recommended)

+
+
+ {/* Quality preset */}
@@ -2262,20 +2384,20 @@ export const CreatePanel: React.FC = ({ onGenerate, isGenerati
- + {inferenceSteps}
{ setInferenceSteps(Number(e.target.value)); setQualityPreset('custom'); }} className="w-full h-2 bg-zinc-200 dark:bg-zinc-700 rounded-lg appearance-none cursor-pointer accent-pink-500" /> -

65 recommended for quality; base/SFT can use up to 75 steps

+

65 recommended for quality; base/cover can use up to 80 steps (INFERENCE.md)

{/* Guidance Scale */} @@ -2589,7 +2711,7 @@ export const CreatePanel: React.FC = ({ onGenerate, isGenerati {(taskType === 'cover' || taskType === 'audio2audio') && 'Transform an existing track: set a source/cover audio and describe the new style. Use Cover Strength to control how much to follow the original.'} {taskType === 'repaint' && 'Regenerate only a time segment of the source. Set start/end (seconds; -1 = end of file) and style for that section.'} {taskType === 'extend' && 'Extend the source audio. Use source audio and optional style for the continuation.'} - {(taskType === 'lego' || taskType === 'extract' || taskType === 'complete') && 'Requires ACE-Step 1.5 Base model. Lego: add new tracks to existing. Extract: separate stems. Complete: add accompaniment to a single track.'} + {(taskType === 'cover' || taskType === 'lego' || taskType === 'extract' || taskType === 'complete') && 'Requires ACE-Step 1.5 Base model. Cover: style transfer. Lego: add new tracks. Extract: separate stems. Complete: add accompaniment.'}

diff --git a/ui/components/SettingsModal.tsx b/ui/components/SettingsModal.tsx index 7414d18..1377cbd 100644 --- a/ui/components/SettingsModal.tsx +++ b/ui/components/SettingsModal.tsx @@ -1,4 +1,4 @@ -import React, { useState, useEffect } from 'react'; +import React, { useState, useEffect, useRef } from 'react'; import { X, User as UserIcon, Palette, Info, Edit3, ExternalLink, Github, FolderOpen, HardDrive, ZoomIn, Box } from 'lucide-react'; import { useAuth } from '../context/AuthContext'; import { EditProfileModal } from './EditProfileModal'; @@ -11,6 +11,8 @@ interface SettingsModalProps { theme: 'light' | 'dark'; onToggleTheme: () => void; onNavigateToProfile?: (username: string) => void; + /** Called when an ACE-Step model download finishes (so pending generation jobs can be retried). */ + onDownloadComplete?: () => void; } const ZOOM_OPTIONS = [80, 90, 100, 110, 125] as const; @@ -33,7 +35,7 @@ const ACE_STEP_LM_OPTIONS = [ { value: '4B', label: '4B' }, ] as const; -export const SettingsModal: React.FC = ({ isOpen, onClose, theme, onToggleTheme, onNavigateToProfile }) => { +export const SettingsModal: React.FC = ({ isOpen, onClose, theme, onToggleTheme, onNavigateToProfile, onDownloadComplete }) => { const { user } = useAuth(); const [isEditProfileOpen, setIsEditProfileOpen] = useState(false); const [modelsFolder, setModelsFolder] = useState(''); @@ -45,10 +47,13 @@ export const SettingsModal: React.FC = ({ isOpen, onClose, t const [aceStepDitModel, setAceStepDitModel] = useState('turbo'); const [aceStepLm, setAceStepLm] = useState('1.7B'); const [modelsSaved, setModelsSaved] = useState(false); - const [aceStepList, setAceStepList] = useState<{ dit_models: Array<{ id: string; label: string; description?: string; installed: boolean }>; lm_models: Array<{ id: string; label: string; installed: boolean }>; discovered_models?: Array<{ id: string; label: string; path: string; custom: boolean }>; acestep_download_available: boolean } | null>(null); + const 
[aceStepList, setAceStepList] = useState<{ dit_models: Array<{ id: string; label: string; description?: string; installed: boolean; size_gb?: number }>; lm_models: Array<{ id: string; label: string; installed: boolean; size_gb?: number }>; discovered_models?: Array<{ id: string; label: string; path: string; custom: boolean }>; acestep_download_available: boolean } | null>(null); const [downloadingModel, setDownloadingModel] = useState(null); const [downloadError, setDownloadError] = useState(null); const [downloadStatus, setDownloadStatus] = useState(null); + const [pendingDownload, setPendingDownload] = useState<{ id: string; label: string; sizeGb: number } | null>(null); + const [diskSpace, setDiskSpace] = useState<{ free_gb: number; total_gb: number } | null>(null); + const wasDownloadingRef = useRef(false); useEffect(() => { if (isOpen) { @@ -72,6 +77,7 @@ export const SettingsModal: React.FC = ({ isOpen, onClose, t // Poll download status while a download is running (so we show progress and know when it finishes) useEffect(() => { if (!isOpen || !downloadStatus?.running) return; + wasDownloadingRef.current = true; const interval = setInterval(() => { aceStepModelsApi.downloadStatus() .then((s) => { @@ -80,12 +86,14 @@ export const SettingsModal: React.FC = ({ isOpen, onClose, t setDownloadingModel(null); if (s.error && !s.cancelled) setDownloadError(s.error); aceStepModelsApi.list().then(setAceStepList).catch(() => {}); + if (wasDownloadingRef.current && !s.error) onDownloadComplete?.(); + wasDownloadingRef.current = false; } }) .catch(() => {}); }, 1500); return () => clearInterval(interval); - }, [isOpen, downloadStatus?.running]); + }, [isOpen, downloadStatus?.running, onDownloadComplete]); // Restrict selection to installed or discovered models: if current choice not in list, switch to first available useEffect(() => { @@ -345,20 +353,9 @@ export const SettingsModal: React.FC = ({ isOpen, onClose, t disabled={downloadStatus?.running === true} onClick={() 
=> { setDownloadError(null); - setDownloadingModel(m.id); - aceStepModelsApi.download(m.id) - .then((r) => { - if (r.error) { - setDownloadError(r.hint ? `${r.error} ${r.hint}` : r.error); - setDownloadingModel(null); - } else { - aceStepModelsApi.downloadStatus().then(setDownloadStatus); - } - }) - .catch((err) => { - setDownloadError(err?.message || 'Download failed'); - setDownloadingModel(null); - }); + const sizeGb = (m as { size_gb?: number }).size_gb ?? 8; + setPendingDownload({ id: m.id, label: m.label, sizeGb }); + aceStepModelsApi.diskSpace().then((d) => setDiskSpace(d)).catch(() => setDiskSpace(null)); }} className="text-xs px-2 py-1 rounded bg-pink-500 text-white hover:bg-pink-600 disabled:opacity-50" > @@ -381,20 +378,9 @@ export const SettingsModal: React.FC = ({ isOpen, onClose, t disabled={downloadStatus?.running === true} onClick={() => { setDownloadError(null); - setDownloadingModel(m.id); - aceStepModelsApi.download(m.id) - .then((r) => { - if (r.error) { - setDownloadError(r.error); - setDownloadingModel(null); - } else { - aceStepModelsApi.downloadStatus().then(setDownloadStatus); - } - }) - .catch((err) => { - setDownloadError(err?.message || 'Download failed'); - setDownloadingModel(null); - }); + const sizeGb = (m as { size_gb?: number }).size_gb ?? 4; + setPendingDownload({ id: m.id, label: m.label, sizeGb }); + aceStepModelsApi.diskSpace().then((d) => setDiskSpace(d)).catch(() => setDiskSpace(null)); }} className="text-xs px-2 py-1 rounded bg-pink-500 text-white hover:bg-pink-600 disabled:opacity-50" > @@ -604,6 +590,57 @@ export const SettingsModal: React.FC = ({ isOpen, onClose, t
+ {/* Confirm model download: ensure user intends to download and has space */} + {pendingDownload && ( +
setPendingDownload(null)}> +
e.stopPropagation()}> +

Download model?

+

+ {pendingDownload.label} will use approximately {pendingDownload.sizeGb} GB. +

+ {diskSpace != null && ( +

+ You have {diskSpace.free_gb} GB free. Ensure you have enough space before continuing. +

+ )} + {diskSpace != null && pendingDownload.sizeGb > 0 && diskSpace.free_gb < pendingDownload.sizeGb && ( +

Low disk space. Free at least {pendingDownload.sizeGb - diskSpace.free_gb} GB more.

+ )} +
+ + +
+
+
+ )} + setIsEditProfileOpen(false)} diff --git a/ui/components/SongList.tsx b/ui/components/SongList.tsx index d52d525..3599742 100644 --- a/ui/components/SongList.tsx +++ b/ui/components/SongList.tsx @@ -1,6 +1,6 @@ import React, { useState, useMemo, useRef, useEffect } from 'react'; import { Song } from '../types'; -import { Play, MoreHorizontal, Heart, ThumbsDown, ListPlus, Pause, Search, Filter, Check, Globe, Lock, Loader2, ThumbsUp, Share2, Video, Info, Clock } from 'lucide-react'; +import { Play, MoreHorizontal, Heart, ThumbsDown, ListPlus, Pause, Search, Filter, Check, Globe, Lock, Loader2, ThumbsUp, Share2, Video, Info, Clock, Settings } from 'lucide-react'; import { useAuth } from '../context/AuthContext'; import { SongDropdownMenu } from './SongDropdownMenu'; import { ShareModal } from './ShareModal'; @@ -21,6 +21,8 @@ interface SongListProps { onNavigateToProfile?: (username: string) => void; onReusePrompt?: (song: Song) => void; onDelete?: (song: Song) => void; + /** Open Settings (e.g. to install model when job is pending_model). */ + onOpenSettings?: () => void; } // ... existing code ... @@ -51,7 +53,8 @@ export const SongList: React.FC = ({ onShowDetails, onNavigateToProfile, onReusePrompt, - onDelete + onDelete, + onOpenSettings }) => { const { user } = useAuth(); const [searchQuery, setSearchQuery] = useState(''); @@ -283,7 +286,27 @@ const SongItem: React.FC = ({ {song.isGenerating ? (
- {song.queuePosition ? ( + {song.generationStatus === 'pending_model' ? ( + /* Waiting for model */ + <> +
+ +
+ Waiting for model + {song.generationPendingReason && ( + {song.generationPendingReason} + )} + {onOpenSettings && ( + + )} + + ) : song.queuePosition ? ( /* Queue indicator */ <>
@@ -336,7 +359,7 @@ const SongItem: React.FC = ({

- {song.title || (song.isGenerating ? (song.queuePosition ? "Queued..." : (song.generationPercent != null ? `Creating... ${Math.round(song.generationPercent)}%` : "Creating...")) : "Untitled")} + {song.title || (song.isGenerating ? (song.generationStatus === 'pending_model' ? "Waiting for model..." : song.queuePosition ? "Queued..." : (song.generationPercent != null ? `Creating... ${Math.round(song.generationPercent)}%` : "Creating...")) : "Untitled")}

v1.5 @@ -450,8 +473,8 @@ const SongItem: React.FC = ({ {/* Timestamp */}
{song.isGenerating ? ( - - {song.queuePosition ? `#${song.queuePosition}` : (song.generationPercent != null ? `${Math.round(song.generationPercent)}%` : 'Creating...')} + + {song.generationStatus === 'pending_model' ? 'Waiting for model' : song.queuePosition ? `#${song.queuePosition}` : (song.generationPercent != null ? `${Math.round(song.generationPercent)}%` : 'Creating...')} ) : song.duration}
diff --git a/ui/services/api.ts b/ui/services/api.ts index b4c4802..a3c2ce3 100644 --- a/ui/services/api.ts +++ b/ui/services/api.ts @@ -280,6 +280,8 @@ export interface AceStepModelItem { steps?: number; cfg?: boolean; exclusive_tasks?: string[]; + /** Approximate size in GB for download confirmation. */ + size_gb?: number; } export interface AceStepDiscoveredModel { @@ -312,6 +314,9 @@ export interface AceStepDownloadStatus { export const aceStepModelsApi = { list: (): Promise => api('/api/ace-step/models') as Promise, + /** Free/total disk space for models directory (for download confirmation). */ + diskSpace: (): Promise<{ free_gb: number; total_gb: number; path: string }> => + api('/api/ace-step/models/disk-space') as Promise<{ free_gb: number; total_gb: number; path: string }>, download: (model: string): Promise<{ ok?: boolean; started?: boolean; error?: string; path?: string; hint?: string }> => api('/api/ace-step/models/download', { method: 'POST', body: { model } }), downloadStatus: (): Promise => @@ -322,8 +327,9 @@ export const aceStepModelsApi = { export interface GenerationJob { jobId: string; - status: 'pending' | 'queued' | 'running' | 'succeeded' | 'failed'; + status: 'pending' | 'queued' | 'running' | 'succeeded' | 'failed' | 'pending_model'; queuePosition?: number; + pendingReason?: string | null; etaSeconds?: number; progressPercent?: number; progressSteps?: string; @@ -355,9 +361,17 @@ export const generateApi = { cancelJob: (jobId: string, token: string): Promise<{ cancelled: boolean; jobId: string; message: string }> => api(`/api/generate/cancel/${jobId}`, { method: 'POST', token }), + /** Promote pending_model jobs to queued when model is available; start first queued job. Call after model download. 
*/ + retryPending: (): Promise<{ ok: boolean; promoted: number; startedJobId?: string }> => + api('/api/generate/retry-pending', { method: 'POST' }) as Promise<{ ok: boolean; promoted: number; startedJobId?: string }>, + getHistory: (token: string): Promise<{ jobs: GenerationJob[] }> => api('/api/generate/history', { token }), + /** Whether the generation pipeline is loading (may be downloading model files). Show banner + link to console. */ + modelDownloadStatus: (): Promise<{ in_progress: boolean; message?: string }> => + api('/api/generate/model-download-status') as Promise<{ in_progress: boolean; message?: string }>, + /** List LoRA adapters (Training output and custom_lora folder). */ getLoraAdapters: (): Promise<{ adapters: LoraAdapter[] }> => api('/api/generate/lora_adapters'), diff --git a/ui/types.ts b/ui/types.ts index f0e0364..2c34a7e 100644 --- a/ui/types.ts +++ b/ui/types.ts @@ -11,6 +11,9 @@ export interface Song { generationPercent?: number; generationSteps?: string; generationEtaSeconds?: number; + /** When status is pending_model: job is waiting for the required model to be installed. */ + generationStatus?: 'queued' | 'running' | 'succeeded' | 'failed' | 'pending_model' | 'cancelled'; + generationPendingReason?: string | null; tags: string[]; audioUrl?: string; isPublic?: boolean; @@ -116,6 +119,8 @@ export interface GenerationParams { isFormatCaption?: boolean; loraNameOrPath?: string; loraWeight?: number; + /** Override DiT model for this job (e.g. from Generation tab selector). Only installed models. 
*/ + aceStepDitModel?: string; } export interface PlayerState { From 26aa3e768bf78c6aba248026d4d89b313449ddc0 Mon Sep 17 00:00:00 2001 From: E Date: Sun, 8 Feb 2026 16:05:37 +0100 Subject: [PATCH 2/3] Lego mode: defaults from ACE-Step-1.5 #117, caption/instruction handling, docs - Backend: ref_audio_strength=1.0 for lego (avoid MPS crash), force thinking=False, shift=3.0 for lego; instruction + caption prompt build (no duplicate/trailing comma) - generate_ace: plumb shift param (3.0 lego / 6.0 default) - UI: lego backing default 1.0, caption optional with tooltip, send style=caption only - docs/LEGO-MODE.md: known limits and defaults; docs/ace-step-skills: ACE-Step skills reference (SKILL.md, music-creation-guide.md) Co-authored-by: Cursor --- api/generate.py | 28 +- docs/LEGO-MODE.md | 32 ++ docs/ace-step-skills/README.md | 17 + docs/ace-step-skills/SKILL.md | 356 +++++++++++++++++++ docs/ace-step-skills/music-creation-guide.md | 350 ++++++++++++++++++ generate_ace.py | 6 +- ui/components/CreatePanel.tsx | 22 +- 7 files changed, 795 insertions(+), 16 deletions(-) create mode 100644 docs/LEGO-MODE.md create mode 100644 docs/ace-step-skills/README.md create mode 100644 docs/ace-step-skills/SKILL.md create mode 100644 docs/ace-step-skills/music-creation-guide.md diff --git a/api/generate.py b/api/generate.py index 3f8d7cc..99699d1 100644 --- a/api/generate.py +++ b/api/generate.py @@ -298,13 +298,17 @@ def _run_generation(job_id: str) -> None: bpm = None except (TypeError, ValueError): bpm = None - # Lego/extract/complete: instruction (uppercase track) + caption appended with comma. + # Lego/extract/complete: instruction (uppercase track) + optional caption. Instruction is auto (e.g. "Generate the GUITAR track..."); caption is user style (key, BPM, tone). # No metas — BPM/key/timesignature should match the input backing. 
if task in ("lego", "extract", "complete"): instruction = _uppercase_track_in_instruction( instruction or "Generate an instrument track based on the audio context:" ) - prompt = (instruction.rstrip(":").strip() + ", " + (caption or "").strip()).strip() if (instruction or caption) else instruction + cap = (caption or "").strip() + if not cap: + prompt = instruction.rstrip(":").strip() + else: + prompt = (instruction.rstrip(":").strip() + ", " + cap).strip() if not prompt: prompt = instruction or "Generate an instrument track based on the audio context" title = (params.get("title") or "Untitled").strip() or "Track" @@ -343,12 +347,13 @@ def _run_generation(job_id: str) -> None: # When reference/source audio is provided, enable Audio2Audio so ACE-Step uses it (cover/retake/repaint/lego). # Defaults aligned with ACE-Step-MCP (ref_audio_strength 0.5) and cover/retake UX (strong source → 0.8). - # Lego/extract/complete: low ref_audio_strength so output follows prompt (new instrument), not copy of backing. + # Lego/extract/complete: default ref_audio_strength=1.0 to avoid MPS crash (batch dim mismatch when <1.0). + # See https://github.com/ace-step/ACE-Step-1.5/issues/117 — lower values can improve "new instrument" feel but crash on Apple Silicon. # See docs/ACE-Step-INFERENCE.md: audio_cover_strength 1.0 = strong adherence; lower = more prompt influence. 
audio2audio_enable = bool(src_audio_path) ref_default = 0.8 if task in ("cover", "retake") else (0.5 if task == "audio2audio" else 0.7) if task in ("lego", "extract", "complete"): - ref_default = 0.25 # low strength so output follows prompt (instrument) while matching backing timing + ref_default = 1.0 # 1.0 avoids MPS crash; user can lower via legoBackingInfluence if not on Apple Silicon # audio_cover_strength per ACE-Step; lego/cover blend use specific overrides when set ref_audio_strength = params.get("legoBackingInfluence") if task in ("lego", "extract", "complete") else None if ref_audio_strength is None and cover_blend: @@ -381,6 +386,15 @@ def _run_generation(job_id: str) -> None: retake_variance = 0.2 retake_variance = max(0.0, min(1.0, retake_variance)) + # Shift (timestep): 3.0 recommended for lego/timing; 6.0 pipeline default for others. See ACE-Step-1.5 issue #117. + try: + shift_val = float(params.get("shift") or params.get("shiftFactor") or 0) + except (TypeError, ValueError): + shift_val = 0.0 + if shift_val <= 0: + shift_val = 3.0 if task in ("lego", "extract", "complete") else 6.0 + shift_val = max(0.1, min(10.0, shift_val)) + # LoRA adapter (optional): path or folder name under custom_lora lora_name_or_path = (params.get("loraNameOrPath") or params.get("lora_name_or_path") or "").strip() try: @@ -389,8 +403,11 @@ def _run_generation(job_id: str) -> None: lora_weight = 0.75 lora_weight = max(0.0, min(2.0, lora_weight)) - # Thinking / LM / CoT (passed through so pipeline or future LM path can use them) + # Thinking / LM / CoT (passed through so pipeline or future LM path can use them). + # Lego/extract/complete: force thinking=False so src_audio drives context; thinking=True overrides with LLM codes (issue #117). 
thinking = bool(params.get("thinking", False)) + if task in ("lego", "extract", "complete"): + thinking = False use_cot_metas = bool(params.get("useCotMetas", True)) use_cot_caption = bool(params.get("useCotCaption", True)) # Lego/extract/complete: instruction must stay verbatim ("Generate the X track based on the audio context:"). @@ -466,6 +483,7 @@ def _run_generation(job_id: str) -> None: lora_weight=lora_weight, cancel_check=cancel_check, vocal_language=vocal_lang or "", + shift=shift_val, thinking=thinking, use_cot_metas=use_cot_metas, use_cot_caption=use_cot_caption, diff --git a/docs/LEGO-MODE.md b/docs/LEGO-MODE.md new file mode 100644 index 0000000..326cf90 --- /dev/null +++ b/docs/LEGO-MODE.md @@ -0,0 +1,32 @@ +# Lego Mode (ACE-Step 1.5) + +Lego mode adds a new instrument track on top of backing audio (e.g. add guitar to a beat). It requires the **Base** DiT model. + +## Known limitations and workarounds + +We align with findings from [ACE-Step-1.5 issue #117](https://github.com/ace-step/ACE-Step-1.5/issues/117) (BPM/timing drift and MPS crashes): + +1. **Timing drift** + Generated tracks are not strictly BPM-locked to the source; onsets can drift (20–80 ms). Workarounds that help: + - Match **duration** to the source (e.g. 4 bars at 135 BPM → duration ≈ 7.1 s: `4 * (60/135) * 4` for 4/4). + - Use **shorter segments** (e.g. 4 bars) then duplicate if needed; less time for drift. + - We use **shift=3.0** for lego (recommended in the issue for better timing vs shift=1.0). + +2. **Apple Silicon (MPS)** + On Mac: + - **`ref_audio_strength` (backing influence) < 1.0** can crash with a batch dimension mismatch at the cover→text2music transition. We **default to 1.0** for lego so Apple Silicon users don’t hit this. Lower values (0.2–0.5) can improve “new instrument” feel on non-MPS. + - **Thinking (LM)** is **disabled for lego** so the backing drives context; with thinking on, LLM-generated codes can override the source and hurt timing/context. + +3. 
**Caption and BPM** + Include style, key, and BPM in the caption (e.g. “electric guitar, C major, 135 BPM, 4 bars”) and set **BPM** in the API so metadata matches the backing. + +## AceForge defaults (lego) + +| Parameter | Default | Note | +|------------------------|--------|------| +| `ref_audio_strength` | 1.0 | Avoids MPS crash; UI “Backing influence” | +| `thinking` | false | Forced off for lego so src_audio drives context | +| `shift` | 3.0 | Better timing than 1.0/6.0 for lego | +| `use_cot_caption` | false | Keep instruction verbatim (“Generate the X track…”) | + +Users can still lower backing influence on non-Apple Silicon if they want more “new instrument” and accept the risk of drift or (on MPS) crash. diff --git a/docs/ace-step-skills/README.md b/docs/ace-step-skills/README.md new file mode 100644 index 0000000..64ec767 --- /dev/null +++ b/docs/ace-step-skills/README.md @@ -0,0 +1,17 @@ +# ACE-Step Skills (reference knowledge) + +This folder contains reference material from the official **ACE-Step Skills** repository, used as knowledge for AceForge development and for aligning with ACE-Step concepts (caption, lyrics, task types, API parameters). + +**Source:** [ace-step/ace-step-skills](https://github.com/ace-step/ace-step-skills) — `skills/acestep/` +**License:** See the upstream repository. + +## Contents + +| File | Description | +|------|-------------| +| [SKILL.md](./SKILL.md) | ACE-Step skill definition: API usage, generation modes, parameters, config. | +| [music-creation-guide.md](./music-creation-guide.md) | Music creation guide: caption, lyrics, structure tags, metadata, duration. | + +## Note for AceForge + +AceForge runs its own backend and API (Flask, `api/generate.py`, etc.), not the standalone ACE-Step API server on port 8001. The *concepts* (caption vs lyrics, task types, parameters, music-creation practices) still apply and are referenced when implementing or documenting AceForge features. 
diff --git a/docs/ace-step-skills/SKILL.md b/docs/ace-step-skills/SKILL.md new file mode 100644 index 0000000..141ed68 --- /dev/null +++ b/docs/ace-step-skills/SKILL.md @@ -0,0 +1,356 @@ +--- +name: acestep +description: Use ACE-Step API to generate music, edit songs, and remix music. Supports text-to-music, lyrics generation, audio continuation, and audio repainting. Use this skill when users mention generating music, creating songs, music production, remix, or audio continuation. +allowed-tools: Read, Write, Bash, Skill +--- + +# ACE-Step Music Generation Skill + +Use ACE-Step V1.5 API for music generation. Script: `scripts/acestep.sh` (requires curl + jq). + +## Prerequisites - ACE-Step API Service + +**IMPORTANT**: This skill requires the ACE-Step API server to be running. + +### Required Dependencies + +The `scripts/acestep.sh` script requires the following tools: + +**1. curl** - For making HTTP requests to the API +**2. jq** - For parsing JSON responses + +#### Check Dependencies + +Before using this skill, verify that the required tools are installed: + +```bash +# Check curl +curl --version + +# Check jq +jq --version +``` + +#### Installing jq + +If jq is not installed, the script will attempt to install it automatically. If automatic installation fails, install manually: + +**Windows:** +```bash +# Using Chocolatey +choco install jq + +# Or download from: https://jqlang.github.io/jq/download/ +# Extract jq.exe and add to PATH +``` + +**macOS:** +```bash +# Using Homebrew +brew install jq + +# Using MacPorts +port install jq +``` + +**Linux:** +```bash +# Debian/Ubuntu +sudo apt-get install jq + +# Fedora/RHEL/CentOS +sudo yum install jq +# or +sudo dnf install jq + +# Arch Linux +sudo pacman -S jq +``` + +**Verification:** +```bash +jq --version +# Should output: jq-1.x +``` + +If user reports jq installation issues, guide them through manual installation for their platform. + +### Before First Use + +**Ask the user about their setup:** + +1. 
**"Do you have ACE-Step API service configured and running?"** + + If **YES**: + - Verify the API endpoint: `curl -s http://127.0.0.1:8001/health` + - If using remote service, ask for the API URL and update `scripts/config.json` + - Proceed with music generation + + If **NO** or **NOT SURE**: + - Ask: "Do you have ACE-Step installed?" + + **If installed but not running**: + - Use the acestep-docs skill to help them start the service + - Guide them through startup process + + **If not installed**: + - Offer to help download and install ACE-Step + - Ask: "Would you like to use the Windows portable package or install from source?" + - Use acestep-docs skill to guide through installation + +### Service Configuration + +**Local Service (Default):** +```json +{ + "api_url": "http://127.0.0.1:8001", + "api_key": "" +} +``` + +**Remote Service:** +```json +{ + "api_url": "http://your-server-ip:8001", + "api_key": "your-api-key-if-needed" +} +``` + +To configure remote service, update `scripts/config.json` or use: +```bash +cd {skill_directory}/scripts/ +./acestep.sh config --set api_url "http://remote-server:8001" +./acestep.sh config --set api_key "your-key" +``` + +### Using acestep-docs Skill for Setup Help + +**IMPORTANT**: For installation and startup, always use the acestep-docs skill to get complete and accurate guidance. + +When user needs help with installation or startup, invoke the acestep-docs skill: + +``` +Use the Skill tool to invoke: acestep-docs +``` + +**DO NOT provide simplified startup commands** - each user's environment may be different. Always guide them to use acestep-docs for proper setup. + +### Health Check + +**To verify if service is running:** +```bash +curl http://127.0.0.1:8001/health +# Should return: {"status":"ok",...} +``` + +If health check fails, use acestep-docs skill to help user start the service properly. + +--- + +**WORKFLOW**: For user requests requiring vocals, you should: +1. 
Consult [Music Creation Guide](./music-creation-guide.md) for lyrics writing, caption creation, duration/BPM/key selection +2. Write complete, well-structured lyrics yourself based on the guide +3. Generate using Caption mode with `-c` and `-l` parameters + +Only use Simple/Random mode (`-d` or `random`) for quick inspiration or instrumental exploration. + +## Output Files + +After generation, the script automatically saves results to the `acestep_output` folder in the project root (same level as `.claude`): + +``` +project_root/ +├── .claude/ +│ └── skills/acestep/... +├── acestep_output/ # Output directory +│ ├──.json # Complete task result (JSON) +│ ├── _1.mp3 # First audio file +│ ├── _2.mp3 # Second audio file (if batch_size > 1) +│ └── ... +└── ... +``` + +### JSON Result Structure + +**Important**: When LM enhancement is enabled (`use_format=true`), the final synthesized content may differ from your input. Check the JSON file for actual values: + +| Field | Description | +|-------|-------------| +| `prompt` | **Actual caption** used for synthesis (may be LM-enhanced) | +| `lyrics` | **Actual lyrics** used for synthesis (may be LM-enhanced) | +| `metas.prompt` | Original input caption | +| `metas.lyrics` | Original input lyrics | +| `metas.bpm` | BPM used | +| `metas.keyscale` | Key scale used | +| `metas.duration` | Duration in seconds | +| `generation_info` | Detailed timing and model info | +| `seed_value` | Seeds used (for reproducibility) | +| `lm_model` | LM model name | +| `dit_model` | DiT model name | + +To get the actual synthesized lyrics, parse the JSON and read the top-level `lyrics` field, not `metas.lyrics`. 
+ +## Script Commands + +**CRITICAL - Complete Lyrics Input**: When providing lyrics via the `-l` parameter, you MUST pass ALL lyrics content WITHOUT any omission: +- If user provides lyrics, pass the ENTIRE text they give you +- If you generate lyrics yourself, pass the COMPLETE lyrics you created +- NEVER truncate, shorten, or pass only partial lyrics +- Missing lyrics will result in incomplete or incoherent songs + +**Music Parameters**: Refer to [Music Creation Guide](./music-creation-guide.md) for how to calculate duration, choose BPM, key scale, and time signature. + +```bash +# need to cd skills path +cd {project_root}/{.claude or .codex}/skills/acestep/ + +# Caption mode - RECOMMENDED: Write lyrics first, then generate +./scripts/acestep.sh generate -c "Electronic pop, energetic synths" -l "[Verse] Your complete lyrics +[Chorus] Full chorus here..." --duration 120 --bpm 128 + +# Instrumental only +./scripts/acestep.sh generate "Jazz with saxophone" + +# Quick exploration (Simple/Random mode) +./scripts/acestep.sh generate -d "A cheerful song about spring" +./scripts/acestep.sh random + +# Options +./scripts/acestep.sh generate "Rock" --duration 60 --batch 2 +./scripts/acestep.sh generate "EDM" --no-thinking # Faster + +# Other commands +./scripts/acestep.sh status +./scripts/acestep.sh health +./scripts/acestep.sh models +``` + +## Configuration + +**Important**: Configuration follows this priority (high to low): + +1. **Command line arguments** > **config.json defaults** +2. User-specified parameters **temporarily override** defaults but **do not modify** config.json +3. 
Only `config --set` command **permanently modifies** config.json + +### Default Config File (`scripts/config.json`) + +```json +{ + "api_url": "http://127.0.0.1:8001", + "api_key": "", + "generation": { + "thinking": true, + "use_format": false, + "use_cot_caption": true, + "use_cot_language": false, + "batch_size": 1, + "audio_format": "mp3", + "vocal_language": "en" + } +} +``` + +| Option | Default | Description | +|--------|---------|-------------| +| `api_url` | `http://127.0.0.1:8001` | API server address | +| `api_key` | `""` | API authentication key (optional) | +| `generation.thinking` | `true` | Enable 5Hz LM (higher quality, slower) | +| `generation.audio_format` | `mp3` | Output format (mp3/wav/flac) | +| `generation.vocal_language` | `en` | Vocal language | + +## API Reference + +All responses wrapped: `{"data":, "code": 200, "error": null, "timestamp": ...}` + +| Endpoint | Method | Description | +|----------|--------|-------------| +| `/health` | GET | Health check | +| `/release_task` | POST | Create generation task | +| `/query_result` | POST | Query task status, body: `{"task_id_list": ["id"]}` | +| `/v1/models` | GET | List available models | +| `/v1/audio?path={path}` | GET | Download audio file | + +### Query Result Response + +```json +{ + "data": [{ + "task_id": "xxx", + "status": 1, + "result": "[{\"file\":\"/v1/audio?path=...\",\"metas\":{\"bpm\":120,\"duration\":60,\"keyscale\":\"C Major\"}}]" + }] +} +``` + +Status codes: `0` = processing, `1` = success, `2` = failed + +## Request Parameters (`/release_task`) + +Parameters can be placed in `param_obj` object. 
+ +### Generation Modes + +| Mode | Usage | When to Use | +|------|-------|-------------| +| **Caption** (Recommended) | `generate -c "style" -l "lyrics"` | For vocal songs - write lyrics yourself first | +| **Simple** | `generate -d "description"` | Quick exploration, LM generates everything | +| **Random** | `random` | Random generation for inspiration | + +### Core Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `prompt` | string | "" | Music style description (Caption mode) | +| `lyrics` | string | "" | **Full lyrics content** - Pass ALL lyrics without omission. Use `[inst]` for instrumental. Partial/truncated lyrics = incomplete songs | +| `sample_mode` | bool | false | Enable Simple/Random mode | +| `sample_query` | string | "" | Description for Simple mode | +| `thinking` | bool | false | Enable 5Hz LM for audio code generation | +| `use_format` | bool | false | Use LM to enhance caption/lyrics | +| `model` | string | - | DiT model name | +| `batch_size` | int | 1 | Number of audio files to generate | + +### Music Attributes + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `audio_duration` | float | - | Duration in seconds | +| `bpm` | int | - | Tempo (beats per minute) | +| `key_scale` | string | "" | Key (e.g. "C Major") | +| `time_signature` | string | "" | Time signature (e.g. "4/4") | +| `vocal_language` | string | "en" | Language code (en, zh, ja, etc.) 
| +| `audio_format` | string | "mp3" | Output format (mp3/wav/flac) | + +### Generation Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `inference_steps` | int | 8 | Diffusion steps | +| `guidance_scale` | float | 7.0 | CFG scale | +| `seed` | int | -1 | Random seed (-1 for random) | +| `infer_method` | string | "ode" | Diffusion method (ode/sde) | + +### Audio Task Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `task_type` | string | "text2music" | text2music / continuation / repainting | +| `src_audio_path` | string | - | Source audio for continuation | +| `repainting_start` | float | 0.0 | Repainting start position (seconds) | +| `repainting_end` | float | - | Repainting end position (seconds) | + +### Example Request (Simple Mode) + +```json +{ + "sample_mode": true, + "sample_query": "A cheerful pop song about spring", + "thinking": true, + "param_obj": { + "duration": 60, + "bpm": 120, + "language": "en" + }, + "batch_size": 2 +} +``` diff --git a/docs/ace-step-skills/music-creation-guide.md b/docs/ace-step-skills/music-creation-guide.md new file mode 100644 index 0000000..3a52f6a --- /dev/null +++ b/docs/ace-step-skills/music-creation-guide.md @@ -0,0 +1,350 @@ +# ACE-Step Music Creation Guide + +> This guide contains professional music creation knowledge extracted from ACE-Step Tutorial. Use this as reference when creating music with ACE-Step. + +--- + +## Input Control: What Do You Want? + +This is the part where you communicate "creative intent" with the model—what kind of music you want to generate. + +| Category | Parameter | Function | +|----------|-----------|----------| +| **Task Type** | `task_type` | Determines generation mode: text2music, cover, repaint, lego, extract, complete | +| **Text Input** | `caption` | Description of overall music elements: style, instruments, emotion, atmosphere, timbre, vocal gender, progression, etc. 
| +| | `lyrics` | Temporal element description: lyric content, music structure evolution, vocal changes, vocal/instrument performance style, start/end style, articulation, etc. (use `[Instrumental]` for instrumental music) | +| **Music Metadata** | `bpm` | Tempo (30–300) | +| | `keyscale` | Key (e.g., C Major, Am) | +| | `timesignature` | Time signature (4/4, 3/4, 6/8) | +| | `vocal_language` | Vocal language | +| | `duration` | Target duration (seconds) | +| **Audio Reference** | `reference_audio` | Global reference for timbre or style (for cover, style transfer) | +| | `src_audio` | Source audio for non-text2music tasks (text2music defaults to silence, no input needed) | +| | `audio_codes` | Semantic codes input to model in Cover mode (advanced: reuse codes for variants, convert songs to codes for extension, combine like DJ mixing) | +| **Interval Control** | `repainting_start/end` | Time interval for operations (repaint redraw area / lego new track area) | + +--- + +## About Caption: The Most Important Input + +**Caption is the most important factor affecting generated music.** + +It supports multiple input formats: simple style words, comma-separated tags, complex natural language descriptions. The model has been trained to be compatible with various formats, ensuring text format doesn't significantly affect model performance. 
+ +### Common Dimensions for Caption Writing + +| Dimension | Examples | +|-----------|----------| +| **Style/Genre** | pop, rock, jazz, electronic, hip-hop, R&B, folk, classical, lo-fi, synthwave | +| **Emotion/Atmosphere** | melancholic, uplifting, energetic, dreamy, dark, nostalgic, euphoric, intimate | +| **Instruments** | acoustic guitar, piano, synth pads, 808 drums, strings, brass, electric bass | +| **Timbre Texture** | warm, bright, crisp, muddy, airy, punchy, lush, raw, polished | +| **Era Reference** | 80s synth-pop, 90s grunge, 2010s EDM, vintage soul, modern trap | +| **Production Style** | lo-fi, high-fidelity, live recording, studio-polished, bedroom pop | +| **Vocal Characteristics** | female vocal, male vocal, breathy, powerful, falsetto, raspy, choir | +| **Speed/Rhythm** | slow tempo, mid-tempo, fast-paced, groovy, driving, laid-back | +| **Structure Hints** | building intro, catchy chorus, dramatic bridge, fade-out ending | + +### Practical Principles for Caption Writing + +1. **Specific beats vague** — "sad piano ballad with female breathy vocal" works better than "a sad song." + +2. **Combine multiple dimensions** — Single-dimension descriptions give the model too much room to play; combining style+emotion+instruments+timbre can more precisely anchor your desired direction. + +3. **Use references well** — "in the style of 80s synthwave" or "reminiscent of Bon Iver" can quickly convey complex aesthetic preferences. + +4. **Texture words are useful** — Adjectives like warm, crisp, airy, punchy can influence mixing and timbre tendencies. + +5. **Don't pursue perfect descriptions** — Caption is a starting point, not an endpoint. Write a general direction first, then iterate based on results. + +6. **Description granularity determines freedom** — More omitted descriptions give the model more room to play, more random factor influence; more detailed descriptions constrain the model more. Decide specificity based on your needs—want surprises? 
Write less. Want control? Write more details. + +7. **Avoid conflicting words** — Conflicting style combinations easily lead to degraded output. For example, wanting both "classical strings" and "hardcore metal" simultaneously—the model will try to fuse but usually not ideal. + + **Ways to resolve conflicts:** + - **Repetition reinforcement** — Strengthen the elements you want more in mixed styles by repeating certain words + - **Conflict to evolution** — Transform style conflicts into temporal style evolution. For example: "Start with soft strings, middle becomes noisy dynamic metal rock, end turns to hip-hop"—this gives the model clear guidance on how to handle different styles, rather than mixing them into a mess + +--- + +## About Lyrics: The Temporal Script + +If Caption describes the music's "overall portrait"—style, atmosphere, timbre—then **Lyrics is the music's "temporal script"**, controlling how music unfolds over time. + +Lyrics is not just lyric content. It carries: +- The lyric text itself +- **Structure tags** ([Verse], [Chorus], [Bridge]...) +- **Vocal style hints** ([raspy vocal], [whispered]...) +- **Instrumental sections** ([guitar solo], [drum break]...) +- **Energy changes** ([building energy], [explosive drop]...) 
+ +### Common Structure Tags + +| Category | Tag | Description | +|----------|-----|-------------| +| **Basic Structure** | `[Intro]` | Opening, establish atmosphere | +| | `[Verse]` / `[Verse 1]` | Verse, narrative progression | +| | `[Pre-Chorus]` | Pre-chorus, build energy | +| | `[Chorus]` | Chorus, emotional climax | +| | `[Bridge]` | Bridge, transition or elevation | +| | `[Outro]` | Ending, conclusion | +| **Dynamic Sections** | `[Build]` | Energy gradually rising | +| | `[Drop]` | Electronic music energy release | +| | `[Breakdown]` | Reduced instrumentation, space | +| **Instrumental Sections** | `[Instrumental]` | Pure instrumental, no vocals | +| | `[Guitar Solo]` | Guitar solo | +| | `[Piano Interlude]` | Piano interlude | +| **Special Tags** | `[Fade Out]` | Fade out ending | +| | `[Silence]` | Silence | + +### Combining Tags: Use Moderately + +Structure tags can be combined with `-` for finer control: + +``` +[Chorus - anthemic] +This is the chorus lyrics +Dreams are burning + +[Bridge - whispered] +Whisper those words softly +``` + +⚠️ **Note: Don't stack too many tags.** + +``` +❌ Not recommended: +[Chorus - anthemic - stacked harmonies - high energy - powerful - epic] + +✅ Recommended: +[Chorus - anthemic] +``` + +**Principle**: Keep structure tags concise; put complex style descriptions in Caption. + +### ⚠️ Key: Maintain Consistency Between Caption and Lyrics + +**Models are not good at resolving conflicts.** If descriptions in Caption and Lyrics contradict, the model gets confused and output quality decreases. + +**Checklist:** +- Instruments in Caption ↔ Instrumental section tags in Lyrics +- Emotion in Caption ↔ Energy tags in Lyrics +- Vocal description in Caption ↔ Vocal control tags in Lyrics + +Think of Caption as "overall setting" and Lyrics as "shot script"—they should tell the same story. 
+ +### Vocal Control Tags + +| Tag | Effect | +|-----|--------| +| `[raspy vocal]` | Raspy, textured vocals | +| `[whispered]` | Whispered | +| `[falsetto]` | Falsetto | +| `[powerful belting]` | Powerful, high-pitched singing | +| `[spoken word]` | Rap/recitation | +| `[harmonies]` | Layered harmonies | +| `[call and response]` | Call and response | +| `[ad-lib]` | Improvised embellishments | + +### Energy and Emotion Tags + +| Tag | Effect | +|-----|--------| +| `[high energy]` | High energy, passionate | +| `[low energy]` | Low energy, restrained | +| `[building energy]` | Increasing energy | +| `[explosive]` | Explosive energy | +| `[melancholic]` | Melancholic | +| `[euphoric]` | Euphoric | +| `[dreamy]` | Dreamy | +| `[aggressive]` | Aggressive | + +### Lyric Text Writing Tips + +**1. Control Syllable Count** + +**6-10 syllables per line** usually works best. The model aligns syllables to beats—if one line has 6 syllables and the next has 14, rhythm becomes strange. + +**Tip**: Keep similar syllable counts for lines in the same position (e.g., first line of each verse) (±1-2 deviation). + +**2. Use Case to Control Intensity** + +Uppercase indicates stronger vocal intensity: + +``` +[Verse] +walking through the empty streets (normal intensity) + +[Chorus] +WE ARE THE CHAMPIONS! (high intensity, shouting) +``` + +**3. Use Parentheses for Background Vocals** + +``` +[Chorus] +We rise together (together) +Into the light (into the light) +``` + +Content in parentheses is processed as background vocals or harmonies. + +**4. Extend Vowels** + +You can extend sounds by repeating vowels: + +``` +Feeeling so aliiive +``` + +But use cautiously—effects are unstable, sometimes ignored or mispronounced. + +**5. 
Clear Section Separation** + +Separate each section with blank lines: + +``` +[Verse 1] +First verse lyrics +Continue first verse + +[Chorus] +Chorus lyrics +Chorus continues +``` + +### Avoiding "AI-flavored" Lyrics + +These characteristics make lyrics seem mechanical and lack human touch: + +| Red Flag 🚩 | Description | +|-------------|-------------| +| **Adjective stacking** | "neon skies, electric hearts, endless dreams"—filling a section with vague imagery | +| **Rhyme chaos** | Inconsistent rhyme patterns, or forced rhymes causing semantic breaks | +| **Blurred section boundaries** | Lyric content crosses structure tags, Verse content "flows" into Chorus | +| **No breathing room** | Each line too long, can't sing in one breath | +| **Mixed metaphors** | First verse uses water imagery, second suddenly becomes fire, third is flying—listeners can't anchor | + +**Metaphor discipline**: Stick to one core metaphor per song, exploring its multiple aspects. + +--- + +## About Music Metadata: Optional Fine Control + +**Most of the time, you don't need to manually set metadata.** + +When you enable `thinking` mode (or enable `use_cot_metas`), LM automatically infers appropriate BPM, key, time signature, etc. based on your Caption and Lyrics. This is usually good enough. + +But if you have clear ideas, you can also manually control them: + +| Parameter | Control Range | Description | +|-----------|--------------|-------------| +| `bpm` | 30–300 | Tempo. Common distribution: slow songs 60–80, mid-tempo 90–120, fast songs 130–180 | +| `keyscale` | Key | e.g., `C Major`, `Am`, `F# Minor`. Affects overall pitch and emotional color | +| `timesignature` | Time signature | `4/4` (most common), `3/4` (waltz), `6/8` (swing feel) | +| `vocal_language` | Language | Vocal language. LM usually auto-detects from lyrics | +| `duration` | Seconds | Target duration. 
Actual generation may vary slightly | + +### Understanding Control Boundaries + +These parameters are **guidance** rather than **precise commands**: + +- **BPM**: Common range (60–180) works well; extreme values (like 30 or 280) have less training data, may be unstable +- **Key**: Common keys (C, G, D, Am, Em) are stable; rare keys may be ignored or shifted +- **Time signature**: `4/4` is most reliable; `3/4`, `6/8` usually OK; complex signatures (like `5/4`, `7/8`) are advanced, effects vary by style +- **Duration**: Short songs (30–60s) and medium length (2–4min) are stable; very long generation may have repetition or structure issues + +### When Do You Need Manual Settings? + +| Scenario | Suggestion | +|----------|------------| +| Daily generation | Don't worry, let LM auto-infer | +| Clear tempo requirement | Manually set `bpm` | +| Specific style (e.g., waltz) | Manually set `timesignature=3/4` | +| Need to match other material | Manually set `bpm` and `duration` | +| Pursue specific key color | Manually set `keyscale` | + +**Tip**: If you manually set metadata but generation results clearly don't match—check if there's conflict with Caption/Lyrics. For example, Caption says "slow ballad" but `bpm=160`, the model gets confused. + +**Recommended Practice**: Don't write tempo, BPM, key, and other metadata information in Caption. These should be set through dedicated metadata parameters (`bpm`, `keyscale`, `timesignature`, etc.), not described in Caption. Caption should focus on style, emotion, instruments, timbre, and other musical characteristics, while metadata information is handled by corresponding parameters. 
+ +--- + +## Duration Calculation Guidelines + +When creating music, you MUST calculate appropriate duration based on lyrics content and song structure: + +### Estimation Method + +- **Per line of lyrics**: 3-5 seconds +- **Intro/Outro**: 5-10 seconds each +- **Instrumental sections**: 5-15 seconds each +- **Typical song structures**: + - 2 verses + 2 choruses: 120-150 seconds minimum + - 2 verses + 2 choruses + bridge: 180-240 seconds minimum + - Full song with intro/outro: 210-270 seconds (3.5-4.5 minutes) + +### Common Pitfalls + +❌ **DON'T**: Set duration too short for the lyrics amount +- Example: 10 lines of lyrics with 30 seconds → rushed, compressed + +✅ **DO**: Calculate realistic duration +- Example: 10 lines of lyrics → ~40 seconds of vocals + 20 seconds intro/outro = 60 seconds minimum + +### BPM and Duration Relationship + +The BPM affects how quickly lyrics are sung: +- **Slower BPM (60-80)**: Need MORE duration for same lyrics +- **Medium BPM (100-130)**: Standard duration +- **Faster BPM (150-180)**: Can fit more lyrics in less time, but still need breathing room + +**Rule of thumb**: When in doubt, estimate longer rather than shorter. A song that's too short will feel rushed and incomplete. + +--- + +## Complete Example + +Assuming Caption is: `female vocal, piano ballad, emotional, intimate atmosphere, strings, building to powerful chorus` + +``` +[Intro - piano] + +[Verse 1] +月光洒在窗台上 +我听见你的呼吸 +城市在远处沉睡 +只有我们还醒着 + +[Pre-Chorus] +这一刻如此安静 +却藏着汹涌的心 + +[Chorus - powerful] +让我们燃烧吧 +像夜空中的烟火 +短暂却绚烂 +这就是我们的时刻 + +[Verse 2] +时间在指尖流过 +我们抓不住什么 +但至少此刻拥有 +彼此眼中的火焰 + +[Bridge - whispered] +如果明天一切消散 +至少我们曾经闪耀 + +[Final Chorus] +让我们燃烧吧 +像夜空中的烟火 +短暂却绚烂 +THIS IS OUR MOMENT! + +[Outro - fade out] +``` + +Note: In this example, Lyrics tags (piano, powerful, whispered) are consistent with Caption descriptions (piano ballad, building to powerful chorus, intimate), with no conflicts. 
+ +--- diff --git a/generate_ace.py b/generate_ace.py index 3420ab3..4dc4cec 100644 --- a/generate_ace.py +++ b/generate_ace.py @@ -898,6 +898,7 @@ def _run_ace_text2music( lora_weight: float = 0.75, cancel_check: Optional[Callable[[], bool]] = None, vocal_language: str | None = None, + shift: float = 6.0, # Timestep shift; 3.0 for lego (ACE-Step-1.5 #117) # Thinking / LM / CoT (passed to pipeline; used when LM path is integrated) thinking: bool = False, use_cot_metas: bool = True, @@ -1016,7 +1017,7 @@ def _run_ace_text2music( "batch_size": 1, "save_path": str(output_path), "debug": False, - "shift": 6.0, + "shift": float(shift), } if vocal_language is not None and (vocal_language or "").strip(): call_kwargs["vocal_language"] = (vocal_language or "").strip() @@ -1189,6 +1190,7 @@ def generate_track_ace( lora_weight: float = 0.75, cancel_check: Optional[Callable[[], bool]] = None, vocal_language: str = "", + shift: float = 6.0, # Timestep shift; 3.0 recommended for lego (better timing per ACE-Step-1.5 #117) # Thinking / LM / CoT (forwarded to pipeline for when LM path is integrated) thinking: bool = False, use_cot_metas: bool = True, @@ -1358,6 +1360,7 @@ def generate_track_ace( lora_weight=float(lora_weight), cancel_check=cancel_check, vocal_language=(vocal_language or "").strip() or None, + shift=float(shift), thinking=thinking, use_cot_metas=use_cot_metas, use_cot_caption=use_cot_caption, @@ -1368,6 +1371,7 @@ def generate_track_ace( lm_top_p=lm_top_p, lm_negative_prompt=(lm_negative_prompt or "").strip() or "NO USER INPUT", lm_checkpoint_path=lm_checkpoint_path, + shift=shift, ) _report_progress(0.90, "fades") diff --git a/ui/components/CreatePanel.tsx b/ui/components/CreatePanel.tsx index c7ec20d..c73a5ed 100644 --- a/ui/components/CreatePanel.tsx +++ b/ui/components/CreatePanel.tsx @@ -179,7 +179,7 @@ export const CreatePanel: React.FC = ({ onGenerate, isGenerati // Lego tab only const [legoTrackName, setLegoTrackName] = useState('guitar'); const 
[legoCaption, setLegoCaption] = useState(''); - const [legoBackingInfluence, setLegoBackingInfluence] = useState(0.25); + const [legoBackingInfluence, setLegoBackingInfluence] = useState(1.0); const [legoValidationError, setLegoValidationError] = useState(''); // Shared between Simple and Custom: description/style (genre, mood, etc.) and title @@ -846,6 +846,7 @@ export const CreatePanel: React.FC = ({ onGenerate, isGenerati setLegoValidationError('Please select backing audio (required for Lego).'); return; } + // Instruction is auto from track name; caption (style) is optional user description (key, BPM, tone). Backend builds prompt = instruction + ", " + caption when caption present. const instruction = `Generate the ${legoTrackName} track based on the audio context:`; const effGuidance = guidanceScale; const effAudioCover = legoBackingInfluence; @@ -853,9 +854,9 @@ export const CreatePanel: React.FC = ({ onGenerate, isGenerati onGenerate({ customMode: false, songDescription: undefined, - prompt: instruction + (legoCaption.trim() ? ', ' + legoCaption.trim() : ''), + prompt: legoCaption.trim() ? instruction + ', ' + legoCaption.trim() : instruction, lyrics: '', - style: legoCaption.trim() || instruction, + style: legoCaption.trim(), // caption only; do not send instruction as style (backend uses instruction + style for prompt) title: title.trim() || `Lego ${legoTrackName}`, instrumental: true, vocalLanguage: 'en', @@ -1677,24 +1678,25 @@ export const CreatePanel: React.FC = ({ onGenerate, isGenerati
- {/* Describe the track (caption) */} + {/* Caption: style/key/BPM (instruction above is auto from track name) */}
-
- Describe the track +
+ Caption (optional) +