diff --git a/.gitignore b/.gitignore index 8e3515cc..8e011e2a 100644 --- a/.gitignore +++ b/.gitignore @@ -160,6 +160,13 @@ demo/ frontend/node_modules/ run_experiments/ generated/ +!frontend/demo_cache/generated/experiments/** +!frontend/demo_cache/generated/ +!frontend/demo_cache/generated/** +!frontend/demo_cache/ideas/ +!frontend/demo_cache/ideas/** +!frontend/demo_cache/reviews/ +!frontend/demo_cache/reviews/** runs/ testst/ diff --git a/backend/app.py b/backend/app.py index d6c6d373..f78d3921 100644 --- a/backend/app.py +++ b/backend/app.py @@ -1,41 +1,74 @@ import builtins -import io +import logging import os import sys +import time from typing import Any, Dict, Optional, Union -from flask import Flask, Response, jsonify, request, send_file, session -from flask_cors import CORS -from flask_socketio import SocketIO +import eventlet + +eventlet.monkey_patch() + +from flask import Flask, Response, jsonify, request, send_file, session # noqa: E402 +from flask_cors import CORS # noqa: E402 +from flask_socketio import SocketIO # noqa: E402 project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) if project_root not in sys.path: sys.path.insert(0, project_root) + original_print = builtins.print -original_stdout = sys.stdout -# Buffer to store messages until socketio is ready -log_buffer = [] +_LOG_BUFFER: list[dict[str, Any]] = [] +_MAX_LOG_BUFFER_LENGTH = 500 + + +def _push_log(message: str, level: str = "info") -> None: + message = message.strip() + if not message: + return + payload = { + "message": message, + "level": level, + "timestamp": time.time(), + } + _LOG_BUFFER.append(payload) + if len(_LOG_BUFFER) > _MAX_LOG_BUFFER_LENGTH: + del _LOG_BUFFER[0] + try: + if "socketio" in globals(): + socketio.emit("log", payload) + except Exception: + pass + + +class SocketIOLogHandler(logging.Handler): + def emit(self, record: logging.LogRecord) -> None: + try: + message = self.format(record) + except Exception: + message = record.getMessage() + _push_log(message, record.levelname.lower()) -class WebSocketCapture(io.StringIO): - def write(self, text: str) -> int: - # Also write to original stdout - original_stdout.write(text) - # Store for WebSocket emission - if text.strip(): # Only non-empty messages - log_buffer.append(text.strip()) - return len(text) +def flush_log_buffer() -> None: + global _LOG_BUFFER + if not _LOG_BUFFER: + return + try: + for payload in _LOG_BUFFER: + socketio.emit("log", payload) + except Exception: + return + finally: + _LOG_BUFFER = [] def websocket_print(*args: Any, **kwargs: Any) -> None: - # Call original print - original_print(*args, **kwargs) - # Also emit via WebSocket in real-time message = " ".join(str(arg) for arg in args) - if message.strip(): - emit_log_realtime(message.strip()) + _push_log(message) + original_print(*args, **kwargs) # Override print globally before importing tiny_scientist modules @@ -53,41 +86,7 @@ def websocket_print(*args: Any, **kwargs: Any) -> None: pass -# Create a function to emit buffered logs when socketio is ready -def emit_buffered_logs() -> None: - global log_buffer - try: - for message in log_buffer: - socketio.emit( - "log", - { - "message": message, - "level": "info", - "timestamp": __import__("time").time(), - }, - ) - log_buffer = [] # Clear buffer after emitting - except Exception: - pass - - -# Create a function to emit logs in real-time -def emit_log_realtime(message: str, level: str = "info") -> None: - try: - # Check if socketio is available - if "socketio" in globals(): - socketio.emit( - "log", - 
{ - "message": message, - "level": level, - "timestamp": __import__("time").time(), - }, - ) - except Exception: - pass - - +from backend.demo_cache import DemoCacheError, DemoCacheService # noqa: E402 from tiny_scientist.budget_checker import BudgetChecker # noqa: E402 from tiny_scientist.coder import Coder # noqa: E402 from tiny_scientist.reviewer import Reviewer # noqa: E402 @@ -118,6 +117,38 @@ def patch_module_print() -> None: # Call the patching function patch_module_print() + +def _demo_log(message: str, level: str = "info") -> None: + _push_log(message, level) + + +DEMO_CACHE_MODE = os.environ.get("DEMO_CACHE_MODE", "").strip().lower() in { + "1", + "true", + "yes", + "on", +} +DEMO_CACHE_DIR = os.path.abspath( + os.environ.get("DEMO_CACHE_DIR") + or os.path.join(project_root, "frontend", "demo_cache") +) + +try: + demo_cache = DemoCacheService( + DEMO_CACHE_DIR, + enabled=DEMO_CACHE_MODE, + log_fn=_demo_log, + ) + if demo_cache.enabled: + print(f"🗄️ Demo cache enabled using data at {DEMO_CACHE_DIR}") +except DemoCacheError as exc: + print(f"⚠️ Demo cache disabled: {exc}") + demo_cache = DemoCacheService( + DEMO_CACHE_DIR, + enabled=False, + log_fn=_demo_log, + ) + app = Flask(__name__) app.secret_key = "your-secret-key-here" CORS( @@ -129,7 +160,15 @@ def patch_module_print() -> None: "http://localhost:3000", ], ) -socketio = SocketIO(app, cors_allowed_origins="*") +socketio = SocketIO(app, cors_allowed_origins="*", async_mode="eventlet") +root_logger = logging.getLogger() +if not any(isinstance(handler, SocketIOLogHandler) for handler in root_logger.handlers): + socketio_handler = SocketIOLogHandler() + socketio_handler.setLevel(logging.INFO) + socketio_handler.setFormatter(logging.Formatter("%(message)s")) + root_logger.addHandler(socketio_handler) +if root_logger.level > logging.INFO: + root_logger.setLevel(logging.INFO) # Print override is now active print("🚀 Backend server starting with WebSocket logging enabled!") @@ -156,6 +195,12 @@ def configure() -> Union[Response, tuple[Response, int]]: data = request.json if data is None: return jsonify({"error": "No JSON data provided"}), 400 + if demo_cache.enabled: + try: + response_payload = demo_cache.apply_config(session) + except DemoCacheError as exc: + return jsonify({"error": str(exc)}), 409 + return jsonify(response_payload) model = data.get("model") api_key = data.get("api_key") budget = data.get("budget") @@ -245,15 +290,23 @@ def configure() -> Union[Response, tuple[Response, int]]: @app.route("/api/generate-initial", methods=["POST"]) def generate_initial() -> Union[Response, tuple[Response, int]]: """Generate initial ideas from an intent (handleAnalysisIntentSubmit)""" - emit_buffered_logs() # Emit any buffered logs from module initialization + flush_log_buffer() # Emit any buffered logs from module initialization data = request.json if data is None: return jsonify({"error": "No JSON data provided"}), 400 - if thinker is None: - return jsonify({"error": "Thinker not configured"}), 400 intent = data.get("intent") num_ideas = data.get("num_ideas", 3) + if demo_cache.enabled: + try: + response_payload = demo_cache.get_initial_ideas(intent) + except DemoCacheError as exc: + return jsonify({"error": str(exc)}), 409 + return jsonify(response_payload) + + if thinker is None: + return jsonify({"error": "Thinker not configured"}), 400 + # Generate ideas ideas = thinker.run(intent=intent, num_ideas=num_ideas) @@ -279,16 +332,23 @@ def generate_initial() -> Union[Response, tuple[Response, int]]: 
@app.route("/api/set-system-prompt", methods=["POST"]) def set_system_prompt() -> Union[Response, tuple[Response, int]]: """Set the system prompt for the Thinker""" - global thinker - - if not thinker: - return jsonify({"error": "Thinker not configured"}), 400 - data = request.json if data is None: return jsonify({"error": "No JSON data provided"}), 400 system_prompt = data.get("system_prompt") + if demo_cache.enabled: + try: + demo_cache.update_system_prompt(system_prompt) + except DemoCacheError as exc: + return jsonify({"error": str(exc)}), 409 + return jsonify({"status": "success", "message": "System prompt updated"}) + + global thinker + + if not thinker: + return jsonify({"error": "Thinker not configured"}), 400 + # If empty string or None, reset to default if not system_prompt: thinker.set_system_prompt(None) # This will reset to default @@ -301,11 +361,6 @@ def set_system_prompt() -> Union[Response, tuple[Response, int]]: @app.route("/api/set-criteria", methods=["POST"]) def set_criteria() -> Union[Response, tuple[Response, int]]: """Set evaluation criteria for a specific dimension""" - global thinker - - if not thinker: - return jsonify({"error": "Thinker not configured"}), 400 - data = request.json if data is None: return jsonify({"error": "No JSON data provided"}), 400 @@ -315,6 +370,23 @@ def set_criteria() -> Union[Response, tuple[Response, int]]: if dimension not in ["novelty", "feasibility", "impact"]: return jsonify({"error": "Invalid dimension"}), 400 + if demo_cache.enabled: + try: + demo_cache.update_criteria(dimension, criteria) + except DemoCacheError as exc: + return jsonify({"error": str(exc)}), 409 + return jsonify( + { + "status": "success", + "message": f"{dimension.capitalize()} criteria updated", + } + ) + + global thinker + + if not thinker: + return jsonify({"error": "Thinker not configured"}), 400 + # If empty string or None, reset to default if not criteria: thinker.set_criteria(dimension, None) # This will reset to default @@ -329,6 +401,12 @@ def set_criteria() -> Union[Response, tuple[Response, int]]: @app.route("/api/get-prompts", methods=["GET"]) def get_prompts() -> Union[Response, tuple[Response, int]]: """Get current prompts and criteria""" + if demo_cache.enabled: + try: + return jsonify(demo_cache.get_prompts()) + except DemoCacheError as exc: + return jsonify({"error": str(exc)}), 409 + global thinker if thinker is None: @@ -358,6 +436,12 @@ def generate_children() -> Union[Response, tuple[Response, int]]: data = request.json if data is None: return jsonify({"error": "No JSON data provided"}), 400 + if demo_cache.enabled: + try: + response_payload = demo_cache.get_child_ideas() + except DemoCacheError as exc: + return jsonify({"error": str(exc)}), 409 + return jsonify(response_payload) if thinker is None: return jsonify({"error": "Thinker not configured"}), 400 parent_content = data.get("parent_content") @@ -392,6 +476,12 @@ def modify_idea() -> Union[Response, tuple[Response, int]]: data = request.json if data is None: return jsonify({"error": "No JSON data provided"}), 400 + if demo_cache.enabled: + try: + response_payload = demo_cache.get_modified_idea() + except DemoCacheError as exc: + return jsonify({"error": str(exc)}), 409 + return jsonify(response_payload) if thinker is None: return jsonify({"error": "Thinker not configured"}), 400 original_idea = data.get("original_idea") @@ -429,6 +519,12 @@ def merge_ideas() -> Union[Response, tuple[Response, int]]: data = request.json if data is None: return jsonify({"error": "No JSON data 
provided"}), 400 + if demo_cache.enabled: + try: + response_payload = demo_cache.get_merged_idea() + except DemoCacheError as exc: + return jsonify({"error": str(exc)}), 409 + return jsonify(response_payload) if thinker is None: return jsonify({"error": "Thinker not configured"}), 400 idea_a = data.get("idea_a") @@ -461,10 +557,17 @@ def evaluate_ideas() -> Union[Response, tuple[Response, int]]: data = request.json if data is None: return jsonify({"error": "No JSON data provided"}), 400 + ideas = data.get("ideas") or [] + intent = data.get("intent") + + if demo_cache.enabled: + try: + response_payload = demo_cache.evaluate(ideas) + except DemoCacheError as exc: + return jsonify({"error": str(exc)}), 409 + return jsonify(response_payload) if thinker is None: return jsonify({"error": "Thinker not configured"}), 400 - ideas = data.get("ideas") - intent = data.get("intent") # Use original data directly (no conversion needed) thinker_ideas = ideas @@ -523,17 +626,40 @@ def format_idea_content(idea: Union[Dict[str, Any], str]) -> str: @app.route("/api/code", methods=["POST"]) def generate_code() -> Union[Response, tuple[Response, int]]: """Generate code synchronously and return when complete""" - emit_buffered_logs() # Emit any buffered logs + flush_log_buffer() # Emit any buffered logs global coder - if coder is None: - return jsonify({"error": "Coder not configured"}), 400 - data = request.json if data is None: return jsonify({"error": "No JSON data provided"}), 400 idea_data = data.get("idea") baseline_results = data.get("baseline_results", {}) + idea_id = data.get("idea_id") + idea_name = None + if isinstance(idea_data, dict): + if not idea_id: + idea_candidate = idea_data.get("id") + if isinstance(idea_candidate, str): + idea_id = idea_candidate + idea_name = ( + idea_data.get("Name") + or idea_data.get("Title") + or idea_data.get("name") + or idea_data.get("title") + ) + + if demo_cache.enabled: + try: + response_payload = demo_cache.get_code_result( + idea_id=idea_id, + idea_name=idea_name, + ) + except DemoCacheError as exc: + return jsonify({"error": str(exc)}), 409 + return jsonify(response_payload) + + if coder is None: + return jsonify({"error": "Coder not configured"}), 400 print("💻 Starting synchronous code generation...") @@ -622,11 +748,34 @@ def generate_paper() -> Union[Response, tuple[Response, int]]: idea_data = data.get("idea") experiment_dir = data.get("experiment_dir", None) + idea_id = data.get("idea_id") + idea_name = None + if isinstance(idea_data, dict): + if not idea_id: + idea_candidate = idea_data.get("id") + if isinstance(idea_candidate, str): + idea_id = idea_candidate + idea_name = ( + idea_data.get("Name") + or idea_data.get("Title") + or idea_data.get("name") + or idea_data.get("title") + ) - s2_api_key = data.get("s2_api_key", None) + if demo_cache.enabled: + try: + response_payload = demo_cache.get_paper_result( + idea_id=idea_id, + idea_name=idea_name, + experiment_hint=experiment_dir, + ) + except DemoCacheError as exc: + return jsonify({"error": str(exc)}), 409 + return jsonify(response_payload) - if not s2_api_key: - return jsonify({"error": "Semantic Scholar API key is required"}), 400 + s2_api_key = data.get("s2_api_key", None) + if isinstance(s2_api_key, str): + s2_api_key = s2_api_key.strip() or None if not idea_data: print("ERROR: No idea provided in request") @@ -653,6 +802,10 @@ def generate_paper() -> Union[Response, tuple[Response, int]]: ), ) print(f"Writer initialized for this request with model: {writer.model}") + if not s2_api_key: + print( + 
"Proceeding without Semantic Scholar API key; using fallback sources." + ) # Extract the original idea data if isinstance(idea_data, dict) and "originalData" in idea_data: @@ -731,15 +884,14 @@ def generate_paper() -> Union[Response, tuple[Response, int]]: def serve_experiment_file(file_path: str) -> Union[Response, tuple[Response, int]]: """Serve generated experiment files""" try: - # The base directory for all generated content - generated_base = os.path.join(project_root, "generated") - - # Construct the full path securely - full_path = os.path.abspath(os.path.join(generated_base, file_path)) - - # Security check: ensure the file is within the allowed directory - if not full_path.startswith(os.path.abspath(generated_base)): - return jsonify({"error": "Access denied"}), 403 + if demo_cache.enabled: + generated_base = demo_cache.generated_base + full_path = demo_cache.resolve_generated_path(file_path) + else: + generated_base = os.path.join(project_root, "generated") + full_path = os.path.abspath(os.path.join(generated_base, file_path)) + if not full_path.startswith(os.path.abspath(generated_base)): + return jsonify({"error": "Access denied"}), 403 if not os.path.exists(full_path): return jsonify({"error": "File not found"}), 404 @@ -753,6 +905,8 @@ def serve_experiment_file(file_path: str) -> Union[Response, tuple[Response, int # For other files, serve directly return send_file(full_path) + except DemoCacheError as exc: + return jsonify({"error": str(exc)}), 404 except Exception as e: print(f"Error serving file {file_path}: {e}") return jsonify({"error": str(e)}), 500 @@ -770,6 +924,19 @@ def review_paper() -> Union[Response, tuple[Response, int]]: pdf_path = data.get("pdf_path") s2_api_key = data.get("s2_api_key") + idea_id = data.get("idea_id") + idea_name = data.get("idea_name") + + if demo_cache.enabled: + try: + response_payload = demo_cache.get_review_result( + idea_id=idea_id, + idea_name=idea_name, + pdf_path=pdf_path, + ) + except DemoCacheError as exc: + return jsonify({"error": str(exc)}), 409 + return jsonify(response_payload) if not pdf_path: return jsonify({"error": "No PDF path provided"}), 400 @@ -853,11 +1020,12 @@ def review_paper() -> Union[Response, tuple[Response, int]]: if __name__ == "__main__": # Configure Flask for long-running requests app.config["SEND_FILE_MAX_AGE_DEFAULT"] = 0 + port = int(os.environ.get("PORT", "5000")) socketio.run( app, debug=True, use_reloader=False, - port=5000, + port=port, host="0.0.0.0", allow_unsafe_werkzeug=True, ) diff --git a/backend/demo_cache.py b/backend/demo_cache.py new file mode 100644 index 00000000..7ec2da2f --- /dev/null +++ b/backend/demo_cache.py @@ -0,0 +1,467 @@ +from __future__ import annotations + +import copy +import json +import re +from collections import defaultdict +from pathlib import Path +from typing import Any, Callable, Dict, List, Optional + + +class DemoCacheError(RuntimeError): + """Raised when demo cache data is missing or inconsistent.""" + + +class DemoCacheService: + """Utility to replay a pre-recorded Tiny Scientist session for demos.""" + + def __init__( + self, + base_dir: str | Path, + *, + enabled: bool, + log_fn: Optional[Callable[[str, str], None]] = None, + ) -> None: + self.base_dir = Path(base_dir) + self.enabled = bool(enabled) + self._log_fn = log_fn or (lambda message, level="info": None) + + self.generated_root = "generated" + self.generated_base = self.base_dir / self.generated_root + + self._queues: Dict[str, List[Any]] = {} + self._counters: Dict[str, int] = defaultdict(int) + 
self._evaluation_by_name: Dict[str, Dict[str, Any]] = {} + self._evaluation_default: Optional[Dict[str, Any]] = None + self._prompts_state: Dict[str, Any] = { + "system_prompt": "", + "criteria": {}, + "defaults": {}, + } + self._configure_state: Dict[str, Any] = {} + self._logs: Dict[str, List[Any]] = {} + self.intent: Optional[str] = None + self._idea_names_by_id: defaultdict[str, set[str]] = defaultdict(set) + self._idea_id_by_name: Dict[str, str] = {} + self._code_results_by_id: Dict[str, Any] = {} + self._code_results_by_experiment: Dict[str, Any] = {} + self._paper_results_by_id: Dict[str, Any] = {} + self._paper_results_by_path: Dict[str, Any] = {} + self._review_results_by_id: Dict[str, Any] = {} + self._review_results_by_path: Dict[str, Any] = {} + + if self.enabled: + self._load() + + def _load(self) -> None: + session_path = self.base_dir / "session.json" + if not session_path.exists(): + raise DemoCacheError( + f"Demo cache session file not found at {session_path!s}" + ) + + with session_path.open("r", encoding="utf-8") as fh: + payload = json.load(fh) + + self.intent = payload.get("intent") + self.generated_root = payload.get("generated_root") or "generated" + self.generated_base = self.base_dir / self.generated_root + + self._configure_state = payload.get("configure") or {} + + prompts = payload.get("prompts") or {} + defaults = prompts.get("defaults") or {} + if not defaults and prompts.get("criteria"): + defaults = copy.deepcopy(prompts["criteria"]) + + self._prompts_state = { + "system_prompt": prompts.get("system_prompt", ""), + "criteria": copy.deepcopy(prompts.get("criteria") or {}), + "defaults": copy.deepcopy(defaults), + } + + evaluation_payload = payload.get("evaluation") or {} + self._evaluation_by_name = copy.deepcopy( + evaluation_payload.get("by_name") or {} + ) + default_entry = evaluation_payload.get("default") + self._evaluation_default = ( + copy.deepcopy(default_entry) if default_entry else None + ) + + self._logs = {} + raw_logs = payload.get("logs") or {} + for key, messages in raw_logs.items(): + if isinstance(messages, list): + self._logs[key] = list(messages) + elif isinstance(messages, dict): + self._logs[key] = [messages] + else: + self._logs[key] = [messages] + + queue_keys = [ + "generate_initial", + "generate_children", + "modify", + "merge", + "code", + "write", + "review", + ] + for key in queue_keys: + raw_queue = payload.get(key) + if raw_queue is None: + self._queues[key] = [] + elif isinstance(raw_queue, dict): + self._queues[key] = [copy.deepcopy(raw_queue)] + else: + self._queues[key] = [copy.deepcopy(item) for item in raw_queue] + + self._build_name_index() + self._rebuild_result_indices() + + def _emit_logs(self, channel: str) -> None: + if not self.enabled: + return + + messages = self._logs.get(channel) or [] + for entry in messages: + if isinstance(entry, dict): + message = entry.get("message") + level = entry.get("level", "info") + else: + message = str(entry) + level = "info" + + if message: + self._log_fn(message, level) + + def _normalize_name(self, name: Optional[str]) -> Optional[str]: + if not name or not isinstance(name, str): + return None + normalized = " ".join(name.strip().split()).lower() + return normalized or None + + def _extract_idea_id(self, value: Optional[str]) -> Optional[str]: + if not value or not isinstance(value, str): + return None + sanitized = value.replace("\\", "/") + match = re.search(r"(idea[-_][0-9a-z]+)", sanitized, re.IGNORECASE) + if match: + return match.group(1).lower() + return None + + def 
_build_name_index(self) -> None: + self._idea_names_by_id = defaultdict(set) + self._idea_id_by_name = {} + + initial_entries = self._queues.get("generate_initial") or [] + for entry in initial_entries: + ideas = entry.get("ideas") if isinstance(entry, dict) else None + if not isinstance(ideas, list): + continue + for idea in ideas: + if not isinstance(idea, dict): + continue + idea_id = idea.get("id") + normalized_id = ( + idea_id.strip().lower() if isinstance(idea_id, str) else None + ) + if not normalized_id: + continue + + candidates = [ + idea.get("title"), + idea.get("name"), + idea.get("Title"), + idea.get("Name"), + ] + original = idea.get("originalData") + if isinstance(original, dict): + candidates.extend( + [original.get("Title"), original.get("Name")] + ) + + for candidate in candidates: + normalized = self._normalize_name(candidate) + if normalized: + self._idea_names_by_id[normalized_id].add(normalized) + + for idea_id, names in self._idea_names_by_id.items(): + for normalized in names: + self._idea_id_by_name.setdefault(normalized, idea_id) + + def _rebuild_result_indices(self) -> None: + self._code_results_by_id = {} + self._code_results_by_experiment = {} + for entry in self._queues.get("code") or []: + if not isinstance(entry, dict): + continue + experiment_dir = entry.get("experiment_dir") + idea_id = self._extract_idea_id(experiment_dir) + if idea_id: + self._code_results_by_id.setdefault(idea_id, copy.deepcopy(entry)) + if isinstance(experiment_dir, str): + self._code_results_by_experiment.setdefault( + experiment_dir, copy.deepcopy(entry) + ) + + self._paper_results_by_id = {} + self._paper_results_by_path = {} + for entry in self._queues.get("write") or []: + if not isinstance(entry, dict): + continue + idea_id = ( + self._extract_idea_id(entry.get("pdf_path")) + or self._extract_idea_id(entry.get("local_pdf_path")) + or self._extract_idea_id(entry.get("paper_name")) + ) + if idea_id: + self._paper_results_by_id.setdefault(idea_id, copy.deepcopy(entry)) + pdf_path = entry.get("pdf_path") + if isinstance(pdf_path, str): + self._paper_results_by_path.setdefault(pdf_path, copy.deepcopy(entry)) + + self._review_results_by_id = {} + self._review_results_by_path = {} + for entry in self._queues.get("review") or []: + if not isinstance(entry, dict): + continue + pdf_path = entry.get("pdf_path") + if isinstance(pdf_path, str): + self._review_results_by_path.setdefault(pdf_path, copy.deepcopy(entry)) + idea_id = self._extract_idea_id(pdf_path) + if idea_id: + self._review_results_by_id.setdefault(idea_id, copy.deepcopy(entry)) + + def _resolve_candidate_ids( + self, + idea_id: Optional[str] = None, + idea_name: Optional[str] = None, + experiment_hint: Optional[str] = None, + ) -> List[str]: + candidates: List[str] = [] + + for raw in ( + idea_id, + self._extract_idea_id(experiment_hint), + ): + if isinstance(raw, str): + normalized = raw.strip().lower() + if normalized and normalized not in candidates: + candidates.append(normalized) + + if idea_name: + normalized = self._normalize_name(idea_name) + if normalized: + mapped = self._idea_id_by_name.get(normalized) + if mapped and mapped not in candidates: + candidates.append(mapped) + + return candidates + + def _next(self, key: str) -> Any: + queue = self._queues.get(key) or [] + if not queue: + raise DemoCacheError(f"No cached payloads configured for '{key}'") + + index = self._counters[key] + if index < len(queue): + payload = queue[index] + self._counters[key] = index + 1 + else: + payload = queue[-1] + + 
self._emit_logs(key) + return copy.deepcopy(payload) + + def apply_config(self, flask_session: Any) -> Dict[str, Any]: + if not self.enabled: + raise DemoCacheError("Demo cache mode is disabled") + + session_values = copy.deepcopy(self._configure_state.get("session") or {}) + if session_values: + for key, value in session_values.items(): + flask_session[key] = value + else: + flask_session["configured"] = True + + response_payload = copy.deepcopy( + self._configure_state.get("response") + or { + "status": "configured", + "model": session_values.get("model", "demo-model"), + "budget": session_values.get("budget"), + "budget_preference": session_values.get("budget_preference"), + } + ) + + self._emit_logs("configure") + return response_payload + + def get_prompts(self) -> Dict[str, Any]: + if not self.enabled: + raise DemoCacheError("Demo cache mode is disabled") + + return { + "system_prompt": self._prompts_state.get("system_prompt", ""), + "criteria": copy.deepcopy(self._prompts_state.get("criteria") or {}), + "defaults": copy.deepcopy(self._prompts_state.get("defaults") or {}), + } + + def update_system_prompt(self, system_prompt: Optional[str]) -> None: + if not self.enabled: + raise DemoCacheError("Demo cache mode is disabled") + + default_prompt = self._prompts_state.get("defaults", {}).get( + "system_prompt", "" + ) + self._prompts_state["system_prompt"] = system_prompt or default_prompt + self._emit_logs("set_system_prompt") + + def update_criteria(self, dimension: str, criteria: Optional[str]) -> None: + if not self.enabled: + raise DemoCacheError("Demo cache mode is disabled") + + defaults = self._prompts_state.get("defaults") or {} + self._prompts_state.setdefault("criteria", {}) + if criteria: + self._prompts_state["criteria"][dimension] = criteria + else: + fallback = defaults.get(dimension, "") + if fallback: + self._prompts_state["criteria"][dimension] = fallback + else: + self._prompts_state["criteria"].pop(dimension, None) + self._emit_logs("set_criteria") + + def get_initial_ideas(self, intent: Optional[str] = None) -> Dict[str, Any]: + _ = intent # Intent is kept for potential validation/debugging. 
+ return self._next("generate_initial") + + def get_child_ideas(self) -> Dict[str, Any]: + return self._next("generate_children") + + def get_modified_idea(self) -> Dict[str, Any]: + return self._next("modify") + + def get_merged_idea(self) -> Dict[str, Any]: + return self._next("merge") + + def get_code_result( + self, + idea_id: Optional[str] = None, + idea_name: Optional[str] = None, + experiment_hint: Optional[str] = None, + ) -> Dict[str, Any]: + if not self.enabled: + raise DemoCacheError("Demo cache mode is disabled") + + for candidate in self._resolve_candidate_ids( + idea_id=idea_id, idea_name=idea_name, experiment_hint=experiment_hint + ): + payload = self._code_results_by_id.get(candidate) + if payload is not None: + self._emit_logs("code") + return copy.deepcopy(payload) + + return self._next("code") + + def get_paper_result( + self, + idea_id: Optional[str] = None, + idea_name: Optional[str] = None, + experiment_hint: Optional[str] = None, + pdf_path: Optional[str] = None, + ) -> Dict[str, Any]: + if not self.enabled: + raise DemoCacheError("Demo cache mode is disabled") + + if pdf_path and isinstance(pdf_path, str): + cached = self._paper_results_by_path.get(pdf_path) + if cached is not None: + self._emit_logs("write") + return copy.deepcopy(cached) + + for candidate in self._resolve_candidate_ids( + idea_id=idea_id, idea_name=idea_name, experiment_hint=experiment_hint + ): + payload = self._paper_results_by_id.get(candidate) + if payload is not None: + self._emit_logs("write") + return copy.deepcopy(payload) + + return self._next("write") + + def get_review_result( + self, + idea_id: Optional[str] = None, + idea_name: Optional[str] = None, + pdf_path: Optional[str] = None, + ) -> Dict[str, Any]: + if not self.enabled: + raise DemoCacheError("Demo cache mode is disabled") + + if pdf_path and isinstance(pdf_path, str): + cached = self._review_results_by_path.get(pdf_path) + if cached is not None: + self._emit_logs("review") + return copy.deepcopy(cached) + + for candidate in self._resolve_candidate_ids( + idea_id=idea_id, idea_name=idea_name, experiment_hint=pdf_path + ): + payload = self._review_results_by_id.get(candidate) + if payload is not None: + self._emit_logs("review") + return copy.deepcopy(payload) + + return self._next("review") + + def evaluate(self, ideas: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + if not self.enabled: + raise DemoCacheError("Demo cache mode is disabled") + + results: List[Dict[str, Any]] = [] + for idx, idea in enumerate(ideas): + idea_id = idea.get("id") or f"idea_{idx}" + name_candidates = [ + idea.get("Name"), + idea.get("Title"), + idea.get("title"), + idea.get("name"), + ] + + score_entry: Optional[Dict[str, Any]] = None + for name in name_candidates: + if isinstance(name, str) and name in self._evaluation_by_name: + score_entry = self._evaluation_by_name[name] + break + + if score_entry is None: + score_entry = self._evaluation_default + + if score_entry is None: + raise DemoCacheError( + f"No cached evaluation scores for idea '{name_candidates[0] or idea_id}'" + ) + + payload = copy.deepcopy(score_entry) + payload["id"] = idea_id + results.append(payload) + + self._emit_logs("evaluate") + return results + + def resolve_generated_path(self, relative_path: str) -> Path: + if not self.enabled: + raise DemoCacheError("Demo cache mode is disabled") + + sanitized = relative_path.lstrip("/\\") + candidate = (self.generated_base / sanitized).resolve() + base = self.generated_base.resolve() + if not str(candidate).startswith(str(base)): + 
raise DemoCacheError( + f"Attempt to access file outside demo cache: {relative_path}" + ) + return candidate diff --git a/backend/run_app.sh b/backend/run_app.sh new file mode 100755 index 00000000..08b2e91b --- /dev/null +++ b/backend/run_app.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Resolve repository root relative to this script. +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" + +cd "$ROOT_DIR" + +# Ensure demo flags are not leaking into the normal instance. +unset DEMO_CACHE_MODE || true +unset DEMO_CACHE_DIR || true + +# Default port for live instance +export PORT="${PORT:-5000}" + +exec poetry run python backend/app.py "$@" diff --git a/backend/run_demo.sh b/backend/run_demo.sh new file mode 100755 index 00000000..1e993067 --- /dev/null +++ b/backend/run_demo.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Resolve repository root relative to this script. +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +CACHE_DIR="${DEMO_CACHE_DIR:-$ROOT_DIR/frontend/demo_cache}" + +if [[ ! -f "$CACHE_DIR/session.json" ]]; then + echo "[demo] Missing cache snapshot at $CACHE_DIR/session.json" >&2 + echo "[demo] Run: poetry run python scripts/generate_demo_cache.py ..." >&2 + exit 1 +fi + +cd "$ROOT_DIR" + +export DEMO_CACHE_MODE=1 +export DEMO_CACHE_DIR="$CACHE_DIR" +export PORT="${PORT:-5001}" + +exec poetry run python backend/app.py "$@" diff --git a/backend/tests/test_coder_demo.py b/backend/tests/test_coder_demo.py new file mode 100644 index 00000000..c3d5be5d --- /dev/null +++ b/backend/tests/test_coder_demo.py @@ -0,0 +1,76 @@ +import json +import os +from pathlib import Path + +import pytest + +from backend.app import app + + +@pytest.fixture(scope="module") +def client(): + app.config["TESTING"] = True + with app.test_client() as client: + yield client + + +def _load_demo_idea() -> dict: + idea_path = Path(__file__).resolve().parents[2] / "demo_test" / "idea.json" + if not idea_path.exists(): + raise FileNotFoundError(f"Idea file not found: {idea_path}") + return json.loads(idea_path.read_text()) + + +def _configure_backend(client) -> None: + model = "gpt-4o" + api_key = os.environ.get("OPENAI_API_KEY") + if not api_key: + pytest.skip("OPENAI_API_KEY not set; skipping coder integration test") + + response = client.post( + "/api/configure", + json={ + "model": model, + "api_key": api_key, + "budget": 10.0, + "budget_preference": "balanced", + }, + ) + assert response.status_code == 200, response.get_data(as_text=True) + + +def test_coder_with_demo_idea(client): + """ + Integration test: run backend /api/code with the demo idea. + Ensures coder executes and produces experiment outputs. 
+ """ + _configure_backend(client) + + idea_payload = _load_demo_idea() + response = client.post( + "/api/code", + json={"idea": {"originalData": idea_payload}}, + ) + + assert response.status_code == 200, response.get_data(as_text=True) + + data = response.get_json() + assert data is not None, "No JSON body returned" + assert data.get("success") is True, data + + experiment_dir = data.get("experiment_dir") + assert experiment_dir, data + + generated_base = Path(__file__).resolve().parents[2] / "generated" + abs_experiment_dir = generated_base / experiment_dir + assert abs_experiment_dir.exists(), f"Experiment dir missing: {abs_experiment_dir}" + + expected_files = { + "experiment.py", + "notes.txt", + "experiment_results.txt", + } + missing = [ + name for name in expected_files if not (abs_experiment_dir / name).exists() + ] + assert not missing, f"Missing files in {abs_experiment_dir}: {missing}" diff --git a/frontend/demo_cache/generated/experiments/idea-1/experiment.py b/frontend/demo_cache/generated/experiments/idea-1/experiment.py new file mode 100644 index 00000000..68eccf72 --- /dev/null +++ b/frontend/demo_cache/generated/experiments/idea-1/experiment.py @@ -0,0 +1,368 @@ +#!/usr/bin/env python3 +# experiment.py — HumanEval prompt→solution char-GRU training (keeps "training" intact) +# Outputs: final_info.json (metrics) + generations.jsonl +# +# Usage: +# python experiment.py --out_dir runs/idea1 --epochs 3 --batch_size 8 +# +# Notes: +# - We use a deterministic pseudo-split (70/15/15) from HumanEval test-only items to preserve train/val/test structure. +# - For true pass@k, plug your unit-test runner where indicated. + +import argparse +import json +import os +import random +import ast +import difflib +from typing import List, Dict, Tuple + +try: + from datasets import load_dataset # pip install datasets +except Exception: + load_dataset = None + +import torch +import torch.nn as nn +from torch.utils.data import Dataset, DataLoader + +SEED = 42 +random.seed(SEED) +torch.manual_seed(SEED) + + +def load_humaneval(max_items: int = 164) -> List[Dict[str, str]]: + if load_dataset is None: + raise RuntimeError( + "Please install `datasets` (pip install datasets) to load HumanEval." 
+        )
+    dataset = load_dataset("openai_humaneval")
+    split_name = "test" if "test" in dataset else next(iter(dataset.keys()))
+    records = dataset[split_name]
+    tasks = []
+    for i, ex in enumerate(records):
+        if i >= max_items:
+            break
+        prompt = ex.get("prompt", "")
+        reference = ex.get("canonical_solution", "")
+        if prompt and reference:
+            tasks.append(
+                {
+                    "task_id": ex.get("task_id", f"HE-{i}"),
+                    "prompt": prompt,
+                    "reference": reference,
+                }
+            )
+    return tasks
+
+
+def pseudo_split(items: List[Dict[str, str]]) -> Tuple[List, List, List]:
+    idx = list(range(len(items)))
+    random.Random(SEED).shuffle(idx)
+    n = len(items)
+    n_train = int(0.7 * n)
+    n_val = int(0.15 * n)
+    train = [items[i] for i in idx[:n_train]]
+    val = [items[i] for i in idx[n_train : n_train + n_val]]
+    test = [items[i] for i in idx[n_train + n_val :]]
+    return train, val, test
+
+
+class CharVocab:
+    def __init__(self, texts: List[str]):
+        specials = ["<pad>", "<bos>", "<eos>", "<sep>"]
+        charset = set()
+        for t in texts:
+            charset.update(t)
+        self.itos = specials + sorted(ch for ch in charset if ch not in specials)
+        self.stoi = {ch: i for i, ch in enumerate(self.itos)}
+        self.pad_id = self.stoi["<pad>"]
+        self.bos_id = self.stoi["<bos>"]
+        self.eos_id = self.stoi["<eos>"]
+        self.sep_id = self.stoi["<sep>"]
+
+    def encode(self, s: str) -> List[int]:
+        return [self.stoi.get(ch, self.sep_id) for ch in s]
+
+    def decode(self, ids: List[int]) -> str:
+        return "".join(self.itos[i] for i in ids if 0 <= i < len(self.itos))
+
+
+class HEDataset(Dataset):
+    def __init__(
+        self, pairs: List[Dict[str, str]], vocab: CharVocab, max_len: int = 1024
+    ):
+        self.vocab = vocab
+        self.max_len = max_len
+        self.data = []
+        for ex in pairs:
+            ctx_ids = [vocab.bos_id] + vocab.encode(ex["prompt"] + "\n# solution:\n")
+            tgt_ids = vocab.encode(ex["reference"].rstrip() + "\n") + [vocab.eos_id]
+            x = (ctx_ids + tgt_ids)[:max_len]
+            ctx_len = min(len(ctx_ids), len(x))
+            y = x[1:] + [vocab.eos_id]
+            mask = [0] * ctx_len + [1] * (len(x) - ctx_len)
+            self.data.append((x, y, mask, ex["reference"], ex["task_id"]))
+
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, idx):
+        return self.data[idx]
+
+
+def collate(batch, pad_id: int):
+    L = max(len(x) for x, _, _, _, _ in batch)
+    X = []
+    Y = []
+    M = []
+    refs = []
+    tids = []
+    for x, y, m, ref, tid in batch:
+        pad = L - len(x)
+        X.append(x + [pad_id] * pad)
+        Y.append(y + [pad_id] * pad)
+        M.append(m + [0] * pad)
+        refs.append(ref)
+        tids.append(tid)
+    return (
+        torch.tensor(X, dtype=torch.long),
+        torch.tensor(Y, dtype=torch.long),
+        torch.tensor(M, dtype=torch.float32),
+        refs,
+        tids,
+    )
+
+
+class CharGRU(nn.Module):
+    def __init__(self, vocab_size: int, emb: int = 128, hidden: int = 256):
+        super().__init__()
+        self.emb = nn.Embedding(vocab_size, emb, padding_idx=0)
+        self.rnn = nn.GRU(emb, hidden, num_layers=1, batch_first=True)
+        self.head = nn.Linear(hidden, vocab_size)
+
+    def forward(self, x):
+        e = self.emb(x)
+        h, _ = self.rnn(e)
+        return self.head(h)
+
+
+def masked_ce(logits, targets, mask, pad_id: int):
+    B, L, V = logits.shape
+    loss = nn.functional.cross_entropy(
+        logits.reshape(B * L, V),
+        targets.reshape(B * L),
+        ignore_index=pad_id,
+        reduction="none",
+    ).reshape(B, L)
+    loss = (loss * mask).sum() / (mask.sum() + 1e-8)
+    return loss
+
+
+@torch.no_grad()
+def greedy_decode(
+    model, vocab: CharVocab, prompt: str, max_new: int = 512, device: str = "cpu"
+):
+    model.eval()
+    ctx = [vocab.bos_id] + vocab.encode(prompt + "\n# solution:\n")
+    x = torch.tensor([ctx], dtype=torch.long,
device=device) + out = ctx.copy() + for _ in range(max_new): + logits = model(x)[:, -1, :] + nid = int(torch.argmax(logits, dim=-1).item()) + out.append(nid) + x = torch.tensor([out], dtype=torch.long, device=device) + if nid == vocab.eos_id: + break + gen = out[len(ctx) :] + return vocab.decode(gen) + + +def ast_ok(code: str) -> bool: + try: + ast.parse(code) + return True + except SyntaxError: + return False + + +def undef_refs(code: str) -> int: + try: + tree = ast.parse(code) + except SyntaxError: + return 999 + defined, used = set(), set() + + class V(ast.NodeVisitor): + def visit_FunctionDef(self, node): + defined.add(node.name) + self.generic_visit(node) + + def visit_ClassDef(self, node): + defined.add(node.name) + self.generic_visit(node) + + def visit_Assign(self, node): + for t in node.targets: + if hasattr(t, "id"): + defined.add(t.id) + self.generic_visit(node) + + def visit_Name(self, node): + if isinstance(node.ctx, ast.Load): + used.add(node.id) + + V().visit(tree) + ignore = set(dir(__builtins__)) | {"True", "False", "None", "self"} + unresolved = [n for n in used if n not in defined and n not in ignore] + return len(unresolved) + + +def text_sim(a: str, b: str) -> float: + return difflib.SequenceMatcher(None, a, b).ratio() + + +def pass1_proxy(gen: str, ref: str) -> int: + return int(gen.strip() == ref.strip()) + + +def main( + out_dir: str, + epochs: int, + batch_size: int, + lr: float, + emb: int, + hidden: int, + max_len: int, + decode_max: int, + max_items: int, +): + os.makedirs(out_dir, exist_ok=True) + items = load_humaneval(max_items=max_items) + train, val, test = pseudo_split(items) + + vocab = CharVocab( + [ex["prompt"] for ex in train] + [ex["reference"] for ex in train] + ) + train_ds = HEDataset(train, vocab, max_len=max_len) + val_ds = HEDataset(val, vocab, max_len=max_len) + test_ds = HEDataset(test, vocab, max_len=max_len) + + coll = lambda b: collate(b, vocab.pad_id) + train_loader = DataLoader( + train_ds, batch_size=batch_size, shuffle=True, collate_fn=coll + ) + val_loader = DataLoader( + val_ds, batch_size=batch_size, shuffle=False, collate_fn=coll + ) + test_loader = DataLoader( + test_ds, batch_size=batch_size, shuffle=False, collate_fn=coll + ) + + device = "cuda" if torch.cuda.is_available() else "cpu" + model = CharGRU(vocab_size=len(vocab.itos), emb=emb, hidden=hidden).to(device) + opt = torch.optim.Adam(model.parameters(), lr=lr) + + best = 1e9 + for ep in range(1, epochs + 1): + model.train() + tot = 0.0 + steps = 0 + for X, Y, M, _, _ in train_loader: + X, Y, M = X.to(device), Y.to(device), M.to(device) + opt.zero_grad() + logits = model(X) + loss = masked_ce(logits, Y, M, vocab.pad_id) + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) + opt.step() + tot += loss.item() + steps += 1 + tr_loss = tot / max(1, steps) + + model.eval() + vtot = 0.0 + vsteps = 0 + with torch.no_grad(): + for X, Y, M, _, _ in val_loader: + X, Y, M = X.to(device), Y.to(device), M.to(device) + vtot += masked_ce(model(X), Y, M, vocab.pad_id).item() + vsteps += 1 + val_loss = vtot / max(1, vsteps) + print(f"[Epoch {ep}] train_loss={tr_loss:.4f} val_loss={val_loss:.4f}") + if val_loss < best: + best = val_loss + torch.save(model.state_dict(), os.path.join(out_dir, "best.pt")) + + # Reload best and evaluate on test with greedy decode + if os.path.exists(os.path.join(out_dir, "best.pt")): + model.load_state_dict( + torch.load(os.path.join(out_dir, "best.pt"), map_location=device) + ) + + gens = [] + ast_cnt = 0 + undef_sum = 0 + sim_sum = 0.0 + 
pass_cnt = 0 + for X, Y, M, refs, tids in test_loader: + for ref, tid in zip(refs, tids): + gen = greedy_decode( + model, + vocab, + ( + items[0]["prompt"] + if False + else next(ex["prompt"] for ex in test if ex["task_id"] == tid) + ), + max_new=decode_max, + device=device, + ) + gens.append({"task_id": tid, "generated": gen, "reference": ref}) + ast_cnt += int(ast_ok(gen)) + undef_sum += undef_refs(gen) + sim_sum += text_sim(gen, ref) + pass_cnt += pass1_proxy(gen, ref) + + n = len(test) + results = { + "Dataset": f"HumanEval pseudo-split (n_train={len(train)}, n_val={len(val)}, n_test={len(test)})", + "Test": { + "AST_Parse_Rate": ast_cnt / max(1, n), + "UndefinedRef_Avg": undef_sum / max(1, n), + "TextSim_Avg": sim_sum / max(1, n), + "pass@1_proxy": pass_cnt / max(1, n), + }, + "Notes": "Plug a unit-test runner for true pass@k.", + } + with open(os.path.join(out_dir, "final_info.json"), "w") as f: + json.dump(results, f, indent=2) + with open(os.path.join(out_dir, "generations.jsonl"), "w") as f: + for g in gens: + f.write(json.dumps(g) + "\n") + print(json.dumps(results, indent=2)) + + +if __name__ == "__main__": + p = argparse.ArgumentParser() + p.add_argument("--out_dir", type=str, required=True) + p.add_argument("--epochs", type=int, default=3) + p.add_argument("--batch_size", type=int, default=8) + p.add_argument("--lr", type=float, default=1e-3) + p.add_argument("--emb", type=int, default=128) + p.add_argument("--hidden", type=int, default=256) + p.add_argument("--max_len", type=int, default=1024) + p.add_argument("--decode_max", type=int, default=512) + p.add_argument("--max_items", type=int, default=164) + args = p.parse_args() + main( + args.out_dir, + args.epochs, + args.batch_size, + args.lr, + args.emb, + args.hidden, + args.max_len, + args.decode_max, + args.max_items, + ) diff --git a/frontend/demo_cache/generated/experiments/idea-1/experiment_results.txt b/frontend/demo_cache/generated/experiments/idea-1/experiment_results.txt new file mode 100644 index 00000000..e379834d --- /dev/null +++ b/frontend/demo_cache/generated/experiments/idea-1/experiment_results.txt @@ -0,0 +1,32 @@ +{ + "run_1": { + "Dataset": "HumanEval pseudo-split (n_train=114, n_val=24, n_test=26)", + "Test": { + "AST_Parse_Rate": 1.0, + "UndefinedRef_Avg": 0.0, + "TextSim_Avg": 0.04661257702428245, + "pass@1_proxy": 0.0 + }, + "Notes": "Plug a unit-test runner for true pass@k." + }, + "run_2": { + "Dataset": "HumanEval pseudo-split (n_train=114, n_val=24, n_test=26)", + "Test": { + "AST_Parse_Rate": 1.0, + "UndefinedRef_Avg": 0.0, + "TextSim_Avg": 0.04661257702428245, + "pass@1_proxy": 0.0 + }, + "Notes": "Plug a unit-test runner for true pass@k." + }, + "run_3": { + "Dataset": "HumanEval pseudo-split (n_train=114, n_val=24, n_test=26)", + "Test": { + "AST_Parse_Rate": 1.0, + "UndefinedRef_Avg": 0.0, + "TextSim_Avg": 0.04661257702428245, + "pass@1_proxy": 0.0 + }, + "Notes": "Plug a unit-test runner for true pass@k." + } +} \ No newline at end of file diff --git a/frontend/demo_cache/generated/experiments/idea-1/notes.txt b/frontend/demo_cache/generated/experiments/idea-1/notes.txt new file mode 100644 index 00000000..4dddd6a1 --- /dev/null +++ b/frontend/demo_cache/generated/experiments/idea-1/notes.txt @@ -0,0 +1,17 @@ +Auto-generated experiment notes +Updated: 2025-11-05T19:16:01.642452Z + +run_1: + - Dataset: HumanEval pseudo-split (n_train=114, n_val=24, n_test=26) + - Notes: Plug a unit-test runner for true pass@k. 
+ - Test: {'AST_Parse_Rate': 1.0, 'UndefinedRef_Avg': 0.0, 'TextSim_Avg': 0.04661257702428245, 'pass@1_proxy': 0.0} + +run_2: + - Dataset: HumanEval pseudo-split (n_train=114, n_val=24, n_test=26) + - Notes: Plug a unit-test runner for true pass@k. + - Test: {'AST_Parse_Rate': 1.0, 'UndefinedRef_Avg': 0.0, 'TextSim_Avg': 0.04661257702428245, 'pass@1_proxy': 0.0} + +run_3: + - Dataset: HumanEval pseudo-split (n_train=114, n_val=24, n_test=26) + - Notes: Plug a unit-test runner for true pass@k. + - Test: {'AST_Parse_Rate': 1.0, 'UndefinedRef_Avg': 0.0, 'TextSim_Avg': 0.04661257702428245, 'pass@1_proxy': 0.0} diff --git a/frontend/demo_cache/generated/experiments/idea-1/run_1.py b/frontend/demo_cache/generated/experiments/idea-1/run_1.py new file mode 100644 index 00000000..68eccf72 --- /dev/null +++ b/frontend/demo_cache/generated/experiments/idea-1/run_1.py @@ -0,0 +1,368 @@ +#!/usr/bin/env python3 +# experiment.py — HumanEval prompt→solution char-GRU training (keeps "training" intact) +# Outputs: final_info.json (metrics) + generations.jsonl +# +# Usage: +# python experiment.py --out_dir runs/idea1 --epochs 3 --batch_size 8 +# +# Notes: +# - We use a deterministic pseudo-split (70/15/15) from HumanEval test-only items to preserve train/val/test structure. +# - For true pass@k, plug your unit-test runner where indicated. + +import argparse +import json +import os +import random +import ast +import difflib +from typing import List, Dict, Tuple + +try: + from datasets import load_dataset # pip install datasets +except Exception: + load_dataset = None + +import torch +import torch.nn as nn +from torch.utils.data import Dataset, DataLoader + +SEED = 42 +random.seed(SEED) +torch.manual_seed(SEED) + + +def load_humaneval(max_items: int = 164) -> List[Dict[str, str]]: + if load_dataset is None: + raise RuntimeError( + "Please install `datasets` (pip install datasets) to load HumanEval." 
+        )
+    dataset = load_dataset("openai_humaneval")
+    split_name = "test" if "test" in dataset else next(iter(dataset.keys()))
+    records = dataset[split_name]
+    tasks = []
+    for i, ex in enumerate(records):
+        if i >= max_items:
+            break
+        prompt = ex.get("prompt", "")
+        reference = ex.get("canonical_solution", "")
+        if prompt and reference:
+            tasks.append(
+                {
+                    "task_id": ex.get("task_id", f"HE-{i}"),
+                    "prompt": prompt,
+                    "reference": reference,
+                }
+            )
+    return tasks
+
+
+def pseudo_split(items: List[Dict[str, str]]) -> Tuple[List, List, List]:
+    idx = list(range(len(items)))
+    random.Random(SEED).shuffle(idx)
+    n = len(items)
+    n_train = int(0.7 * n)
+    n_val = int(0.15 * n)
+    train = [items[i] for i in idx[:n_train]]
+    val = [items[i] for i in idx[n_train : n_train + n_val]]
+    test = [items[i] for i in idx[n_train + n_val :]]
+    return train, val, test
+
+
+class CharVocab:
+    def __init__(self, texts: List[str]):
+        specials = ["<pad>", "<bos>", "<eos>", "<sep>"]
+        charset = set()
+        for t in texts:
+            charset.update(t)
+        self.itos = specials + sorted(ch for ch in charset if ch not in specials)
+        self.stoi = {ch: i for i, ch in enumerate(self.itos)}
+        self.pad_id = self.stoi["<pad>"]
+        self.bos_id = self.stoi["<bos>"]
+        self.eos_id = self.stoi["<eos>"]
+        self.sep_id = self.stoi["<sep>"]
+
+    def encode(self, s: str) -> List[int]:
+        return [self.stoi.get(ch, self.sep_id) for ch in s]
+
+    def decode(self, ids: List[int]) -> str:
+        return "".join(self.itos[i] for i in ids if 0 <= i < len(self.itos))
+
+
+class HEDataset(Dataset):
+    def __init__(
+        self, pairs: List[Dict[str, str]], vocab: CharVocab, max_len: int = 1024
+    ):
+        self.vocab = vocab
+        self.max_len = max_len
+        self.data = []
+        for ex in pairs:
+            ctx_ids = [vocab.bos_id] + vocab.encode(ex["prompt"] + "\n# solution:\n")
+            tgt_ids = vocab.encode(ex["reference"].rstrip() + "\n") + [vocab.eos_id]
+            x = (ctx_ids + tgt_ids)[:max_len]
+            ctx_len = min(len(ctx_ids), len(x))
+            y = x[1:] + [vocab.eos_id]
+            mask = [0] * ctx_len + [1] * (len(x) - ctx_len)
+            self.data.append((x, y, mask, ex["reference"], ex["task_id"]))
+
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, idx):
+        return self.data[idx]
+
+
+def collate(batch, pad_id: int):
+    L = max(len(x) for x, _, _, _, _ in batch)
+    X = []
+    Y = []
+    M = []
+    refs = []
+    tids = []
+    for x, y, m, ref, tid in batch:
+        pad = L - len(x)
+        X.append(x + [pad_id] * pad)
+        Y.append(y + [pad_id] * pad)
+        M.append(m + [0] * pad)
+        refs.append(ref)
+        tids.append(tid)
+    return (
+        torch.tensor(X, dtype=torch.long),
+        torch.tensor(Y, dtype=torch.long),
+        torch.tensor(M, dtype=torch.float32),
+        refs,
+        tids,
+    )
+
+
+class CharGRU(nn.Module):
+    def __init__(self, vocab_size: int, emb: int = 128, hidden: int = 256):
+        super().__init__()
+        self.emb = nn.Embedding(vocab_size, emb, padding_idx=0)
+        self.rnn = nn.GRU(emb, hidden, num_layers=1, batch_first=True)
+        self.head = nn.Linear(hidden, vocab_size)
+
+    def forward(self, x):
+        e = self.emb(x)
+        h, _ = self.rnn(e)
+        return self.head(h)
+
+
+def masked_ce(logits, targets, mask, pad_id: int):
+    B, L, V = logits.shape
+    loss = nn.functional.cross_entropy(
+        logits.reshape(B * L, V),
+        targets.reshape(B * L),
+        ignore_index=pad_id,
+        reduction="none",
+    ).reshape(B, L)
+    loss = (loss * mask).sum() / (mask.sum() + 1e-8)
+    return loss
+
+
+@torch.no_grad()
+def greedy_decode(
+    model, vocab: CharVocab, prompt: str, max_new: int = 512, device: str = "cpu"
+):
+    model.eval()
+    ctx = [vocab.bos_id] + vocab.encode(prompt + "\n# solution:\n")
+    x = torch.tensor([ctx], dtype=torch.long,
device=device) + out = ctx.copy() + for _ in range(max_new): + logits = model(x)[:, -1, :] + nid = int(torch.argmax(logits, dim=-1).item()) + out.append(nid) + x = torch.tensor([out], dtype=torch.long, device=device) + if nid == vocab.eos_id: + break + gen = out[len(ctx) :] + return vocab.decode(gen) + + +def ast_ok(code: str) -> bool: + try: + ast.parse(code) + return True + except SyntaxError: + return False + + +def undef_refs(code: str) -> int: + try: + tree = ast.parse(code) + except SyntaxError: + return 999 + defined, used = set(), set() + + class V(ast.NodeVisitor): + def visit_FunctionDef(self, node): + defined.add(node.name) + self.generic_visit(node) + + def visit_ClassDef(self, node): + defined.add(node.name) + self.generic_visit(node) + + def visit_Assign(self, node): + for t in node.targets: + if hasattr(t, "id"): + defined.add(t.id) + self.generic_visit(node) + + def visit_Name(self, node): + if isinstance(node.ctx, ast.Load): + used.add(node.id) + + V().visit(tree) + ignore = set(dir(__builtins__)) | {"True", "False", "None", "self"} + unresolved = [n for n in used if n not in defined and n not in ignore] + return len(unresolved) + + +def text_sim(a: str, b: str) -> float: + return difflib.SequenceMatcher(None, a, b).ratio() + + +def pass1_proxy(gen: str, ref: str) -> int: + return int(gen.strip() == ref.strip()) + + +def main( + out_dir: str, + epochs: int, + batch_size: int, + lr: float, + emb: int, + hidden: int, + max_len: int, + decode_max: int, + max_items: int, +): + os.makedirs(out_dir, exist_ok=True) + items = load_humaneval(max_items=max_items) + train, val, test = pseudo_split(items) + + vocab = CharVocab( + [ex["prompt"] for ex in train] + [ex["reference"] for ex in train] + ) + train_ds = HEDataset(train, vocab, max_len=max_len) + val_ds = HEDataset(val, vocab, max_len=max_len) + test_ds = HEDataset(test, vocab, max_len=max_len) + + coll = lambda b: collate(b, vocab.pad_id) + train_loader = DataLoader( + train_ds, batch_size=batch_size, shuffle=True, collate_fn=coll + ) + val_loader = DataLoader( + val_ds, batch_size=batch_size, shuffle=False, collate_fn=coll + ) + test_loader = DataLoader( + test_ds, batch_size=batch_size, shuffle=False, collate_fn=coll + ) + + device = "cuda" if torch.cuda.is_available() else "cpu" + model = CharGRU(vocab_size=len(vocab.itos), emb=emb, hidden=hidden).to(device) + opt = torch.optim.Adam(model.parameters(), lr=lr) + + best = 1e9 + for ep in range(1, epochs + 1): + model.train() + tot = 0.0 + steps = 0 + for X, Y, M, _, _ in train_loader: + X, Y, M = X.to(device), Y.to(device), M.to(device) + opt.zero_grad() + logits = model(X) + loss = masked_ce(logits, Y, M, vocab.pad_id) + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) + opt.step() + tot += loss.item() + steps += 1 + tr_loss = tot / max(1, steps) + + model.eval() + vtot = 0.0 + vsteps = 0 + with torch.no_grad(): + for X, Y, M, _, _ in val_loader: + X, Y, M = X.to(device), Y.to(device), M.to(device) + vtot += masked_ce(model(X), Y, M, vocab.pad_id).item() + vsteps += 1 + val_loss = vtot / max(1, vsteps) + print(f"[Epoch {ep}] train_loss={tr_loss:.4f} val_loss={val_loss:.4f}") + if val_loss < best: + best = val_loss + torch.save(model.state_dict(), os.path.join(out_dir, "best.pt")) + + # Reload best and evaluate on test with greedy decode + if os.path.exists(os.path.join(out_dir, "best.pt")): + model.load_state_dict( + torch.load(os.path.join(out_dir, "best.pt"), map_location=device) + ) + + gens = [] + ast_cnt = 0 + undef_sum = 0 + sim_sum = 0.0 + 
pass_cnt = 0 + for X, Y, M, refs, tids in test_loader: + for ref, tid in zip(refs, tids): + gen = greedy_decode( + model, + vocab, + ( + items[0]["prompt"] + if False + else next(ex["prompt"] for ex in test if ex["task_id"] == tid) + ), + max_new=decode_max, + device=device, + ) + gens.append({"task_id": tid, "generated": gen, "reference": ref}) + ast_cnt += int(ast_ok(gen)) + undef_sum += undef_refs(gen) + sim_sum += text_sim(gen, ref) + pass_cnt += pass1_proxy(gen, ref) + + n = len(test) + results = { + "Dataset": f"HumanEval pseudo-split (n_train={len(train)}, n_val={len(val)}, n_test={len(test)})", + "Test": { + "AST_Parse_Rate": ast_cnt / max(1, n), + "UndefinedRef_Avg": undef_sum / max(1, n), + "TextSim_Avg": sim_sum / max(1, n), + "pass@1_proxy": pass_cnt / max(1, n), + }, + "Notes": "Plug a unit-test runner for true pass@k.", + } + with open(os.path.join(out_dir, "final_info.json"), "w") as f: + json.dump(results, f, indent=2) + with open(os.path.join(out_dir, "generations.jsonl"), "w") as f: + for g in gens: + f.write(json.dumps(g) + "\n") + print(json.dumps(results, indent=2)) + + +if __name__ == "__main__": + p = argparse.ArgumentParser() + p.add_argument("--out_dir", type=str, required=True) + p.add_argument("--epochs", type=int, default=3) + p.add_argument("--batch_size", type=int, default=8) + p.add_argument("--lr", type=float, default=1e-3) + p.add_argument("--emb", type=int, default=128) + p.add_argument("--hidden", type=int, default=256) + p.add_argument("--max_len", type=int, default=1024) + p.add_argument("--decode_max", type=int, default=512) + p.add_argument("--max_items", type=int, default=164) + args = p.parse_args() + main( + args.out_dir, + args.epochs, + args.batch_size, + args.lr, + args.emb, + args.hidden, + args.max_len, + args.decode_max, + args.max_items, + ) diff --git a/frontend/demo_cache/generated/experiments/idea-1/run_1/final_info.json b/frontend/demo_cache/generated/experiments/idea-1/run_1/final_info.json new file mode 100644 index 00000000..641508c5 --- /dev/null +++ b/frontend/demo_cache/generated/experiments/idea-1/run_1/final_info.json @@ -0,0 +1,10 @@ +{ + "Dataset": "HumanEval pseudo-split (n_train=114, n_val=24, n_test=26)", + "Test": { + "AST_Parse_Rate": 1.0, + "UndefinedRef_Avg": 0.0, + "TextSim_Avg": 0.04661257702428245, + "pass@1_proxy": 0.0 + }, + "Notes": "Plug a unit-test runner for true pass@k." +} \ No newline at end of file diff --git a/frontend/demo_cache/generated/experiments/idea-1/run_1/notes.txt b/frontend/demo_cache/generated/experiments/idea-1/run_1/notes.txt new file mode 100644 index 00000000..7da54b02 --- /dev/null +++ b/frontend/demo_cache/generated/experiments/idea-1/run_1/notes.txt @@ -0,0 +1,6 @@ +Auto-generated notes for run_1 +Updated: 2025-11-05T19:10:30.023769Z + +- Dataset: HumanEval pseudo-split (n_train=114, n_val=24, n_test=26) +- Notes: Plug a unit-test runner for true pass@k. 
+- Test: {'AST_Parse_Rate': 1.0, 'UndefinedRef_Avg': 0.0, 'TextSim_Avg': 0.04661257702428245, 'pass@1_proxy': 0.0} diff --git a/frontend/demo_cache/generated/experiments/idea-1/run_2.py b/frontend/demo_cache/generated/experiments/idea-1/run_2.py new file mode 100644 index 00000000..68eccf72 --- /dev/null +++ b/frontend/demo_cache/generated/experiments/idea-1/run_2.py @@ -0,0 +1,368 @@ +#!/usr/bin/env python3 +# experiment.py — HumanEval prompt→solution char-GRU training (keeps "training" intact) +# Outputs: final_info.json (metrics) + generations.jsonl +# +# Usage: +# python experiment.py --out_dir runs/idea1 --epochs 3 --batch_size 8 +# +# Notes: +# - We use a deterministic pseudo-split (70/15/15) from HumanEval test-only items to preserve train/val/test structure. +# - For true pass@k, plug your unit-test runner where indicated. + +import argparse +import json +import os +import random +import ast +import difflib +from typing import List, Dict, Tuple + +try: + from datasets import load_dataset # pip install datasets +except Exception: + load_dataset = None + +import torch +import torch.nn as nn +from torch.utils.data import Dataset, DataLoader + +SEED = 42 +random.seed(SEED) +torch.manual_seed(SEED) + + +def load_humaneval(max_items: int = 164) -> List[Dict[str, str]]: + if load_dataset is None: + raise RuntimeError( + "Please install `datasets` (pip install datasets) to load HumanEval." + ) + dataset = load_dataset("openai_humaneval") + split_name = "test" if "test" in dataset else next(iter(dataset.keys())) + records = dataset[split_name] + tasks = [] + for i, ex in enumerate(records): + if i >= max_items: + break + prompt = ex.get("prompt", "") + reference = ex.get("canonical_solution", "") + if prompt and reference: + tasks.append( + { + "task_id": ex.get("task_id", f"HE-{i}"), + "prompt": prompt, + "reference": reference, + } + ) + return tasks + + +def pseudo_split(items: List[Dict[str, str]]) -> Tuple[List, List, List]: + idx = list(range(len(items))) + random.Random(SEED).shuffle(idx) + n = len(items) + n_train = int(0.7 * n) + n_val = int(0.15 * n) + train = [items[i] for i in idx[:n_train]] + val = [items[i] for i in idx[n_train : n_train + n_val]] + test = [items[i] for i in idx[n_train + n_val :]] + return train, val, test + + +class CharVocab: + def __init__(self, texts: List[str]): + specials = ["", "", "", ""] + charset = set() + for t in texts: + charset.update(t) + self.itos = specials + sorted(ch for ch in charset if ch not in specials) + self.stoi = {ch: i for i, ch in enumerate(self.itos)} + self.pad_id = self.stoi[""] + self.bos_id = self.stoi[""] + self.eos_id = self.stoi[""] + self.sep_id = self.stoi[""] + + def encode(self, s: str) -> List[int]: + return [self.stoi.get(ch, self.sep_id) for ch in s] + + def decode(self, ids: List[int]) -> str: + return "".join(self.itos[i] for i in ids if 0 <= i < len(self.itos)) + + +class HEDataset(Dataset): + def __init__( + self, pairs: List[Dict[str, str]], vocab: CharVocab, max_len: int = 1024 + ): + self.vocab = vocab + self.max_len = max_len + self.data = [] + for ex in pairs: + ctx_ids = [vocab.bos_id] + vocab.encode(ex["prompt"] + "\n# solution:\n") + tgt_ids = vocab.encode(ex["reference"].rstrip() + "\n") + [vocab.eos_id] + x = (ctx_ids + tgt_ids)[:max_len] + ctx_len = min(len(ctx_ids), len(x)) + y = x[1:] + [vocab.eos_id] + mask = [0] * ctx_len + [1] * (len(x) - ctx_len) + self.data.append((x, y, mask, ex["reference"], ex["task_id"])) + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + 
return self.data[idx] + + +def collate(batch, pad_id: int): + L = max(len(x) for x, _, _, _, _ in batch) + X = [] + Y = [] + M = [] + refs = [] + tids = [] + for x, y, m, ref, tid in batch: + pad = L - len(x) + X.append(x + [pad_id] * pad) + Y.append(y + [pad_id] * pad) + M.append(m + [0] * pad) + refs.append(ref) + tids.append(tid) + return ( + torch.tensor(X, dtype=torch.long), + torch.tensor(Y, dtype=torch.long), + torch.tensor(M, dtype=torch.float32), + refs, + tids, + ) + + +class CharGRU(nn.Module): + def __init__(self, vocab_size: int, emb: int = 128, hidden: int = 256): + super().__init__() + self.emb = nn.Embedding(vocab_size, emb, padding_idx=0) + self.rnn = nn.GRU(emb, hidden, num_layers=1, batch_first=True) + self.head = nn.Linear(hidden, vocab_size) + + def forward(self, x): + e = self.emb(x) + h, _ = self.rnn(e) + return self.head(h) + + +def masked_ce(logits, targets, mask, pad_id: int): + B, L, V = logits.shape + loss = nn.functional.cross_entropy( + logits.reshape(B * L, V), + targets.reshape(B * L), + ignore_index=pad_id, + reduction="none", + ).reshape(B, L) + loss = (loss * mask).sum() / (mask.sum() + 1e-8) + return loss + + +@torch.no_grad() +def greedy_decode( + model, vocab: CharVocab, prompt: str, max_new: int = 512, device: str = "cpu" +): + model.eval() + ctx = [vocab.bos_id] + vocab.encode(prompt + "\n# solution:\n") + x = torch.tensor([ctx], dtype=torch.long, device=device) + out = ctx.copy() + for _ in range(max_new): + logits = model(x)[:, -1, :] + nid = int(torch.argmax(logits, dim=-1).item()) + out.append(nid) + x = torch.tensor([out], dtype=torch.long, device=device) + if nid == vocab.eos_id: + break + gen = out[len(ctx) :] + return vocab.decode(gen) + + +def ast_ok(code: str) -> bool: + try: + ast.parse(code) + return True + except SyntaxError: + return False + + +def undef_refs(code: str) -> int: + try: + tree = ast.parse(code) + except SyntaxError: + return 999 + defined, used = set(), set() + + class V(ast.NodeVisitor): + def visit_FunctionDef(self, node): + defined.add(node.name) + self.generic_visit(node) + + def visit_ClassDef(self, node): + defined.add(node.name) + self.generic_visit(node) + + def visit_Assign(self, node): + for t in node.targets: + if hasattr(t, "id"): + defined.add(t.id) + self.generic_visit(node) + + def visit_Name(self, node): + if isinstance(node.ctx, ast.Load): + used.add(node.id) + + V().visit(tree) + ignore = set(dir(__builtins__)) | {"True", "False", "None", "self"} + unresolved = [n for n in used if n not in defined and n not in ignore] + return len(unresolved) + + +def text_sim(a: str, b: str) -> float: + return difflib.SequenceMatcher(None, a, b).ratio() + + +def pass1_proxy(gen: str, ref: str) -> int: + return int(gen.strip() == ref.strip()) + + +def main( + out_dir: str, + epochs: int, + batch_size: int, + lr: float, + emb: int, + hidden: int, + max_len: int, + decode_max: int, + max_items: int, +): + os.makedirs(out_dir, exist_ok=True) + items = load_humaneval(max_items=max_items) + train, val, test = pseudo_split(items) + + vocab = CharVocab( + [ex["prompt"] for ex in train] + [ex["reference"] for ex in train] + ) + train_ds = HEDataset(train, vocab, max_len=max_len) + val_ds = HEDataset(val, vocab, max_len=max_len) + test_ds = HEDataset(test, vocab, max_len=max_len) + + coll = lambda b: collate(b, vocab.pad_id) + train_loader = DataLoader( + train_ds, batch_size=batch_size, shuffle=True, collate_fn=coll + ) + val_loader = DataLoader( + val_ds, batch_size=batch_size, shuffle=False, collate_fn=coll + ) + test_loader = 
DataLoader( + test_ds, batch_size=batch_size, shuffle=False, collate_fn=coll + ) + + device = "cuda" if torch.cuda.is_available() else "cpu" + model = CharGRU(vocab_size=len(vocab.itos), emb=emb, hidden=hidden).to(device) + opt = torch.optim.Adam(model.parameters(), lr=lr) + + best = 1e9 + for ep in range(1, epochs + 1): + model.train() + tot = 0.0 + steps = 0 + for X, Y, M, _, _ in train_loader: + X, Y, M = X.to(device), Y.to(device), M.to(device) + opt.zero_grad() + logits = model(X) + loss = masked_ce(logits, Y, M, vocab.pad_id) + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) + opt.step() + tot += loss.item() + steps += 1 + tr_loss = tot / max(1, steps) + + model.eval() + vtot = 0.0 + vsteps = 0 + with torch.no_grad(): + for X, Y, M, _, _ in val_loader: + X, Y, M = X.to(device), Y.to(device), M.to(device) + vtot += masked_ce(model(X), Y, M, vocab.pad_id).item() + vsteps += 1 + val_loss = vtot / max(1, vsteps) + print(f"[Epoch {ep}] train_loss={tr_loss:.4f} val_loss={val_loss:.4f}") + if val_loss < best: + best = val_loss + torch.save(model.state_dict(), os.path.join(out_dir, "best.pt")) + + # Reload best and evaluate on test with greedy decode + if os.path.exists(os.path.join(out_dir, "best.pt")): + model.load_state_dict( + torch.load(os.path.join(out_dir, "best.pt"), map_location=device) + ) + + gens = [] + ast_cnt = 0 + undef_sum = 0 + sim_sum = 0.0 + pass_cnt = 0 + for X, Y, M, refs, tids in test_loader: + for ref, tid in zip(refs, tids): + gen = greedy_decode( + model, + vocab, + ( + items[0]["prompt"] + if False + else next(ex["prompt"] for ex in test if ex["task_id"] == tid) + ), + max_new=decode_max, + device=device, + ) + gens.append({"task_id": tid, "generated": gen, "reference": ref}) + ast_cnt += int(ast_ok(gen)) + undef_sum += undef_refs(gen) + sim_sum += text_sim(gen, ref) + pass_cnt += pass1_proxy(gen, ref) + + n = len(test) + results = { + "Dataset": f"HumanEval pseudo-split (n_train={len(train)}, n_val={len(val)}, n_test={len(test)})", + "Test": { + "AST_Parse_Rate": ast_cnt / max(1, n), + "UndefinedRef_Avg": undef_sum / max(1, n), + "TextSim_Avg": sim_sum / max(1, n), + "pass@1_proxy": pass_cnt / max(1, n), + }, + "Notes": "Plug a unit-test runner for true pass@k.", + } + with open(os.path.join(out_dir, "final_info.json"), "w") as f: + json.dump(results, f, indent=2) + with open(os.path.join(out_dir, "generations.jsonl"), "w") as f: + for g in gens: + f.write(json.dumps(g) + "\n") + print(json.dumps(results, indent=2)) + + +if __name__ == "__main__": + p = argparse.ArgumentParser() + p.add_argument("--out_dir", type=str, required=True) + p.add_argument("--epochs", type=int, default=3) + p.add_argument("--batch_size", type=int, default=8) + p.add_argument("--lr", type=float, default=1e-3) + p.add_argument("--emb", type=int, default=128) + p.add_argument("--hidden", type=int, default=256) + p.add_argument("--max_len", type=int, default=1024) + p.add_argument("--decode_max", type=int, default=512) + p.add_argument("--max_items", type=int, default=164) + args = p.parse_args() + main( + args.out_dir, + args.epochs, + args.batch_size, + args.lr, + args.emb, + args.hidden, + args.max_len, + args.decode_max, + args.max_items, + ) diff --git a/frontend/demo_cache/generated/experiments/idea-1/run_2/final_info.json b/frontend/demo_cache/generated/experiments/idea-1/run_2/final_info.json new file mode 100644 index 00000000..641508c5 --- /dev/null +++ b/frontend/demo_cache/generated/experiments/idea-1/run_2/final_info.json @@ -0,0 +1,10 @@ +{ + "Dataset": 
"HumanEval pseudo-split (n_train=114, n_val=24, n_test=26)", + "Test": { + "AST_Parse_Rate": 1.0, + "UndefinedRef_Avg": 0.0, + "TextSim_Avg": 0.04661257702428245, + "pass@1_proxy": 0.0 + }, + "Notes": "Plug a unit-test runner for true pass@k." +} \ No newline at end of file diff --git a/frontend/demo_cache/generated/experiments/idea-1/run_2/notes.txt b/frontend/demo_cache/generated/experiments/idea-1/run_2/notes.txt new file mode 100644 index 00000000..af00905b --- /dev/null +++ b/frontend/demo_cache/generated/experiments/idea-1/run_2/notes.txt @@ -0,0 +1,6 @@ +Auto-generated notes for run_2 +Updated: 2025-11-05T19:13:13.157468Z + +- Dataset: HumanEval pseudo-split (n_train=114, n_val=24, n_test=26) +- Notes: Plug a unit-test runner for true pass@k. +- Test: {'AST_Parse_Rate': 1.0, 'UndefinedRef_Avg': 0.0, 'TextSim_Avg': 0.04661257702428245, 'pass@1_proxy': 0.0} diff --git a/frontend/demo_cache/generated/experiments/idea-1/run_3.py b/frontend/demo_cache/generated/experiments/idea-1/run_3.py new file mode 100644 index 00000000..68eccf72 --- /dev/null +++ b/frontend/demo_cache/generated/experiments/idea-1/run_3.py @@ -0,0 +1,368 @@ +#!/usr/bin/env python3 +# experiment.py — HumanEval prompt→solution char-GRU training (keeps "training" intact) +# Outputs: final_info.json (metrics) + generations.jsonl +# +# Usage: +# python experiment.py --out_dir runs/idea1 --epochs 3 --batch_size 8 +# +# Notes: +# - We use a deterministic pseudo-split (70/15/15) from HumanEval test-only items to preserve train/val/test structure. +# - For true pass@k, plug your unit-test runner where indicated. + +import argparse +import json +import os +import random +import ast +import difflib +from typing import List, Dict, Tuple + +try: + from datasets import load_dataset # pip install datasets +except Exception: + load_dataset = None + +import torch +import torch.nn as nn +from torch.utils.data import Dataset, DataLoader + +SEED = 42 +random.seed(SEED) +torch.manual_seed(SEED) + + +def load_humaneval(max_items: int = 164) -> List[Dict[str, str]]: + if load_dataset is None: + raise RuntimeError( + "Please install `datasets` (pip install datasets) to load HumanEval." 
+ ) + dataset = load_dataset("openai_humaneval") + split_name = "test" if "test" in dataset else next(iter(dataset.keys())) + records = dataset[split_name] + tasks = [] + for i, ex in enumerate(records): + if i >= max_items: + break + prompt = ex.get("prompt", "") + reference = ex.get("canonical_solution", "") + if prompt and reference: + tasks.append( + { + "task_id": ex.get("task_id", f"HE-{i}"), + "prompt": prompt, + "reference": reference, + } + ) + return tasks + + +def pseudo_split(items: List[Dict[str, str]]) -> Tuple[List, List, List]: + idx = list(range(len(items))) + random.Random(SEED).shuffle(idx) + n = len(items) + n_train = int(0.7 * n) + n_val = int(0.15 * n) + train = [items[i] for i in idx[:n_train]] + val = [items[i] for i in idx[n_train : n_train + n_val]] + test = [items[i] for i in idx[n_train + n_val :]] + return train, val, test + + +class CharVocab: + def __init__(self, texts: List[str]): + specials = ["", "", "", ""] + charset = set() + for t in texts: + charset.update(t) + self.itos = specials + sorted(ch for ch in charset if ch not in specials) + self.stoi = {ch: i for i, ch in enumerate(self.itos)} + self.pad_id = self.stoi[""] + self.bos_id = self.stoi[""] + self.eos_id = self.stoi[""] + self.sep_id = self.stoi[""] + + def encode(self, s: str) -> List[int]: + return [self.stoi.get(ch, self.sep_id) for ch in s] + + def decode(self, ids: List[int]) -> str: + return "".join(self.itos[i] for i in ids if 0 <= i < len(self.itos)) + + +class HEDataset(Dataset): + def __init__( + self, pairs: List[Dict[str, str]], vocab: CharVocab, max_len: int = 1024 + ): + self.vocab = vocab + self.max_len = max_len + self.data = [] + for ex in pairs: + ctx_ids = [vocab.bos_id] + vocab.encode(ex["prompt"] + "\n# solution:\n") + tgt_ids = vocab.encode(ex["reference"].rstrip() + "\n") + [vocab.eos_id] + x = (ctx_ids + tgt_ids)[:max_len] + ctx_len = min(len(ctx_ids), len(x)) + y = x[1:] + [vocab.eos_id] + mask = [0] * ctx_len + [1] * (len(x) - ctx_len) + self.data.append((x, y, mask, ex["reference"], ex["task_id"])) + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + return self.data[idx] + + +def collate(batch, pad_id: int): + L = max(len(x) for x, _, _, _, _ in batch) + X = [] + Y = [] + M = [] + refs = [] + tids = [] + for x, y, m, ref, tid in batch: + pad = L - len(x) + X.append(x + [pad_id] * pad) + Y.append(y + [pad_id] * pad) + M.append(m + [0] * pad) + refs.append(ref) + tids.append(tid) + return ( + torch.tensor(X, dtype=torch.long), + torch.tensor(Y, dtype=torch.long), + torch.tensor(M, dtype=torch.float32), + refs, + tids, + ) + + +class CharGRU(nn.Module): + def __init__(self, vocab_size: int, emb: int = 128, hidden: int = 256): + super().__init__() + self.emb = nn.Embedding(vocab_size, emb, padding_idx=0) + self.rnn = nn.GRU(emb, hidden, num_layers=1, batch_first=True) + self.head = nn.Linear(hidden, vocab_size) + + def forward(self, x): + e = self.emb(x) + h, _ = self.rnn(e) + return self.head(h) + + +def masked_ce(logits, targets, mask, pad_id: int): + B, L, V = logits.shape + loss = nn.functional.cross_entropy( + logits.reshape(B * L, V), + targets.reshape(B * L), + ignore_index=pad_id, + reduction="none", + ).reshape(B, L) + loss = (loss * mask).sum() / (mask.sum() + 1e-8) + return loss + + +@torch.no_grad() +def greedy_decode( + model, vocab: CharVocab, prompt: str, max_new: int = 512, device: str = "cpu" +): + model.eval() + ctx = [vocab.bos_id] + vocab.encode(prompt + "\n# solution:\n") + x = torch.tensor([ctx], dtype=torch.long, 
device=device) + out = ctx.copy() + for _ in range(max_new): + logits = model(x)[:, -1, :] + nid = int(torch.argmax(logits, dim=-1).item()) + out.append(nid) + x = torch.tensor([out], dtype=torch.long, device=device) + if nid == vocab.eos_id: + break + gen = out[len(ctx) :] + return vocab.decode(gen) + + +def ast_ok(code: str) -> bool: + try: + ast.parse(code) + return True + except SyntaxError: + return False + + +def undef_refs(code: str) -> int: + try: + tree = ast.parse(code) + except SyntaxError: + return 999 + defined, used = set(), set() + + class V(ast.NodeVisitor): + def visit_FunctionDef(self, node): + defined.add(node.name) + self.generic_visit(node) + + def visit_ClassDef(self, node): + defined.add(node.name) + self.generic_visit(node) + + def visit_Assign(self, node): + for t in node.targets: + if hasattr(t, "id"): + defined.add(t.id) + self.generic_visit(node) + + def visit_Name(self, node): + if isinstance(node.ctx, ast.Load): + used.add(node.id) + + V().visit(tree) + ignore = set(dir(__builtins__)) | {"True", "False", "None", "self"} + unresolved = [n for n in used if n not in defined and n not in ignore] + return len(unresolved) + + +def text_sim(a: str, b: str) -> float: + return difflib.SequenceMatcher(None, a, b).ratio() + + +def pass1_proxy(gen: str, ref: str) -> int: + return int(gen.strip() == ref.strip()) + + +def main( + out_dir: str, + epochs: int, + batch_size: int, + lr: float, + emb: int, + hidden: int, + max_len: int, + decode_max: int, + max_items: int, +): + os.makedirs(out_dir, exist_ok=True) + items = load_humaneval(max_items=max_items) + train, val, test = pseudo_split(items) + + vocab = CharVocab( + [ex["prompt"] for ex in train] + [ex["reference"] for ex in train] + ) + train_ds = HEDataset(train, vocab, max_len=max_len) + val_ds = HEDataset(val, vocab, max_len=max_len) + test_ds = HEDataset(test, vocab, max_len=max_len) + + coll = lambda b: collate(b, vocab.pad_id) + train_loader = DataLoader( + train_ds, batch_size=batch_size, shuffle=True, collate_fn=coll + ) + val_loader = DataLoader( + val_ds, batch_size=batch_size, shuffle=False, collate_fn=coll + ) + test_loader = DataLoader( + test_ds, batch_size=batch_size, shuffle=False, collate_fn=coll + ) + + device = "cuda" if torch.cuda.is_available() else "cpu" + model = CharGRU(vocab_size=len(vocab.itos), emb=emb, hidden=hidden).to(device) + opt = torch.optim.Adam(model.parameters(), lr=lr) + + best = 1e9 + for ep in range(1, epochs + 1): + model.train() + tot = 0.0 + steps = 0 + for X, Y, M, _, _ in train_loader: + X, Y, M = X.to(device), Y.to(device), M.to(device) + opt.zero_grad() + logits = model(X) + loss = masked_ce(logits, Y, M, vocab.pad_id) + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) + opt.step() + tot += loss.item() + steps += 1 + tr_loss = tot / max(1, steps) + + model.eval() + vtot = 0.0 + vsteps = 0 + with torch.no_grad(): + for X, Y, M, _, _ in val_loader: + X, Y, M = X.to(device), Y.to(device), M.to(device) + vtot += masked_ce(model(X), Y, M, vocab.pad_id).item() + vsteps += 1 + val_loss = vtot / max(1, vsteps) + print(f"[Epoch {ep}] train_loss={tr_loss:.4f} val_loss={val_loss:.4f}") + if val_loss < best: + best = val_loss + torch.save(model.state_dict(), os.path.join(out_dir, "best.pt")) + + # Reload best and evaluate on test with greedy decode + if os.path.exists(os.path.join(out_dir, "best.pt")): + model.load_state_dict( + torch.load(os.path.join(out_dir, "best.pt"), map_location=device) + ) + + gens = [] + ast_cnt = 0 + undef_sum = 0 + sim_sum = 0.0 + 
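greedy_decode above re-runs the model over the full sequence for every generated character, which is quadratic in output length. An equivalent greedy loop that carries the GRU hidden state forward is linear; a sketch, assuming the emb/rnn/head layout of CharGRU above:

import torch


@torch.no_grad()
def greedy_decode_cached(model, vocab, prompt: str, max_new: int = 512, device: str = "cpu") -> str:
    # Consume the prompt once, then feed one character at a time while
    # reusing the GRU hidden state instead of re-encoding the whole sequence.
    model.eval()
    ctx = [vocab.bos_id] + vocab.encode(prompt + "\n# solution:\n")
    x = torch.tensor([ctx], dtype=torch.long, device=device)
    out, h = model.rnn(model.emb(x))
    logits = model.head(out[:, -1, :])
    generated = []
    for _ in range(max_new):
        nid = int(torch.argmax(logits, dim=-1).item())
        if nid == vocab.eos_id:
            break
        generated.append(nid)
        step = torch.tensor([[nid]], dtype=torch.long, device=device)
        out, h = model.rnn(model.emb(step), h)
        logits = model.head(out[:, -1, :])
    return vocab.decode(generated)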
pass_cnt = 0 + for X, Y, M, refs, tids in test_loader: + for ref, tid in zip(refs, tids): + gen = greedy_decode( + model, + vocab, + ( + items[0]["prompt"] + if False + else next(ex["prompt"] for ex in test if ex["task_id"] == tid) + ), + max_new=decode_max, + device=device, + ) + gens.append({"task_id": tid, "generated": gen, "reference": ref}) + ast_cnt += int(ast_ok(gen)) + undef_sum += undef_refs(gen) + sim_sum += text_sim(gen, ref) + pass_cnt += pass1_proxy(gen, ref) + + n = len(test) + results = { + "Dataset": f"HumanEval pseudo-split (n_train={len(train)}, n_val={len(val)}, n_test={len(test)})", + "Test": { + "AST_Parse_Rate": ast_cnt / max(1, n), + "UndefinedRef_Avg": undef_sum / max(1, n), + "TextSim_Avg": sim_sum / max(1, n), + "pass@1_proxy": pass_cnt / max(1, n), + }, + "Notes": "Plug a unit-test runner for true pass@k.", + } + with open(os.path.join(out_dir, "final_info.json"), "w") as f: + json.dump(results, f, indent=2) + with open(os.path.join(out_dir, "generations.jsonl"), "w") as f: + for g in gens: + f.write(json.dumps(g) + "\n") + print(json.dumps(results, indent=2)) + + +if __name__ == "__main__": + p = argparse.ArgumentParser() + p.add_argument("--out_dir", type=str, required=True) + p.add_argument("--epochs", type=int, default=3) + p.add_argument("--batch_size", type=int, default=8) + p.add_argument("--lr", type=float, default=1e-3) + p.add_argument("--emb", type=int, default=128) + p.add_argument("--hidden", type=int, default=256) + p.add_argument("--max_len", type=int, default=1024) + p.add_argument("--decode_max", type=int, default=512) + p.add_argument("--max_items", type=int, default=164) + args = p.parse_args() + main( + args.out_dir, + args.epochs, + args.batch_size, + args.lr, + args.emb, + args.hidden, + args.max_len, + args.decode_max, + args.max_items, + ) diff --git a/frontend/demo_cache/generated/experiments/idea-1/run_3/final_info.json b/frontend/demo_cache/generated/experiments/idea-1/run_3/final_info.json new file mode 100644 index 00000000..641508c5 --- /dev/null +++ b/frontend/demo_cache/generated/experiments/idea-1/run_3/final_info.json @@ -0,0 +1,10 @@ +{ + "Dataset": "HumanEval pseudo-split (n_train=114, n_val=24, n_test=26)", + "Test": { + "AST_Parse_Rate": 1.0, + "UndefinedRef_Avg": 0.0, + "TextSim_Avg": 0.04661257702428245, + "pass@1_proxy": 0.0 + }, + "Notes": "Plug a unit-test runner for true pass@k." +} \ No newline at end of file diff --git a/frontend/demo_cache/generated/experiments/idea-1/run_3/notes.txt b/frontend/demo_cache/generated/experiments/idea-1/run_3/notes.txt new file mode 100644 index 00000000..860e4210 --- /dev/null +++ b/frontend/demo_cache/generated/experiments/idea-1/run_3/notes.txt @@ -0,0 +1,6 @@ +Auto-generated notes for run_3 +Updated: 2025-11-05T19:16:01.641408Z + +- Dataset: HumanEval pseudo-split (n_train=114, n_val=24, n_test=26) +- Notes: Plug a unit-test runner for true pass@k. 
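In the evaluation loop above, the `items[0]["prompt"] if False else ...` expression is a dead branch around a linear scan of `test` for every task; a dictionary keyed by task_id is a simpler equivalent (a sketch that reuses the names from main above):

# Reuses test, test_loader, model, vocab, decode_max and device from main().
prompt_by_tid = {ex["task_id"]: ex["prompt"] for ex in test}

for _, _, _, refs, tids in test_loader:
    for ref, tid in zip(refs, tids):
        gen = greedy_decode(
            model, vocab, prompt_by_tid[tid], max_new=decode_max, device=device
        )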
+- Test: {'AST_Parse_Rate': 1.0, 'UndefinedRef_Avg': 0.0, 'TextSim_Avg': 0.04661257702428245, 'pass@1_proxy': 0.0} diff --git a/frontend/demo_cache/generated/experiments/idea-2/experiment.py b/frontend/demo_cache/generated/experiments/idea-2/experiment.py new file mode 100644 index 00000000..bf91bcb8 --- /dev/null +++ b/frontend/demo_cache/generated/experiments/idea-2/experiment.py @@ -0,0 +1,348 @@ +import argparse +import json +import os +import random +import ast +import difflib +from typing import Dict, List, Tuple + +import torch +import torch.nn as nn +from torch.utils.data import DataLoader, Dataset + +try: + from datasets import load_dataset # pip install datasets +except Exception: + load_dataset = None + +SEED = 42 +random.seed(SEED) +torch.manual_seed(SEED) + + +# ----------------------------- +# Data +# ----------------------------- +def _load_humaneval(max_items: int = 164) -> List[Dict[str, str]]: + if load_dataset is None: + raise RuntimeError( + "Please install `datasets` (pip install datasets) to load HumanEval." + ) + dataset = load_dataset("openai_humaneval") + split_name = "test" if "test" in dataset else next(iter(dataset.keys())) + records = dataset[split_name] + items = [] + for i, ex in enumerate(records): + if i >= max_items: + break + prompt = ex.get("prompt", "") + ref = ex.get("canonical_solution", "") + if prompt and ref: + items.append( + { + "task_id": ex.get("task_id", f"HE-{i}"), + "prompt": prompt, + "reference": ref, + } + ) + return items + + +def load_data(): + """Keep function name. Return train/val/test DataLoaders for training a char-level GRU LM.""" + items = _load_humaneval() + idx = list(range(len(items))) + random.Random(SEED).shuffle(idx) + n = len(items) + n_tr, n_val = int(0.7 * n), int(0.15 * n) + train_items = [items[i] for i in idx[:n_tr]] + val_items = [items[i] for i in idx[n_tr : n_tr + n_val]] + test_items = [items[i] for i in idx[n_tr + n_val :]] + + # Build vocab on train set (prompts + refs) + vocab = CharVocab( + [ex["prompt"] for ex in train_items] + [ex["reference"] for ex in train_items] + ) + + max_len = 1024 + train_ds = HEDataset(train_items, vocab, max_len=max_len) + val_ds = HEDataset(val_items, vocab, max_len=max_len) + test_ds = HEDataset(test_items, vocab, max_len=max_len) + + train_loader = DataLoader( + train_ds, + batch_size=8, + shuffle=True, + collate_fn=lambda b: collate_fn(b, vocab.pad_id), + ) + val_loader = DataLoader( + val_ds, + batch_size=8, + shuffle=False, + collate_fn=lambda b: collate_fn(b, vocab.pad_id), + ) + test_loader = DataLoader( + test_ds, + batch_size=8, + shuffle=False, + collate_fn=lambda b: collate_fn(b, vocab.pad_id), + ) + + # Return loaders and aux (we keep signature minimal by stashing vocab on dataset) + return train_loader, val_loader, test_loader + + +class CharVocab: + def __init__(self, texts: List[str]): + specials = ["", "", "", ""] + charset = set() + for t in texts: + charset.update(t) + self.itos = specials + sorted(ch for ch in charset if ch not in specials) + self.stoi = {ch: i for i, ch in enumerate(self.itos)} + self.pad_id = self.stoi[""] + self.bos_id = self.stoi[""] + self.eos_id = self.stoi[""] + self.sep_id = self.stoi[""] + + def encode(self, s: str) -> List[int]: + return [self.stoi.get(ch, self.sep_id) for ch in s] + + def decode(self, ids: List[int]) -> str: + return "".join(self.itos[i] for i in ids if 0 <= i < len(self.itos)) + + +class HEDataset(Dataset): + def __init__(self, items: List[Dict[str, str]], vocab: CharVocab, max_len: int): + self.items = items + 
self.vocab = vocab + self.max_len = max_len + self.rows = [] + for ex in items: + ctx = [vocab.bos_id] + vocab.encode(ex["prompt"] + "\n# solution:\n") + tgt = vocab.encode(ex["reference"].rstrip() + "\n") + [vocab.eos_id] + x = (ctx + tgt)[:max_len] + y = x[1:] + [vocab.eos_id] + ctx_len = min(len(ctx), len(x)) + mask = [0] * ctx_len + [1] * (len(x) - ctx_len) + self.rows.append((x, y, mask, ex["reference"], ex["task_id"])) + + def __len__(self): + return len(self.rows) + + def __getitem__(self, idx): + return self.rows[idx] + + +def collate_fn(batch, pad_id: int): + L = max(len(x) for x, _, _, _, _ in batch) + X = [] + Y = [] + M = [] + refs = [] + tids = [] + for x, y, m, ref, tid in batch: + pad = L - len(x) + X.append(x + [pad_id] * pad) + Y.append(y + [pad_id] * pad) + M.append(m + [0] * pad) + refs.append(ref) + tids.append(tid) + return ( + torch.tensor(X, dtype=torch.long), + torch.tensor(Y, dtype=torch.long), + torch.tensor(M, dtype=torch.float32), + refs, + tids, + ) + + +# ----------------------------- +# Model (keep class name SingleLayerGRU) +# ----------------------------- +class SingleLayerGRU(nn.Module): + def __init__(self, vocab_size: int, emb: int = 128, hidden: int = 256): + super().__init__() + self.emb = nn.Embedding(vocab_size, emb, padding_idx=0) + self.rnn = nn.GRU(emb, hidden, num_layers=1, batch_first=True) + self.head = nn.Linear(hidden, vocab_size) + + def forward(self, x): + e = self.emb(x) + h, _ = self.rnn(e) + return self.head(h) # [B, L, V] + + +# ----------------------------- +# Train / Evaluate +# ----------------------------- +def _masked_ce(logits, targets, mask, pad_id: int): + B, L, V = logits.shape + loss = nn.functional.cross_entropy( + logits.reshape(B * L, V), + targets.reshape(B * L), + ignore_index=pad_id, + reduction="none", + ).reshape(B, L) + loss = (loss * mask).sum() / (mask.sum() + 1e-8) + return loss + + +def train(model, train_loader, optimizer, criterion, device): + """Keep function name; internally we use masked CE over target region.""" + model.train() + for epoch in range(3): + tot = 0.0 + steps = 0 + for X, Y, M, _, _ in train_loader: + X, Y, M = X.to(device), Y.to(device), M.to(device) + optimizer.zero_grad() + logits = model(X) + loss = _masked_ce(logits, Y, M, pad_id=0) + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) + optimizer.step() + tot += loss.item() + steps += 1 + print(f"Epoch {epoch+1}: train_loss={tot/max(1,steps):.4f}") + print(f"Training completed over {epoch+1} epochs.") + + +@torch.no_grad() +def _greedy_generate( + model, vocab: CharVocab, prompt: str, device: str = "cpu", max_new_tokens: int = 512 +) -> str: + model.eval() + ctx = [vocab.bos_id] + vocab.encode(prompt + "\n# solution:\n") + x = torch.tensor([ctx], dtype=torch.long, device=device) + out = ctx.copy() + for _ in range(max_new_tokens): + logits = model(x)[:, -1, :] + nid = int(torch.argmax(logits, dim=-1).item()) + out.append(nid) + x = torch.tensor([out], dtype=torch.long, device=device) + if nid == vocab.eos_id: + break + gen = out[len(ctx) :] + return vocab.decode(gen) + + +def _ast_ok(code: str) -> bool: + try: + ast.parse(code) + return True + except SyntaxError: + return False + + +def _undef_refs(code: str) -> int: + try: + tree = ast.parse(code) + except SyntaxError: + return 999 + defined, used = set(), set() + + class V(ast.NodeVisitor): + def visit_FunctionDef(self, node): + defined.add(node.name) + self.generic_visit(node) + + def visit_ClassDef(self, node): + defined.add(node.name) + self.generic_visit(node) + 
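        # The visitor records names bound by assignments, function definitions
        # and class definitions; imported names, function parameters and
        # comprehension targets are not recorded, so code that uses them is
        # over-counted as "undefined". Handlers along these lines would
        # tighten the heuristic (a sketch):
        #
        #     def visit_Import(self, node):
        #         for alias in node.names:
        #             defined.add((alias.asname or alias.name).split(".")[0])
        #
        #     def visit_ImportFrom(self, node):
        #         for alias in node.names:
        #             defined.add(alias.asname or alias.name)
        #
        #     def visit_arguments(self, node):
        #         for a in node.posonlyargs + node.args + node.kwonlyargs:
        #             defined.add(a.arg)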
+ def visit_Assign(self, node): + for t in node.targets: + if hasattr(t, "id"): + defined.add(t.id) + self.generic_visit(node) + + def visit_Name(self, node): + if isinstance(node.ctx, ast.Load): + used.add(node.id) + + V().visit(tree) + ignore = set(dir(__builtins__)) | {"True", "False", "None", "self"} + unresolved = [n for n in used if n not in defined and n not in ignore] + return len(unresolved) + + +def _text_sim(a: str, b: str) -> float: + return difflib.SequenceMatcher(None, a, b).ratio() + + +def _pass1_proxy(gen: str, ref: str) -> int: + return int(gen.strip() == ref.strip()) + + +def evaluate(model, data_loader, device): + """Keep function name; now returns coherence-oriented metrics on the test set.""" + # retrieve vocab from dataset via closure trick + ds = data_loader.dataset + vocab = ds.vocab # type: ignore + items = data_loader.dataset.items # type: ignore + + ast_cnt = 0 + undef_sum = 0 + sim_sum = 0.0 + pass_sum = 0 + n = 0 + gens = [] + + for _, _, _, refs, tids in data_loader: + for ref, tid in zip(refs, tids): + prompt = next(ex["prompt"] for ex in items if ex["task_id"] == tid) + gen = _greedy_generate( + model, vocab, prompt, device=device, max_new_tokens=512 + ) + gens.append({"task_id": tid, "generated": gen, "reference": ref}) + ast_cnt += int(_ast_ok(gen)) + undef_sum += _undef_refs(gen) + sim_sum += _text_sim(gen, ref) + pass_sum += _pass1_proxy(gen, ref) + n += 1 + + metrics = { + "AST_Parse_Rate": ast_cnt / max(1, n), + "UndefinedRef_Avg": undef_sum / max(1, n), + "TextSim_Avg": sim_sum / max(1, n), + "pass@1_proxy": pass_sum / max(1, n), + } + return metrics, gens + + +def main(out_dir): + os.makedirs(out_dir, exist_ok=True) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + train_loader, val_loader, test_loader = load_data() + + # Build model from dataset vocab + vocab = train_loader.dataset.vocab # type: ignore + model = SingleLayerGRU(vocab_size=len(vocab.itos), emb=128, hidden=256).to(device) + optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) + # criterion kept for signature compatibility, not used directly (masked CE inside train()) + criterion = nn.CrossEntropyLoss() + + train(model, train_loader, optimizer, criterion, device) + + # Optional: quick val loop (omitted to keep minimal changes) + + # Evaluate on test + test_metrics, gens = evaluate(model, test_loader, device) + + with open(os.path.join(out_dir, "final_info.json"), "w") as f: + json.dump(test_metrics, f, indent=2) + with open(os.path.join(out_dir, "generations.jsonl"), "w") as f: + for g in gens: + f.write(json.dumps(g) + "\n") + + print(json.dumps(test_metrics, indent=2)) + print("Experiment completed successfully.") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--out_dir", type=str, required=True) + args = parser.parse_args() + main(args.out_dir) diff --git a/frontend/demo_cache/generated/experiments/idea-2/experiment_results.txt b/frontend/demo_cache/generated/experiments/idea-2/experiment_results.txt new file mode 100644 index 00000000..dc333317 --- /dev/null +++ b/frontend/demo_cache/generated/experiments/idea-2/experiment_results.txt @@ -0,0 +1,20 @@ +{ + "run_1": { + "AST_Parse_Rate": 1.0, + "UndefinedRef_Avg": 0.0, + "TextSim_Avg": 0.04661257702428245, + "pass@1_proxy": 0.0 + }, + "run_2": { + "AST_Parse_Rate": 1.0, + "UndefinedRef_Avg": 0.0, + "TextSim_Avg": 0.04661257702428245, + "pass@1_proxy": 0.0 + }, + "run_3": { + "AST_Parse_Rate": 1.0, + "UndefinedRef_Avg": 0.0, + "TextSim_Avg": 
0.04661257702428245, + "pass@1_proxy": 0.0 + } +} \ No newline at end of file diff --git a/frontend/demo_cache/generated/experiments/idea-2/notes.txt b/frontend/demo_cache/generated/experiments/idea-2/notes.txt new file mode 100644 index 00000000..45482693 --- /dev/null +++ b/frontend/demo_cache/generated/experiments/idea-2/notes.txt @@ -0,0 +1,20 @@ +Auto-generated experiment notes +Updated: 2025-11-05T19:24:37.495790Z + +run_1: + - AST_Parse_Rate: 1.0 + - TextSim_Avg: 0.04661257702428245 + - UndefinedRef_Avg: 0.0 + - pass@1_proxy: 0.0 + +run_2: + - AST_Parse_Rate: 1.0 + - TextSim_Avg: 0.04661257702428245 + - UndefinedRef_Avg: 0.0 + - pass@1_proxy: 0.0 + +run_3: + - AST_Parse_Rate: 1.0 + - TextSim_Avg: 0.04661257702428245 + - UndefinedRef_Avg: 0.0 + - pass@1_proxy: 0.0 diff --git a/frontend/demo_cache/generated/experiments/idea-2/run_1.py b/frontend/demo_cache/generated/experiments/idea-2/run_1.py new file mode 100644 index 00000000..bf91bcb8 --- /dev/null +++ b/frontend/demo_cache/generated/experiments/idea-2/run_1.py @@ -0,0 +1,348 @@ +import argparse +import json +import os +import random +import ast +import difflib +from typing import Dict, List, Tuple + +import torch +import torch.nn as nn +from torch.utils.data import DataLoader, Dataset + +try: + from datasets import load_dataset # pip install datasets +except Exception: + load_dataset = None + +SEED = 42 +random.seed(SEED) +torch.manual_seed(SEED) + + +# ----------------------------- +# Data +# ----------------------------- +def _load_humaneval(max_items: int = 164) -> List[Dict[str, str]]: + if load_dataset is None: + raise RuntimeError( + "Please install `datasets` (pip install datasets) to load HumanEval." + ) + dataset = load_dataset("openai_humaneval") + split_name = "test" if "test" in dataset else next(iter(dataset.keys())) + records = dataset[split_name] + items = [] + for i, ex in enumerate(records): + if i >= max_items: + break + prompt = ex.get("prompt", "") + ref = ex.get("canonical_solution", "") + if prompt and ref: + items.append( + { + "task_id": ex.get("task_id", f"HE-{i}"), + "prompt": prompt, + "reference": ref, + } + ) + return items + + +def load_data(): + """Keep function name. 
Return train/val/test DataLoaders for training a char-level GRU LM.""" + items = _load_humaneval() + idx = list(range(len(items))) + random.Random(SEED).shuffle(idx) + n = len(items) + n_tr, n_val = int(0.7 * n), int(0.15 * n) + train_items = [items[i] for i in idx[:n_tr]] + val_items = [items[i] for i in idx[n_tr : n_tr + n_val]] + test_items = [items[i] for i in idx[n_tr + n_val :]] + + # Build vocab on train set (prompts + refs) + vocab = CharVocab( + [ex["prompt"] for ex in train_items] + [ex["reference"] for ex in train_items] + ) + + max_len = 1024 + train_ds = HEDataset(train_items, vocab, max_len=max_len) + val_ds = HEDataset(val_items, vocab, max_len=max_len) + test_ds = HEDataset(test_items, vocab, max_len=max_len) + + train_loader = DataLoader( + train_ds, + batch_size=8, + shuffle=True, + collate_fn=lambda b: collate_fn(b, vocab.pad_id), + ) + val_loader = DataLoader( + val_ds, + batch_size=8, + shuffle=False, + collate_fn=lambda b: collate_fn(b, vocab.pad_id), + ) + test_loader = DataLoader( + test_ds, + batch_size=8, + shuffle=False, + collate_fn=lambda b: collate_fn(b, vocab.pad_id), + ) + + # Return loaders and aux (we keep signature minimal by stashing vocab on dataset) + return train_loader, val_loader, test_loader + + +class CharVocab: + def __init__(self, texts: List[str]): + specials = ["", "", "", ""] + charset = set() + for t in texts: + charset.update(t) + self.itos = specials + sorted(ch for ch in charset if ch not in specials) + self.stoi = {ch: i for i, ch in enumerate(self.itos)} + self.pad_id = self.stoi[""] + self.bos_id = self.stoi[""] + self.eos_id = self.stoi[""] + self.sep_id = self.stoi[""] + + def encode(self, s: str) -> List[int]: + return [self.stoi.get(ch, self.sep_id) for ch in s] + + def decode(self, ids: List[int]) -> str: + return "".join(self.itos[i] for i in ids if 0 <= i < len(self.itos)) + + +class HEDataset(Dataset): + def __init__(self, items: List[Dict[str, str]], vocab: CharVocab, max_len: int): + self.items = items + self.vocab = vocab + self.max_len = max_len + self.rows = [] + for ex in items: + ctx = [vocab.bos_id] + vocab.encode(ex["prompt"] + "\n# solution:\n") + tgt = vocab.encode(ex["reference"].rstrip() + "\n") + [vocab.eos_id] + x = (ctx + tgt)[:max_len] + y = x[1:] + [vocab.eos_id] + ctx_len = min(len(ctx), len(x)) + mask = [0] * ctx_len + [1] * (len(x) - ctx_len) + self.rows.append((x, y, mask, ex["reference"], ex["task_id"])) + + def __len__(self): + return len(self.rows) + + def __getitem__(self, idx): + return self.rows[idx] + + +def collate_fn(batch, pad_id: int): + L = max(len(x) for x, _, _, _, _ in batch) + X = [] + Y = [] + M = [] + refs = [] + tids = [] + for x, y, m, ref, tid in batch: + pad = L - len(x) + X.append(x + [pad_id] * pad) + Y.append(y + [pad_id] * pad) + M.append(m + [0] * pad) + refs.append(ref) + tids.append(tid) + return ( + torch.tensor(X, dtype=torch.long), + torch.tensor(Y, dtype=torch.long), + torch.tensor(M, dtype=torch.float32), + refs, + tids, + ) + + +# ----------------------------- +# Model (keep class name SingleLayerGRU) +# ----------------------------- +class SingleLayerGRU(nn.Module): + def __init__(self, vocab_size: int, emb: int = 128, hidden: int = 256): + super().__init__() + self.emb = nn.Embedding(vocab_size, emb, padding_idx=0) + self.rnn = nn.GRU(emb, hidden, num_layers=1, batch_first=True) + self.head = nn.Linear(hidden, vocab_size) + + def forward(self, x): + e = self.emb(x) + h, _ = self.rnn(e) + return self.head(h) # [B, L, V] + + +# ----------------------------- +# Train / 
Evaluate +# ----------------------------- +def _masked_ce(logits, targets, mask, pad_id: int): + B, L, V = logits.shape + loss = nn.functional.cross_entropy( + logits.reshape(B * L, V), + targets.reshape(B * L), + ignore_index=pad_id, + reduction="none", + ).reshape(B, L) + loss = (loss * mask).sum() / (mask.sum() + 1e-8) + return loss + + +def train(model, train_loader, optimizer, criterion, device): + """Keep function name; internally we use masked CE over target region.""" + model.train() + for epoch in range(3): + tot = 0.0 + steps = 0 + for X, Y, M, _, _ in train_loader: + X, Y, M = X.to(device), Y.to(device), M.to(device) + optimizer.zero_grad() + logits = model(X) + loss = _masked_ce(logits, Y, M, pad_id=0) + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) + optimizer.step() + tot += loss.item() + steps += 1 + print(f"Epoch {epoch+1}: train_loss={tot/max(1,steps):.4f}") + print(f"Training completed over {epoch+1} epochs.") + + +@torch.no_grad() +def _greedy_generate( + model, vocab: CharVocab, prompt: str, device: str = "cpu", max_new_tokens: int = 512 +) -> str: + model.eval() + ctx = [vocab.bos_id] + vocab.encode(prompt + "\n# solution:\n") + x = torch.tensor([ctx], dtype=torch.long, device=device) + out = ctx.copy() + for _ in range(max_new_tokens): + logits = model(x)[:, -1, :] + nid = int(torch.argmax(logits, dim=-1).item()) + out.append(nid) + x = torch.tensor([out], dtype=torch.long, device=device) + if nid == vocab.eos_id: + break + gen = out[len(ctx) :] + return vocab.decode(gen) + + +def _ast_ok(code: str) -> bool: + try: + ast.parse(code) + return True + except SyntaxError: + return False + + +def _undef_refs(code: str) -> int: + try: + tree = ast.parse(code) + except SyntaxError: + return 999 + defined, used = set(), set() + + class V(ast.NodeVisitor): + def visit_FunctionDef(self, node): + defined.add(node.name) + self.generic_visit(node) + + def visit_ClassDef(self, node): + defined.add(node.name) + self.generic_visit(node) + + def visit_Assign(self, node): + for t in node.targets: + if hasattr(t, "id"): + defined.add(t.id) + self.generic_visit(node) + + def visit_Name(self, node): + if isinstance(node.ctx, ast.Load): + used.add(node.id) + + V().visit(tree) + ignore = set(dir(__builtins__)) | {"True", "False", "None", "self"} + unresolved = [n for n in used if n not in defined and n not in ignore] + return len(unresolved) + + +def _text_sim(a: str, b: str) -> float: + return difflib.SequenceMatcher(None, a, b).ratio() + + +def _pass1_proxy(gen: str, ref: str) -> int: + return int(gen.strip() == ref.strip()) + + +def evaluate(model, data_loader, device): + """Keep function name; now returns coherence-oriented metrics on the test set.""" + # retrieve vocab from dataset via closure trick + ds = data_loader.dataset + vocab = ds.vocab # type: ignore + items = data_loader.dataset.items # type: ignore + + ast_cnt = 0 + undef_sum = 0 + sim_sum = 0.0 + pass_sum = 0 + n = 0 + gens = [] + + for _, _, _, refs, tids in data_loader: + for ref, tid in zip(refs, tids): + prompt = next(ex["prompt"] for ex in items if ex["task_id"] == tid) + gen = _greedy_generate( + model, vocab, prompt, device=device, max_new_tokens=512 + ) + gens.append({"task_id": tid, "generated": gen, "reference": ref}) + ast_cnt += int(_ast_ok(gen)) + undef_sum += _undef_refs(gen) + sim_sum += _text_sim(gen, ref) + pass_sum += _pass1_proxy(gen, ref) + n += 1 + + metrics = { + "AST_Parse_Rate": ast_cnt / max(1, n), + "UndefinedRef_Avg": undef_sum / max(1, n), + "TextSim_Avg": sim_sum / 
max(1, n), + "pass@1_proxy": pass_sum / max(1, n), + } + return metrics, gens + + +def main(out_dir): + os.makedirs(out_dir, exist_ok=True) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + train_loader, val_loader, test_loader = load_data() + + # Build model from dataset vocab + vocab = train_loader.dataset.vocab # type: ignore + model = SingleLayerGRU(vocab_size=len(vocab.itos), emb=128, hidden=256).to(device) + optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) + # criterion kept for signature compatibility, not used directly (masked CE inside train()) + criterion = nn.CrossEntropyLoss() + + train(model, train_loader, optimizer, criterion, device) + + # Optional: quick val loop (omitted to keep minimal changes) + + # Evaluate on test + test_metrics, gens = evaluate(model, test_loader, device) + + with open(os.path.join(out_dir, "final_info.json"), "w") as f: + json.dump(test_metrics, f, indent=2) + with open(os.path.join(out_dir, "generations.jsonl"), "w") as f: + for g in gens: + f.write(json.dumps(g) + "\n") + + print(json.dumps(test_metrics, indent=2)) + print("Experiment completed successfully.") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--out_dir", type=str, required=True) + args = parser.parse_args() + main(args.out_dir) diff --git a/frontend/demo_cache/generated/experiments/idea-2/run_1/final_info.json b/frontend/demo_cache/generated/experiments/idea-2/run_1/final_info.json new file mode 100644 index 00000000..da11c8a8 --- /dev/null +++ b/frontend/demo_cache/generated/experiments/idea-2/run_1/final_info.json @@ -0,0 +1,6 @@ +{ + "AST_Parse_Rate": 1.0, + "UndefinedRef_Avg": 0.0, + "TextSim_Avg": 0.04661257702428245, + "pass@1_proxy": 0.0 +} \ No newline at end of file diff --git a/frontend/demo_cache/generated/experiments/idea-2/run_1/notes.txt b/frontend/demo_cache/generated/experiments/idea-2/run_1/notes.txt new file mode 100644 index 00000000..1237f725 --- /dev/null +++ b/frontend/demo_cache/generated/experiments/idea-2/run_1/notes.txt @@ -0,0 +1,7 @@ +Auto-generated notes for run_1 +Updated: 2025-11-05T19:18:52.932715Z + +- AST_Parse_Rate: 1.0 +- TextSim_Avg: 0.04661257702428245 +- UndefinedRef_Avg: 0.0 +- pass@1_proxy: 0.0 diff --git a/frontend/demo_cache/generated/experiments/idea-2/run_2.py b/frontend/demo_cache/generated/experiments/idea-2/run_2.py new file mode 100644 index 00000000..bf91bcb8 --- /dev/null +++ b/frontend/demo_cache/generated/experiments/idea-2/run_2.py @@ -0,0 +1,348 @@ +import argparse +import json +import os +import random +import ast +import difflib +from typing import Dict, List, Tuple + +import torch +import torch.nn as nn +from torch.utils.data import DataLoader, Dataset + +try: + from datasets import load_dataset # pip install datasets +except Exception: + load_dataset = None + +SEED = 42 +random.seed(SEED) +torch.manual_seed(SEED) + + +# ----------------------------- +# Data +# ----------------------------- +def _load_humaneval(max_items: int = 164) -> List[Dict[str, str]]: + if load_dataset is None: + raise RuntimeError( + "Please install `datasets` (pip install datasets) to load HumanEval." 
+ ) + dataset = load_dataset("openai_humaneval") + split_name = "test" if "test" in dataset else next(iter(dataset.keys())) + records = dataset[split_name] + items = [] + for i, ex in enumerate(records): + if i >= max_items: + break + prompt = ex.get("prompt", "") + ref = ex.get("canonical_solution", "") + if prompt and ref: + items.append( + { + "task_id": ex.get("task_id", f"HE-{i}"), + "prompt": prompt, + "reference": ref, + } + ) + return items + + +def load_data(): + """Keep function name. Return train/val/test DataLoaders for training a char-level GRU LM.""" + items = _load_humaneval() + idx = list(range(len(items))) + random.Random(SEED).shuffle(idx) + n = len(items) + n_tr, n_val = int(0.7 * n), int(0.15 * n) + train_items = [items[i] for i in idx[:n_tr]] + val_items = [items[i] for i in idx[n_tr : n_tr + n_val]] + test_items = [items[i] for i in idx[n_tr + n_val :]] + + # Build vocab on train set (prompts + refs) + vocab = CharVocab( + [ex["prompt"] for ex in train_items] + [ex["reference"] for ex in train_items] + ) + + max_len = 1024 + train_ds = HEDataset(train_items, vocab, max_len=max_len) + val_ds = HEDataset(val_items, vocab, max_len=max_len) + test_ds = HEDataset(test_items, vocab, max_len=max_len) + + train_loader = DataLoader( + train_ds, + batch_size=8, + shuffle=True, + collate_fn=lambda b: collate_fn(b, vocab.pad_id), + ) + val_loader = DataLoader( + val_ds, + batch_size=8, + shuffle=False, + collate_fn=lambda b: collate_fn(b, vocab.pad_id), + ) + test_loader = DataLoader( + test_ds, + batch_size=8, + shuffle=False, + collate_fn=lambda b: collate_fn(b, vocab.pad_id), + ) + + # Return loaders and aux (we keep signature minimal by stashing vocab on dataset) + return train_loader, val_loader, test_loader + + +class CharVocab: + def __init__(self, texts: List[str]): + specials = ["", "", "", ""] + charset = set() + for t in texts: + charset.update(t) + self.itos = specials + sorted(ch for ch in charset if ch not in specials) + self.stoi = {ch: i for i, ch in enumerate(self.itos)} + self.pad_id = self.stoi[""] + self.bos_id = self.stoi[""] + self.eos_id = self.stoi[""] + self.sep_id = self.stoi[""] + + def encode(self, s: str) -> List[int]: + return [self.stoi.get(ch, self.sep_id) for ch in s] + + def decode(self, ids: List[int]) -> str: + return "".join(self.itos[i] for i in ids if 0 <= i < len(self.itos)) + + +class HEDataset(Dataset): + def __init__(self, items: List[Dict[str, str]], vocab: CharVocab, max_len: int): + self.items = items + self.vocab = vocab + self.max_len = max_len + self.rows = [] + for ex in items: + ctx = [vocab.bos_id] + vocab.encode(ex["prompt"] + "\n# solution:\n") + tgt = vocab.encode(ex["reference"].rstrip() + "\n") + [vocab.eos_id] + x = (ctx + tgt)[:max_len] + y = x[1:] + [vocab.eos_id] + ctx_len = min(len(ctx), len(x)) + mask = [0] * ctx_len + [1] * (len(x) - ctx_len) + self.rows.append((x, y, mask, ex["reference"], ex["task_id"])) + + def __len__(self): + return len(self.rows) + + def __getitem__(self, idx): + return self.rows[idx] + + +def collate_fn(batch, pad_id: int): + L = max(len(x) for x, _, _, _, _ in batch) + X = [] + Y = [] + M = [] + refs = [] + tids = [] + for x, y, m, ref, tid in batch: + pad = L - len(x) + X.append(x + [pad_id] * pad) + Y.append(y + [pad_id] * pad) + M.append(m + [0] * pad) + refs.append(ref) + tids.append(tid) + return ( + torch.tensor(X, dtype=torch.long), + torch.tensor(Y, dtype=torch.long), + torch.tensor(M, dtype=torch.float32), + refs, + tids, + ) + + +# ----------------------------- +# Model (keep class 
name SingleLayerGRU) +# ----------------------------- +class SingleLayerGRU(nn.Module): + def __init__(self, vocab_size: int, emb: int = 128, hidden: int = 256): + super().__init__() + self.emb = nn.Embedding(vocab_size, emb, padding_idx=0) + self.rnn = nn.GRU(emb, hidden, num_layers=1, batch_first=True) + self.head = nn.Linear(hidden, vocab_size) + + def forward(self, x): + e = self.emb(x) + h, _ = self.rnn(e) + return self.head(h) # [B, L, V] + + +# ----------------------------- +# Train / Evaluate +# ----------------------------- +def _masked_ce(logits, targets, mask, pad_id: int): + B, L, V = logits.shape + loss = nn.functional.cross_entropy( + logits.reshape(B * L, V), + targets.reshape(B * L), + ignore_index=pad_id, + reduction="none", + ).reshape(B, L) + loss = (loss * mask).sum() / (mask.sum() + 1e-8) + return loss + + +def train(model, train_loader, optimizer, criterion, device): + """Keep function name; internally we use masked CE over target region.""" + model.train() + for epoch in range(3): + tot = 0.0 + steps = 0 + for X, Y, M, _, _ in train_loader: + X, Y, M = X.to(device), Y.to(device), M.to(device) + optimizer.zero_grad() + logits = model(X) + loss = _masked_ce(logits, Y, M, pad_id=0) + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) + optimizer.step() + tot += loss.item() + steps += 1 + print(f"Epoch {epoch+1}: train_loss={tot/max(1,steps):.4f}") + print(f"Training completed over {epoch+1} epochs.") + + +@torch.no_grad() +def _greedy_generate( + model, vocab: CharVocab, prompt: str, device: str = "cpu", max_new_tokens: int = 512 +) -> str: + model.eval() + ctx = [vocab.bos_id] + vocab.encode(prompt + "\n# solution:\n") + x = torch.tensor([ctx], dtype=torch.long, device=device) + out = ctx.copy() + for _ in range(max_new_tokens): + logits = model(x)[:, -1, :] + nid = int(torch.argmax(logits, dim=-1).item()) + out.append(nid) + x = torch.tensor([out], dtype=torch.long, device=device) + if nid == vocab.eos_id: + break + gen = out[len(ctx) :] + return vocab.decode(gen) + + +def _ast_ok(code: str) -> bool: + try: + ast.parse(code) + return True + except SyntaxError: + return False + + +def _undef_refs(code: str) -> int: + try: + tree = ast.parse(code) + except SyntaxError: + return 999 + defined, used = set(), set() + + class V(ast.NodeVisitor): + def visit_FunctionDef(self, node): + defined.add(node.name) + self.generic_visit(node) + + def visit_ClassDef(self, node): + defined.add(node.name) + self.generic_visit(node) + + def visit_Assign(self, node): + for t in node.targets: + if hasattr(t, "id"): + defined.add(t.id) + self.generic_visit(node) + + def visit_Name(self, node): + if isinstance(node.ctx, ast.Load): + used.add(node.id) + + V().visit(tree) + ignore = set(dir(__builtins__)) | {"True", "False", "None", "self"} + unresolved = [n for n in used if n not in defined and n not in ignore] + return len(unresolved) + + +def _text_sim(a: str, b: str) -> float: + return difflib.SequenceMatcher(None, a, b).ratio() + + +def _pass1_proxy(gen: str, ref: str) -> int: + return int(gen.strip() == ref.strip()) + + +def evaluate(model, data_loader, device): + """Keep function name; now returns coherence-oriented metrics on the test set.""" + # retrieve vocab from dataset via closure trick + ds = data_loader.dataset + vocab = ds.vocab # type: ignore + items = data_loader.dataset.items # type: ignore + + ast_cnt = 0 + undef_sum = 0 + sim_sum = 0.0 + pass_sum = 0 + n = 0 + gens = [] + + for _, _, _, refs, tids in data_loader: + for ref, tid in zip(refs, tids): + 
prompt = next(ex["prompt"] for ex in items if ex["task_id"] == tid) + gen = _greedy_generate( + model, vocab, prompt, device=device, max_new_tokens=512 + ) + gens.append({"task_id": tid, "generated": gen, "reference": ref}) + ast_cnt += int(_ast_ok(gen)) + undef_sum += _undef_refs(gen) + sim_sum += _text_sim(gen, ref) + pass_sum += _pass1_proxy(gen, ref) + n += 1 + + metrics = { + "AST_Parse_Rate": ast_cnt / max(1, n), + "UndefinedRef_Avg": undef_sum / max(1, n), + "TextSim_Avg": sim_sum / max(1, n), + "pass@1_proxy": pass_sum / max(1, n), + } + return metrics, gens + + +def main(out_dir): + os.makedirs(out_dir, exist_ok=True) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + train_loader, val_loader, test_loader = load_data() + + # Build model from dataset vocab + vocab = train_loader.dataset.vocab # type: ignore + model = SingleLayerGRU(vocab_size=len(vocab.itos), emb=128, hidden=256).to(device) + optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) + # criterion kept for signature compatibility, not used directly (masked CE inside train()) + criterion = nn.CrossEntropyLoss() + + train(model, train_loader, optimizer, criterion, device) + + # Optional: quick val loop (omitted to keep minimal changes) + + # Evaluate on test + test_metrics, gens = evaluate(model, test_loader, device) + + with open(os.path.join(out_dir, "final_info.json"), "w") as f: + json.dump(test_metrics, f, indent=2) + with open(os.path.join(out_dir, "generations.jsonl"), "w") as f: + for g in gens: + f.write(json.dumps(g) + "\n") + + print(json.dumps(test_metrics, indent=2)) + print("Experiment completed successfully.") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--out_dir", type=str, required=True) + args = parser.parse_args() + main(args.out_dir) diff --git a/frontend/demo_cache/generated/experiments/idea-2/run_2/final_info.json b/frontend/demo_cache/generated/experiments/idea-2/run_2/final_info.json new file mode 100644 index 00000000..da11c8a8 --- /dev/null +++ b/frontend/demo_cache/generated/experiments/idea-2/run_2/final_info.json @@ -0,0 +1,6 @@ +{ + "AST_Parse_Rate": 1.0, + "UndefinedRef_Avg": 0.0, + "TextSim_Avg": 0.04661257702428245, + "pass@1_proxy": 0.0 +} \ No newline at end of file diff --git a/frontend/demo_cache/generated/experiments/idea-2/run_2/notes.txt b/frontend/demo_cache/generated/experiments/idea-2/run_2/notes.txt new file mode 100644 index 00000000..12889088 --- /dev/null +++ b/frontend/demo_cache/generated/experiments/idea-2/run_2/notes.txt @@ -0,0 +1,7 @@ +Auto-generated notes for run_2 +Updated: 2025-11-05T19:21:47.358039Z + +- AST_Parse_Rate: 1.0 +- TextSim_Avg: 0.04661257702428245 +- UndefinedRef_Avg: 0.0 +- pass@1_proxy: 0.0 diff --git a/frontend/demo_cache/generated/experiments/idea-2/run_3.py b/frontend/demo_cache/generated/experiments/idea-2/run_3.py new file mode 100644 index 00000000..bf91bcb8 --- /dev/null +++ b/frontend/demo_cache/generated/experiments/idea-2/run_3.py @@ -0,0 +1,348 @@ +import argparse +import json +import os +import random +import ast +import difflib +from typing import Dict, List, Tuple + +import torch +import torch.nn as nn +from torch.utils.data import DataLoader, Dataset + +try: + from datasets import load_dataset # pip install datasets +except Exception: + load_dataset = None + +SEED = 42 +random.seed(SEED) +torch.manual_seed(SEED) + + +# ----------------------------- +# Data +# ----------------------------- +def _load_humaneval(max_items: int = 164) -> List[Dict[str, str]]: + if 
load_dataset is None: + raise RuntimeError( + "Please install `datasets` (pip install datasets) to load HumanEval." + ) + dataset = load_dataset("openai_humaneval") + split_name = "test" if "test" in dataset else next(iter(dataset.keys())) + records = dataset[split_name] + items = [] + for i, ex in enumerate(records): + if i >= max_items: + break + prompt = ex.get("prompt", "") + ref = ex.get("canonical_solution", "") + if prompt and ref: + items.append( + { + "task_id": ex.get("task_id", f"HE-{i}"), + "prompt": prompt, + "reference": ref, + } + ) + return items + + +def load_data(): + """Keep function name. Return train/val/test DataLoaders for training a char-level GRU LM.""" + items = _load_humaneval() + idx = list(range(len(items))) + random.Random(SEED).shuffle(idx) + n = len(items) + n_tr, n_val = int(0.7 * n), int(0.15 * n) + train_items = [items[i] for i in idx[:n_tr]] + val_items = [items[i] for i in idx[n_tr : n_tr + n_val]] + test_items = [items[i] for i in idx[n_tr + n_val :]] + + # Build vocab on train set (prompts + refs) + vocab = CharVocab( + [ex["prompt"] for ex in train_items] + [ex["reference"] for ex in train_items] + ) + + max_len = 1024 + train_ds = HEDataset(train_items, vocab, max_len=max_len) + val_ds = HEDataset(val_items, vocab, max_len=max_len) + test_ds = HEDataset(test_items, vocab, max_len=max_len) + + train_loader = DataLoader( + train_ds, + batch_size=8, + shuffle=True, + collate_fn=lambda b: collate_fn(b, vocab.pad_id), + ) + val_loader = DataLoader( + val_ds, + batch_size=8, + shuffle=False, + collate_fn=lambda b: collate_fn(b, vocab.pad_id), + ) + test_loader = DataLoader( + test_ds, + batch_size=8, + shuffle=False, + collate_fn=lambda b: collate_fn(b, vocab.pad_id), + ) + + # Return loaders and aux (we keep signature minimal by stashing vocab on dataset) + return train_loader, val_loader, test_loader + + +class CharVocab: + def __init__(self, texts: List[str]): + specials = ["", "", "", ""] + charset = set() + for t in texts: + charset.update(t) + self.itos = specials + sorted(ch for ch in charset if ch not in specials) + self.stoi = {ch: i for i, ch in enumerate(self.itos)} + self.pad_id = self.stoi[""] + self.bos_id = self.stoi[""] + self.eos_id = self.stoi[""] + self.sep_id = self.stoi[""] + + def encode(self, s: str) -> List[int]: + return [self.stoi.get(ch, self.sep_id) for ch in s] + + def decode(self, ids: List[int]) -> str: + return "".join(self.itos[i] for i in ids if 0 <= i < len(self.itos)) + + +class HEDataset(Dataset): + def __init__(self, items: List[Dict[str, str]], vocab: CharVocab, max_len: int): + self.items = items + self.vocab = vocab + self.max_len = max_len + self.rows = [] + for ex in items: + ctx = [vocab.bos_id] + vocab.encode(ex["prompt"] + "\n# solution:\n") + tgt = vocab.encode(ex["reference"].rstrip() + "\n") + [vocab.eos_id] + x = (ctx + tgt)[:max_len] + y = x[1:] + [vocab.eos_id] + ctx_len = min(len(ctx), len(x)) + mask = [0] * ctx_len + [1] * (len(x) - ctx_len) + self.rows.append((x, y, mask, ex["reference"], ex["task_id"])) + + def __len__(self): + return len(self.rows) + + def __getitem__(self, idx): + return self.rows[idx] + + +def collate_fn(batch, pad_id: int): + L = max(len(x) for x, _, _, _, _ in batch) + X = [] + Y = [] + M = [] + refs = [] + tids = [] + for x, y, m, ref, tid in batch: + pad = L - len(x) + X.append(x + [pad_id] * pad) + Y.append(y + [pad_id] * pad) + M.append(m + [0] * pad) + refs.append(ref) + tids.append(tid) + return ( + torch.tensor(X, dtype=torch.long), + torch.tensor(Y, dtype=torch.long), + 
torch.tensor(M, dtype=torch.float32), + refs, + tids, + ) + + +# ----------------------------- +# Model (keep class name SingleLayerGRU) +# ----------------------------- +class SingleLayerGRU(nn.Module): + def __init__(self, vocab_size: int, emb: int = 128, hidden: int = 256): + super().__init__() + self.emb = nn.Embedding(vocab_size, emb, padding_idx=0) + self.rnn = nn.GRU(emb, hidden, num_layers=1, batch_first=True) + self.head = nn.Linear(hidden, vocab_size) + + def forward(self, x): + e = self.emb(x) + h, _ = self.rnn(e) + return self.head(h) # [B, L, V] + + +# ----------------------------- +# Train / Evaluate +# ----------------------------- +def _masked_ce(logits, targets, mask, pad_id: int): + B, L, V = logits.shape + loss = nn.functional.cross_entropy( + logits.reshape(B * L, V), + targets.reshape(B * L), + ignore_index=pad_id, + reduction="none", + ).reshape(B, L) + loss = (loss * mask).sum() / (mask.sum() + 1e-8) + return loss + + +def train(model, train_loader, optimizer, criterion, device): + """Keep function name; internally we use masked CE over target region.""" + model.train() + for epoch in range(3): + tot = 0.0 + steps = 0 + for X, Y, M, _, _ in train_loader: + X, Y, M = X.to(device), Y.to(device), M.to(device) + optimizer.zero_grad() + logits = model(X) + loss = _masked_ce(logits, Y, M, pad_id=0) + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) + optimizer.step() + tot += loss.item() + steps += 1 + print(f"Epoch {epoch+1}: train_loss={tot/max(1,steps):.4f}") + print(f"Training completed over {epoch+1} epochs.") + + +@torch.no_grad() +def _greedy_generate( + model, vocab: CharVocab, prompt: str, device: str = "cpu", max_new_tokens: int = 512 +) -> str: + model.eval() + ctx = [vocab.bos_id] + vocab.encode(prompt + "\n# solution:\n") + x = torch.tensor([ctx], dtype=torch.long, device=device) + out = ctx.copy() + for _ in range(max_new_tokens): + logits = model(x)[:, -1, :] + nid = int(torch.argmax(logits, dim=-1).item()) + out.append(nid) + x = torch.tensor([out], dtype=torch.long, device=device) + if nid == vocab.eos_id: + break + gen = out[len(ctx) :] + return vocab.decode(gen) + + +def _ast_ok(code: str) -> bool: + try: + ast.parse(code) + return True + except SyntaxError: + return False + + +def _undef_refs(code: str) -> int: + try: + tree = ast.parse(code) + except SyntaxError: + return 999 + defined, used = set(), set() + + class V(ast.NodeVisitor): + def visit_FunctionDef(self, node): + defined.add(node.name) + self.generic_visit(node) + + def visit_ClassDef(self, node): + defined.add(node.name) + self.generic_visit(node) + + def visit_Assign(self, node): + for t in node.targets: + if hasattr(t, "id"): + defined.add(t.id) + self.generic_visit(node) + + def visit_Name(self, node): + if isinstance(node.ctx, ast.Load): + used.add(node.id) + + V().visit(tree) + ignore = set(dir(__builtins__)) | {"True", "False", "None", "self"} + unresolved = [n for n in used if n not in defined and n not in ignore] + return len(unresolved) + + +def _text_sim(a: str, b: str) -> float: + return difflib.SequenceMatcher(None, a, b).ratio() + + +def _pass1_proxy(gen: str, ref: str) -> int: + return int(gen.strip() == ref.strip()) + + +def evaluate(model, data_loader, device): + """Keep function name; now returns coherence-oriented metrics on the test set.""" + # retrieve vocab from dataset via closure trick + ds = data_loader.dataset + vocab = ds.vocab # type: ignore + items = data_loader.dataset.items # type: ignore + + ast_cnt = 0 + undef_sum = 0 + sim_sum = 0.0 
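+ # Note: "pass@1_proxy" below is a strict exact-match comparison against the reference (see _pass1_proxy), not an execution-based pass@1.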
+ pass_sum = 0 + n = 0 + gens = [] + + for _, _, _, refs, tids in data_loader: + for ref, tid in zip(refs, tids): + prompt = next(ex["prompt"] for ex in items if ex["task_id"] == tid) + gen = _greedy_generate( + model, vocab, prompt, device=device, max_new_tokens=512 + ) + gens.append({"task_id": tid, "generated": gen, "reference": ref}) + ast_cnt += int(_ast_ok(gen)) + undef_sum += _undef_refs(gen) + sim_sum += _text_sim(gen, ref) + pass_sum += _pass1_proxy(gen, ref) + n += 1 + + metrics = { + "AST_Parse_Rate": ast_cnt / max(1, n), + "UndefinedRef_Avg": undef_sum / max(1, n), + "TextSim_Avg": sim_sum / max(1, n), + "pass@1_proxy": pass_sum / max(1, n), + } + return metrics, gens + + +def main(out_dir): + os.makedirs(out_dir, exist_ok=True) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + train_loader, val_loader, test_loader = load_data() + + # Build model from dataset vocab + vocab = train_loader.dataset.vocab # type: ignore + model = SingleLayerGRU(vocab_size=len(vocab.itos), emb=128, hidden=256).to(device) + optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) + # criterion kept for signature compatibility, not used directly (masked CE inside train()) + criterion = nn.CrossEntropyLoss() + + train(model, train_loader, optimizer, criterion, device) + + # Optional: quick val loop (omitted to keep minimal changes) + + # Evaluate on test + test_metrics, gens = evaluate(model, test_loader, device) + + with open(os.path.join(out_dir, "final_info.json"), "w") as f: + json.dump(test_metrics, f, indent=2) + with open(os.path.join(out_dir, "generations.jsonl"), "w") as f: + for g in gens: + f.write(json.dumps(g) + "\n") + + print(json.dumps(test_metrics, indent=2)) + print("Experiment completed successfully.") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--out_dir", type=str, required=True) + args = parser.parse_args() + main(args.out_dir) diff --git a/frontend/demo_cache/generated/experiments/idea-2/run_3/final_info.json b/frontend/demo_cache/generated/experiments/idea-2/run_3/final_info.json new file mode 100644 index 00000000..da11c8a8 --- /dev/null +++ b/frontend/demo_cache/generated/experiments/idea-2/run_3/final_info.json @@ -0,0 +1,6 @@ +{ + "AST_Parse_Rate": 1.0, + "UndefinedRef_Avg": 0.0, + "TextSim_Avg": 0.04661257702428245, + "pass@1_proxy": 0.0 +} \ No newline at end of file diff --git a/frontend/demo_cache/generated/experiments/idea-2/run_3/notes.txt b/frontend/demo_cache/generated/experiments/idea-2/run_3/notes.txt new file mode 100644 index 00000000..64e98bbd --- /dev/null +++ b/frontend/demo_cache/generated/experiments/idea-2/run_3/notes.txt @@ -0,0 +1,7 @@ +Auto-generated notes for run_3 +Updated: 2025-11-05T19:24:37.494845Z + +- AST_Parse_Rate: 1.0 +- TextSim_Avg: 0.04661257702428245 +- UndefinedRef_Avg: 0.0 +- pass@1_proxy: 0.0 diff --git a/frontend/demo_cache/generated/experiments/idea-3/experiment.py b/frontend/demo_cache/generated/experiments/idea-3/experiment.py new file mode 100644 index 00000000..2a2b54ba --- /dev/null +++ b/frontend/demo_cache/generated/experiments/idea-3/experiment.py @@ -0,0 +1,381 @@ +#!/usr/bin/env python3 +""" +Lightweight HumanEval experiment for idea-3: +- Train a single-layer GRU language model on prompt→solution pairs. +- Report BLEU and ROUGE-L on the held-out test split. 
+""" + +import argparse +import json +import math +import os +import random +from collections import Counter +from typing import Any, Dict, List, Sequence, Tuple + +import torch +import torch.nn as nn +from torch.utils.data import DataLoader, Dataset + +try: + from datasets import load_dataset # pip install datasets +except Exception: + load_dataset = None + +SEED = 42 +random.seed(SEED) +torch.manual_seed(SEED) + + +# --------------------------------------------------------------------------- +# Data loading / preprocessing +# --------------------------------------------------------------------------- +def load_humaneval(max_items: int = 164) -> List[Dict[str, str]]: + if load_dataset is None: + raise RuntimeError( + "Please install `datasets` (pip install datasets) to load HumanEval." + ) + dataset = load_dataset("openai_humaneval") + split_name = "test" if "test" in dataset else next(iter(dataset.keys())) + records = dataset[split_name] + tasks: List[Dict[str, str]] = [] + for i, ex in enumerate(records): + if i >= max_items: + break + prompt = ex.get("prompt", "") + reference = ex.get("canonical_solution", "") + if prompt and reference: + tasks.append( + { + "task_id": ex.get("task_id", f"HE-{i}"), + "prompt": prompt, + "reference": reference, + } + ) + if not tasks: + raise RuntimeError("HumanEval dataset yielded no usable tasks.") + return tasks + + +def pseudo_split(items: Sequence[Dict[str, str]]) -> Tuple[List, List, List]: + idx = list(range(len(items))) + random.Random(SEED).shuffle(idx) + n = len(items) + n_train = int(0.7 * n) + n_val = int(0.15 * n) + train = [items[i] for i in idx[:n_train]] + val = [items[i] for i in idx[n_train : n_train + n_val]] + test = [items[i] for i in idx[n_train + n_val :]] + return train, val, test + + +class CharVocab: + def __init__(self, texts: Sequence[str]): + specials = ["", "", "", ""] + charset = set() + for t in texts: + charset.update(t) + self.itos = specials + sorted(ch for ch in charset if ch not in specials) + self.stoi = {ch: i for i, ch in enumerate(self.itos)} + self.pad_id = self.stoi[""] + self.bos_id = self.stoi[""] + self.eos_id = self.stoi[""] + self.unk_id = self.stoi[""] + + def encode(self, text: str) -> List[int]: + return [self.stoi.get(ch, self.unk_id) for ch in text] + + def decode(self, ids: Sequence[int]) -> str: + return "".join(self.itos[i] for i in ids if 0 <= i < len(self.itos)) + + +class HEDataset(Dataset): + def __init__( + self, + items: Sequence[Dict[str, str]], + vocab: CharVocab, + max_len: int = 512, + ): + self.vocab = vocab + self.rows: List[Tuple[List[int], List[int], List[int], str, str, str]] = [] + for ex in items: + prompt = ex["prompt"] + ref = ex["reference"].rstrip() + "\n" + ctx = [vocab.bos_id] + vocab.encode(prompt + "\n# solution:\n") + tgt = vocab.encode(ref) + [vocab.eos_id] + x = (ctx + tgt)[:max_len] + y = x[1:] + [vocab.eos_id] + ctx_len = min(len(ctx), len(x)) + mask = [0] * ctx_len + [1] * (len(x) - ctx_len) + self.rows.append((x, y, mask, ref, ex["task_id"], prompt)) + + def __len__(self) -> int: + return len(self.rows) + + def __getitem__(self, idx: int): + return self.rows[idx] + + +def collate_fn(batch, pad_id: int): + length = max(len(x) for x, _, _, _, _, _ in batch) + xs, ys, masks, refs, tids, prompts = [], [], [], [], [], [] + for x, y, m, ref, tid, prompt in batch: + pad = length - len(x) + xs.append(x + [pad_id] * pad) + ys.append(y + [pad_id] * pad) + masks.append(m + [0] * pad) + refs.append(ref) + tids.append(tid) + prompts.append(prompt) + return ( + torch.tensor(xs, 
dtype=torch.long), + torch.tensor(ys, dtype=torch.long), + torch.tensor(masks, dtype=torch.float32), + refs, + tids, + prompts, + ) + + +# --------------------------------------------------------------------------- +# Model +# --------------------------------------------------------------------------- +class SingleLayerGRU(nn.Module): + def __init__(self, vocab_size: int, emb: int = 256, hidden: int = 64): + super().__init__() + self.emb = nn.Embedding(vocab_size, emb, padding_idx=0) + self.rnn = nn.GRU(emb, hidden, num_layers=1, batch_first=True) + self.head = nn.Linear(hidden, vocab_size) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + emb = self.emb(x) + h, _ = self.rnn(emb) + return self.head(h) + + +def masked_ce(logits, targets, mask, pad_id: int) -> torch.Tensor: + B, L, V = logits.shape + loss = nn.functional.cross_entropy( + logits.reshape(B * L, V), + targets.reshape(B * L), + ignore_index=pad_id, + reduction="none", + ).reshape(B, L) + return (loss * mask).sum() / (mask.sum() + 1e-8) + + +@torch.no_grad() +def greedy_decode( + model: nn.Module, + vocab: CharVocab, + prompt: str, + max_new: int, + device: torch.device, +) -> str: + model.eval() + ctx = [vocab.bos_id] + vocab.encode(prompt + "\n# solution:\n") + x = torch.tensor([ctx], dtype=torch.long, device=device) + out = ctx.copy() + for _ in range(max_new): + logits = model(x)[:, -1, :] + nid = int(torch.argmax(logits, dim=-1).item()) + out.append(nid) + x = torch.tensor([out], dtype=torch.long, device=device) + if nid == vocab.eos_id: + break + return vocab.decode(out[len(ctx) :]) + + +# --------------------------------------------------------------------------- +# Metrics +# --------------------------------------------------------------------------- +def _ngram_counts(tokens: Sequence[str], max_order: int) -> Counter: + counts: Counter = Counter() + for order in range(1, max_order + 1): + for i in range(len(tokens) - order + 1): + counts[tuple(tokens[i : i + order])] += 1 + return counts + + +def compute_bleu(reference: str, candidate: str, max_order: int = 4) -> float: + ref_tokens = reference.strip().split() + cand_tokens = candidate.strip().split() + if not cand_tokens: + return 0.0 + ref_counts = _ngram_counts(ref_tokens, max_order) + cand_counts = _ngram_counts(cand_tokens, max_order) + matches_by_order = [0] * max_order + possible_matches_by_order = [0] * max_order + for ngram, count in cand_counts.items(): + matches_by_order[len(ngram) - 1] += min(count, ref_counts.get(ngram, 0)) + for order in range(1, max_order + 1): + possible_matches_by_order[order - 1] = max( + 0, len(cand_tokens) - order + 1 + ) + precisions = [] + for match, possible in zip(matches_by_order, possible_matches_by_order): + precisions.append((match + 1) / (possible + 1)) + geo_mean = math.exp(sum(math.log(p) for p in precisions) / max_order) + ref_len = len(ref_tokens) + cand_len = len(cand_tokens) + if cand_len == 0: + return 0.0 + ratio = cand_len / (ref_len + 1e-8) + brevity = 1.0 if ratio > 1.0 else math.exp(1.0 - 1.0 / (ratio + 1e-8)) + return float(geo_mean * brevity) + + +def lcs_length(a: Sequence[str], b: Sequence[str]) -> int: + if not a or not b: + return 0 + prev = [0] * (len(b) + 1) + for token in a: + curr = [0] * (len(b) + 1) + for j, tok_b in enumerate(b, start=1): + if token == tok_b: + curr[j] = prev[j - 1] + 1 + else: + curr[j] = max(prev[j], curr[j - 1]) + prev = curr + return prev[-1] + + +def compute_rouge_l(reference: str, candidate: str) -> float: + ref_tokens = reference.strip().split() + cand_tokens = 
candidate.strip().split() + if not ref_tokens or not cand_tokens: + return 0.0 + lcs = lcs_length(ref_tokens, cand_tokens) + precision = lcs / len(cand_tokens) + recall = lcs / len(ref_tokens) + if precision + recall == 0: + return 0.0 + return (2 * precision * recall) / (precision + recall) + + +# --------------------------------------------------------------------------- +# Train / evaluate +# --------------------------------------------------------------------------- +def train_model( + model: nn.Module, + train_loader: DataLoader, + optimizer: torch.optim.Optimizer, + device: torch.device, + pad_id: int, + epochs: int, +) -> None: + for epoch in range(1, epochs + 1): + model.train() + total = 0.0 + steps = 0 + for X, Y, M, *_ in train_loader: + X, Y, M = X.to(device), Y.to(device), M.to(device) + optimizer.zero_grad() + logits = model(X) + loss = masked_ce(logits, Y, M, pad_id) + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) + optimizer.step() + total += float(loss.item()) + steps += 1 + print(f"[Epoch {epoch}] train_loss={total / max(1, steps):.4f}") + + +@torch.no_grad() +def run_eval( + model: nn.Module, + loader: DataLoader, + vocab: CharVocab, + device: torch.device, + decode_max: int, +) -> Tuple[Dict[str, Any], List[Dict[str, str]]]: + model.eval() + bleu_sum = 0.0 + rouge_sum = 0.0 + gens: List[Dict[str, str]] = [] + count = 0 + for _, _, _, refs, tids, prompts in loader: + for ref, tid, prompt in zip(refs, tids, prompts): + gen = greedy_decode(model, vocab, prompt, decode_max, device) + gens.append({"task_id": tid, "generated": gen, "reference": ref}) + bleu_sum += compute_bleu(ref, gen) + rouge_sum += compute_rouge_l(ref, gen) + count += 1 + metrics = { + "BLEU": bleu_sum / max(1, count), + "ROUGE_L": rouge_sum / max(1, count), + "Samples": count, + } + return metrics, gens + + +def main( + out_dir: str, + epochs: int, + batch_size: int, + lr: float, + emb: int, + hidden: int, + max_len: int, + decode_max: int, + max_items: int, +): + os.makedirs(out_dir, exist_ok=True) + items = load_humaneval(max_items=max_items) + train_items, val_items, test_items = pseudo_split(items) + + vocab = CharVocab( + [ex["prompt"] for ex in train_items] + [ex["reference"] for ex in train_items] + ) + train_ds = HEDataset(train_items, vocab, max_len=max_len) + val_ds = HEDataset(val_items, vocab, max_len=max_len) + test_ds = HEDataset(test_items, vocab, max_len=max_len) + + coll = lambda batch: collate_fn(batch, vocab.pad_id) + train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=coll) + val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False, collate_fn=coll) + test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False, collate_fn=coll) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model = SingleLayerGRU(len(vocab.itos), emb=emb, hidden=hidden).to(device) + optimizer = torch.optim.Adam(model.parameters(), lr=lr) + + train_model(model, train_loader, optimizer, device, vocab.pad_id, epochs) + + # quick validation monitoring + with torch.no_grad(): + val_metrics, _ = run_eval(model, val_loader, vocab, device, decode_max) + print(f"[Validation] BLEU={val_metrics['BLEU']:.4f} ROUGE_L={val_metrics['ROUGE_L']:.4f}") + + test_metrics, gens = run_eval(model, test_loader, vocab, device, decode_max) + with open(os.path.join(out_dir, "final_info.json"), "w") as f: + json.dump(test_metrics, f, indent=2) + with open(os.path.join(out_dir, "generations.jsonl"), "w") as f: + for row in gens: + 
f.write(json.dumps(row) + "\n") + print(json.dumps(test_metrics, indent=2)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--out_dir", type=str, required=True) + parser.add_argument("--epochs", type=int, default=3) + parser.add_argument("--batch_size", type=int, default=8) + parser.add_argument("--lr", type=float, default=1e-3) + parser.add_argument("--emb", type=int, default=256) + parser.add_argument("--hidden", type=int, default=64) + parser.add_argument("--max_len", type=int, default=512) + parser.add_argument("--decode_max", type=int, default=512) + parser.add_argument("--max_items", type=int, default=164) + args = parser.parse_args() + main( + args.out_dir, + args.epochs, + args.batch_size, + args.lr, + args.emb, + args.hidden, + args.max_len, + args.decode_max, + args.max_items, + ) diff --git a/frontend/demo_cache/generated/experiments/idea-3/experiment_results.txt b/frontend/demo_cache/generated/experiments/idea-3/experiment_results.txt new file mode 100644 index 00000000..85285396 --- /dev/null +++ b/frontend/demo_cache/generated/experiments/idea-3/experiment_results.txt @@ -0,0 +1,17 @@ +{ + "run_1": { + "BLEU": 0.0, + "ROUGE_L": 0.0, + "Samples": 26 + }, + "run_2": { + "BLEU": 0.0, + "ROUGE_L": 0.0, + "Samples": 26 + }, + "run_3": { + "BLEU": 0.0, + "ROUGE_L": 0.0, + "Samples": 26 + } +} \ No newline at end of file diff --git a/frontend/demo_cache/generated/experiments/idea-3/notes.txt b/frontend/demo_cache/generated/experiments/idea-3/notes.txt new file mode 100644 index 00000000..b2af560b --- /dev/null +++ b/frontend/demo_cache/generated/experiments/idea-3/notes.txt @@ -0,0 +1,17 @@ +Auto-generated experiment notes +Updated: 2025-11-05T19:32:12.381403Z + +run_1: + - BLEU: 0.0 + - ROUGE_L: 0.0 + - Samples: 26 + +run_2: + - BLEU: 0.0 + - ROUGE_L: 0.0 + - Samples: 26 + +run_3: + - BLEU: 0.0 + - ROUGE_L: 0.0 + - Samples: 26 diff --git a/frontend/demo_cache/generated/experiments/idea-3/run_1.py b/frontend/demo_cache/generated/experiments/idea-3/run_1.py new file mode 100644 index 00000000..2a2b54ba --- /dev/null +++ b/frontend/demo_cache/generated/experiments/idea-3/run_1.py @@ -0,0 +1,381 @@ +#!/usr/bin/env python3 +""" +Lightweight HumanEval experiment for idea-3: +- Train a single-layer GRU language model on prompt→solution pairs. +- Report BLEU and ROUGE-L on the held-out test split. +""" + +import argparse +import json +import math +import os +import random +from collections import Counter +from typing import Any, Dict, List, Sequence, Tuple + +import torch +import torch.nn as nn +from torch.utils.data import DataLoader, Dataset + +try: + from datasets import load_dataset # pip install datasets +except Exception: + load_dataset = None + +SEED = 42 +random.seed(SEED) +torch.manual_seed(SEED) + + +# --------------------------------------------------------------------------- +# Data loading / preprocessing +# --------------------------------------------------------------------------- +def load_humaneval(max_items: int = 164) -> List[Dict[str, str]]: + if load_dataset is None: + raise RuntimeError( + "Please install `datasets` (pip install datasets) to load HumanEval." 
+ ) + dataset = load_dataset("openai_humaneval") + split_name = "test" if "test" in dataset else next(iter(dataset.keys())) + records = dataset[split_name] + tasks: List[Dict[str, str]] = [] + for i, ex in enumerate(records): + if i >= max_items: + break + prompt = ex.get("prompt", "") + reference = ex.get("canonical_solution", "") + if prompt and reference: + tasks.append( + { + "task_id": ex.get("task_id", f"HE-{i}"), + "prompt": prompt, + "reference": reference, + } + ) + if not tasks: + raise RuntimeError("HumanEval dataset yielded no usable tasks.") + return tasks + + +def pseudo_split(items: Sequence[Dict[str, str]]) -> Tuple[List, List, List]: + idx = list(range(len(items))) + random.Random(SEED).shuffle(idx) + n = len(items) + n_train = int(0.7 * n) + n_val = int(0.15 * n) + train = [items[i] for i in idx[:n_train]] + val = [items[i] for i in idx[n_train : n_train + n_val]] + test = [items[i] for i in idx[n_train + n_val :]] + return train, val, test + + +class CharVocab: + def __init__(self, texts: Sequence[str]): + specials = ["", "", "", ""] + charset = set() + for t in texts: + charset.update(t) + self.itos = specials + sorted(ch for ch in charset if ch not in specials) + self.stoi = {ch: i for i, ch in enumerate(self.itos)} + self.pad_id = self.stoi[""] + self.bos_id = self.stoi[""] + self.eos_id = self.stoi[""] + self.unk_id = self.stoi[""] + + def encode(self, text: str) -> List[int]: + return [self.stoi.get(ch, self.unk_id) for ch in text] + + def decode(self, ids: Sequence[int]) -> str: + return "".join(self.itos[i] for i in ids if 0 <= i < len(self.itos)) + + +class HEDataset(Dataset): + def __init__( + self, + items: Sequence[Dict[str, str]], + vocab: CharVocab, + max_len: int = 512, + ): + self.vocab = vocab + self.rows: List[Tuple[List[int], List[int], List[int], str, str, str]] = [] + for ex in items: + prompt = ex["prompt"] + ref = ex["reference"].rstrip() + "\n" + ctx = [vocab.bos_id] + vocab.encode(prompt + "\n# solution:\n") + tgt = vocab.encode(ref) + [vocab.eos_id] + x = (ctx + tgt)[:max_len] + y = x[1:] + [vocab.eos_id] + ctx_len = min(len(ctx), len(x)) + mask = [0] * ctx_len + [1] * (len(x) - ctx_len) + self.rows.append((x, y, mask, ref, ex["task_id"], prompt)) + + def __len__(self) -> int: + return len(self.rows) + + def __getitem__(self, idx: int): + return self.rows[idx] + + +def collate_fn(batch, pad_id: int): + length = max(len(x) for x, _, _, _, _, _ in batch) + xs, ys, masks, refs, tids, prompts = [], [], [], [], [], [] + for x, y, m, ref, tid, prompt in batch: + pad = length - len(x) + xs.append(x + [pad_id] * pad) + ys.append(y + [pad_id] * pad) + masks.append(m + [0] * pad) + refs.append(ref) + tids.append(tid) + prompts.append(prompt) + return ( + torch.tensor(xs, dtype=torch.long), + torch.tensor(ys, dtype=torch.long), + torch.tensor(masks, dtype=torch.float32), + refs, + tids, + prompts, + ) + + +# --------------------------------------------------------------------------- +# Model +# --------------------------------------------------------------------------- +class SingleLayerGRU(nn.Module): + def __init__(self, vocab_size: int, emb: int = 256, hidden: int = 64): + super().__init__() + self.emb = nn.Embedding(vocab_size, emb, padding_idx=0) + self.rnn = nn.GRU(emb, hidden, num_layers=1, batch_first=True) + self.head = nn.Linear(hidden, vocab_size) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + emb = self.emb(x) + h, _ = self.rnn(emb) + return self.head(h) + + +def masked_ce(logits, targets, mask, pad_id: int) -> torch.Tensor: + B, L, V 
= logits.shape + loss = nn.functional.cross_entropy( + logits.reshape(B * L, V), + targets.reshape(B * L), + ignore_index=pad_id, + reduction="none", + ).reshape(B, L) + return (loss * mask).sum() / (mask.sum() + 1e-8) + + +@torch.no_grad() +def greedy_decode( + model: nn.Module, + vocab: CharVocab, + prompt: str, + max_new: int, + device: torch.device, +) -> str: + model.eval() + ctx = [vocab.bos_id] + vocab.encode(prompt + "\n# solution:\n") + x = torch.tensor([ctx], dtype=torch.long, device=device) + out = ctx.copy() + for _ in range(max_new): + logits = model(x)[:, -1, :] + nid = int(torch.argmax(logits, dim=-1).item()) + out.append(nid) + x = torch.tensor([out], dtype=torch.long, device=device) + if nid == vocab.eos_id: + break + return vocab.decode(out[len(ctx) :]) + + +# --------------------------------------------------------------------------- +# Metrics +# --------------------------------------------------------------------------- +def _ngram_counts(tokens: Sequence[str], max_order: int) -> Counter: + counts: Counter = Counter() + for order in range(1, max_order + 1): + for i in range(len(tokens) - order + 1): + counts[tuple(tokens[i : i + order])] += 1 + return counts + + +def compute_bleu(reference: str, candidate: str, max_order: int = 4) -> float: + ref_tokens = reference.strip().split() + cand_tokens = candidate.strip().split() + if not cand_tokens: + return 0.0 + ref_counts = _ngram_counts(ref_tokens, max_order) + cand_counts = _ngram_counts(cand_tokens, max_order) + matches_by_order = [0] * max_order + possible_matches_by_order = [0] * max_order + for ngram, count in cand_counts.items(): + matches_by_order[len(ngram) - 1] += min(count, ref_counts.get(ngram, 0)) + for order in range(1, max_order + 1): + possible_matches_by_order[order - 1] = max( + 0, len(cand_tokens) - order + 1 + ) + precisions = [] + for match, possible in zip(matches_by_order, possible_matches_by_order): + precisions.append((match + 1) / (possible + 1)) + geo_mean = math.exp(sum(math.log(p) for p in precisions) / max_order) + ref_len = len(ref_tokens) + cand_len = len(cand_tokens) + if cand_len == 0: + return 0.0 + ratio = cand_len / (ref_len + 1e-8) + brevity = 1.0 if ratio > 1.0 else math.exp(1.0 - 1.0 / (ratio + 1e-8)) + return float(geo_mean * brevity) + + +def lcs_length(a: Sequence[str], b: Sequence[str]) -> int: + if not a or not b: + return 0 + prev = [0] * (len(b) + 1) + for token in a: + curr = [0] * (len(b) + 1) + for j, tok_b in enumerate(b, start=1): + if token == tok_b: + curr[j] = prev[j - 1] + 1 + else: + curr[j] = max(prev[j], curr[j - 1]) + prev = curr + return prev[-1] + + +def compute_rouge_l(reference: str, candidate: str) -> float: + ref_tokens = reference.strip().split() + cand_tokens = candidate.strip().split() + if not ref_tokens or not cand_tokens: + return 0.0 + lcs = lcs_length(ref_tokens, cand_tokens) + precision = lcs / len(cand_tokens) + recall = lcs / len(ref_tokens) + if precision + recall == 0: + return 0.0 + return (2 * precision * recall) / (precision + recall) + + +# --------------------------------------------------------------------------- +# Train / evaluate +# --------------------------------------------------------------------------- +def train_model( + model: nn.Module, + train_loader: DataLoader, + optimizer: torch.optim.Optimizer, + device: torch.device, + pad_id: int, + epochs: int, +) -> None: + for epoch in range(1, epochs + 1): + model.train() + total = 0.0 + steps = 0 + for X, Y, M, *_ in train_loader: + X, Y, M = X.to(device), Y.to(device), M.to(device) 
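+ # the cross-entropy is masked to the solution region, so prompt/context tokens do not contribute to the loss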
+ optimizer.zero_grad() + logits = model(X) + loss = masked_ce(logits, Y, M, pad_id) + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) + optimizer.step() + total += float(loss.item()) + steps += 1 + print(f"[Epoch {epoch}] train_loss={total / max(1, steps):.4f}") + + +@torch.no_grad() +def run_eval( + model: nn.Module, + loader: DataLoader, + vocab: CharVocab, + device: torch.device, + decode_max: int, +) -> Tuple[Dict[str, Any], List[Dict[str, str]]]: + model.eval() + bleu_sum = 0.0 + rouge_sum = 0.0 + gens: List[Dict[str, str]] = [] + count = 0 + for _, _, _, refs, tids, prompts in loader: + for ref, tid, prompt in zip(refs, tids, prompts): + gen = greedy_decode(model, vocab, prompt, decode_max, device) + gens.append({"task_id": tid, "generated": gen, "reference": ref}) + bleu_sum += compute_bleu(ref, gen) + rouge_sum += compute_rouge_l(ref, gen) + count += 1 + metrics = { + "BLEU": bleu_sum / max(1, count), + "ROUGE_L": rouge_sum / max(1, count), + "Samples": count, + } + return metrics, gens + + +def main( + out_dir: str, + epochs: int, + batch_size: int, + lr: float, + emb: int, + hidden: int, + max_len: int, + decode_max: int, + max_items: int, +): + os.makedirs(out_dir, exist_ok=True) + items = load_humaneval(max_items=max_items) + train_items, val_items, test_items = pseudo_split(items) + + vocab = CharVocab( + [ex["prompt"] for ex in train_items] + [ex["reference"] for ex in train_items] + ) + train_ds = HEDataset(train_items, vocab, max_len=max_len) + val_ds = HEDataset(val_items, vocab, max_len=max_len) + test_ds = HEDataset(test_items, vocab, max_len=max_len) + + coll = lambda batch: collate_fn(batch, vocab.pad_id) + train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=coll) + val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False, collate_fn=coll) + test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False, collate_fn=coll) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model = SingleLayerGRU(len(vocab.itos), emb=emb, hidden=hidden).to(device) + optimizer = torch.optim.Adam(model.parameters(), lr=lr) + + train_model(model, train_loader, optimizer, device, vocab.pad_id, epochs) + + # quick validation monitoring + with torch.no_grad(): + val_metrics, _ = run_eval(model, val_loader, vocab, device, decode_max) + print(f"[Validation] BLEU={val_metrics['BLEU']:.4f} ROUGE_L={val_metrics['ROUGE_L']:.4f}") + + test_metrics, gens = run_eval(model, test_loader, vocab, device, decode_max) + with open(os.path.join(out_dir, "final_info.json"), "w") as f: + json.dump(test_metrics, f, indent=2) + with open(os.path.join(out_dir, "generations.jsonl"), "w") as f: + for row in gens: + f.write(json.dumps(row) + "\n") + print(json.dumps(test_metrics, indent=2)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--out_dir", type=str, required=True) + parser.add_argument("--epochs", type=int, default=3) + parser.add_argument("--batch_size", type=int, default=8) + parser.add_argument("--lr", type=float, default=1e-3) + parser.add_argument("--emb", type=int, default=256) + parser.add_argument("--hidden", type=int, default=64) + parser.add_argument("--max_len", type=int, default=512) + parser.add_argument("--decode_max", type=int, default=512) + parser.add_argument("--max_items", type=int, default=164) + args = parser.parse_args() + main( + args.out_dir, + args.epochs, + args.batch_size, + args.lr, + args.emb, + args.hidden, + args.max_len, + 
args.decode_max, + args.max_items, + ) diff --git a/frontend/demo_cache/generated/experiments/idea-3/run_1/final_info.json b/frontend/demo_cache/generated/experiments/idea-3/run_1/final_info.json new file mode 100644 index 00000000..e5e354a8 --- /dev/null +++ b/frontend/demo_cache/generated/experiments/idea-3/run_1/final_info.json @@ -0,0 +1,5 @@ +{ + "BLEU": 0.0, + "ROUGE_L": 0.0, + "Samples": 26 +} \ No newline at end of file diff --git a/frontend/demo_cache/generated/experiments/idea-3/run_1/notes.txt b/frontend/demo_cache/generated/experiments/idea-3/run_1/notes.txt new file mode 100644 index 00000000..1f01cfff --- /dev/null +++ b/frontend/demo_cache/generated/experiments/idea-3/run_1/notes.txt @@ -0,0 +1,6 @@ +Auto-generated notes for run_1 +Updated: 2025-11-05T19:27:08.250601Z + +- BLEU: 0.0 +- ROUGE_L: 0.0 +- Samples: 26 diff --git a/frontend/demo_cache/generated/experiments/idea-3/run_2.py b/frontend/demo_cache/generated/experiments/idea-3/run_2.py new file mode 100644 index 00000000..2a2b54ba --- /dev/null +++ b/frontend/demo_cache/generated/experiments/idea-3/run_2.py @@ -0,0 +1,381 @@ +#!/usr/bin/env python3 +""" +Lightweight HumanEval experiment for idea-3: +- Train a single-layer GRU language model on prompt→solution pairs. +- Report BLEU and ROUGE-L on the held-out test split. +""" + +import argparse +import json +import math +import os +import random +from collections import Counter +from typing import Any, Dict, List, Sequence, Tuple + +import torch +import torch.nn as nn +from torch.utils.data import DataLoader, Dataset + +try: + from datasets import load_dataset # pip install datasets +except Exception: + load_dataset = None + +SEED = 42 +random.seed(SEED) +torch.manual_seed(SEED) + + +# --------------------------------------------------------------------------- +# Data loading / preprocessing +# --------------------------------------------------------------------------- +def load_humaneval(max_items: int = 164) -> List[Dict[str, str]]: + if load_dataset is None: + raise RuntimeError( + "Please install `datasets` (pip install datasets) to load HumanEval." 
+ ) + dataset = load_dataset("openai_humaneval") + split_name = "test" if "test" in dataset else next(iter(dataset.keys())) + records = dataset[split_name] + tasks: List[Dict[str, str]] = [] + for i, ex in enumerate(records): + if i >= max_items: + break + prompt = ex.get("prompt", "") + reference = ex.get("canonical_solution", "") + if prompt and reference: + tasks.append( + { + "task_id": ex.get("task_id", f"HE-{i}"), + "prompt": prompt, + "reference": reference, + } + ) + if not tasks: + raise RuntimeError("HumanEval dataset yielded no usable tasks.") + return tasks + + +def pseudo_split(items: Sequence[Dict[str, str]]) -> Tuple[List, List, List]: + idx = list(range(len(items))) + random.Random(SEED).shuffle(idx) + n = len(items) + n_train = int(0.7 * n) + n_val = int(0.15 * n) + train = [items[i] for i in idx[:n_train]] + val = [items[i] for i in idx[n_train : n_train + n_val]] + test = [items[i] for i in idx[n_train + n_val :]] + return train, val, test + + +class CharVocab: + def __init__(self, texts: Sequence[str]): + specials = ["", "", "", ""] + charset = set() + for t in texts: + charset.update(t) + self.itos = specials + sorted(ch for ch in charset if ch not in specials) + self.stoi = {ch: i for i, ch in enumerate(self.itos)} + self.pad_id = self.stoi[""] + self.bos_id = self.stoi[""] + self.eos_id = self.stoi[""] + self.unk_id = self.stoi[""] + + def encode(self, text: str) -> List[int]: + return [self.stoi.get(ch, self.unk_id) for ch in text] + + def decode(self, ids: Sequence[int]) -> str: + return "".join(self.itos[i] for i in ids if 0 <= i < len(self.itos)) + + +class HEDataset(Dataset): + def __init__( + self, + items: Sequence[Dict[str, str]], + vocab: CharVocab, + max_len: int = 512, + ): + self.vocab = vocab + self.rows: List[Tuple[List[int], List[int], List[int], str, str, str]] = [] + for ex in items: + prompt = ex["prompt"] + ref = ex["reference"].rstrip() + "\n" + ctx = [vocab.bos_id] + vocab.encode(prompt + "\n# solution:\n") + tgt = vocab.encode(ref) + [vocab.eos_id] + x = (ctx + tgt)[:max_len] + y = x[1:] + [vocab.eos_id] + ctx_len = min(len(ctx), len(x)) + mask = [0] * ctx_len + [1] * (len(x) - ctx_len) + self.rows.append((x, y, mask, ref, ex["task_id"], prompt)) + + def __len__(self) -> int: + return len(self.rows) + + def __getitem__(self, idx: int): + return self.rows[idx] + + +def collate_fn(batch, pad_id: int): + length = max(len(x) for x, _, _, _, _, _ in batch) + xs, ys, masks, refs, tids, prompts = [], [], [], [], [], [] + for x, y, m, ref, tid, prompt in batch: + pad = length - len(x) + xs.append(x + [pad_id] * pad) + ys.append(y + [pad_id] * pad) + masks.append(m + [0] * pad) + refs.append(ref) + tids.append(tid) + prompts.append(prompt) + return ( + torch.tensor(xs, dtype=torch.long), + torch.tensor(ys, dtype=torch.long), + torch.tensor(masks, dtype=torch.float32), + refs, + tids, + prompts, + ) + + +# --------------------------------------------------------------------------- +# Model +# --------------------------------------------------------------------------- +class SingleLayerGRU(nn.Module): + def __init__(self, vocab_size: int, emb: int = 256, hidden: int = 64): + super().__init__() + self.emb = nn.Embedding(vocab_size, emb, padding_idx=0) + self.rnn = nn.GRU(emb, hidden, num_layers=1, batch_first=True) + self.head = nn.Linear(hidden, vocab_size) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + emb = self.emb(x) + h, _ = self.rnn(emb) + return self.head(h) + + +def masked_ce(logits, targets, mask, pad_id: int) -> torch.Tensor: + B, L, V 
= logits.shape + loss = nn.functional.cross_entropy( + logits.reshape(B * L, V), + targets.reshape(B * L), + ignore_index=pad_id, + reduction="none", + ).reshape(B, L) + return (loss * mask).sum() / (mask.sum() + 1e-8) + + +@torch.no_grad() +def greedy_decode( + model: nn.Module, + vocab: CharVocab, + prompt: str, + max_new: int, + device: torch.device, +) -> str: + model.eval() + ctx = [vocab.bos_id] + vocab.encode(prompt + "\n# solution:\n") + x = torch.tensor([ctx], dtype=torch.long, device=device) + out = ctx.copy() + for _ in range(max_new): + logits = model(x)[:, -1, :] + nid = int(torch.argmax(logits, dim=-1).item()) + out.append(nid) + x = torch.tensor([out], dtype=torch.long, device=device) + if nid == vocab.eos_id: + break + return vocab.decode(out[len(ctx) :]) + + +# --------------------------------------------------------------------------- +# Metrics +# --------------------------------------------------------------------------- +def _ngram_counts(tokens: Sequence[str], max_order: int) -> Counter: + counts: Counter = Counter() + for order in range(1, max_order + 1): + for i in range(len(tokens) - order + 1): + counts[tuple(tokens[i : i + order])] += 1 + return counts + + +def compute_bleu(reference: str, candidate: str, max_order: int = 4) -> float: + ref_tokens = reference.strip().split() + cand_tokens = candidate.strip().split() + if not cand_tokens: + return 0.0 + ref_counts = _ngram_counts(ref_tokens, max_order) + cand_counts = _ngram_counts(cand_tokens, max_order) + matches_by_order = [0] * max_order + possible_matches_by_order = [0] * max_order + for ngram, count in cand_counts.items(): + matches_by_order[len(ngram) - 1] += min(count, ref_counts.get(ngram, 0)) + for order in range(1, max_order + 1): + possible_matches_by_order[order - 1] = max( + 0, len(cand_tokens) - order + 1 + ) + precisions = [] + for match, possible in zip(matches_by_order, possible_matches_by_order): + precisions.append((match + 1) / (possible + 1)) + geo_mean = math.exp(sum(math.log(p) for p in precisions) / max_order) + ref_len = len(ref_tokens) + cand_len = len(cand_tokens) + if cand_len == 0: + return 0.0 + ratio = cand_len / (ref_len + 1e-8) + brevity = 1.0 if ratio > 1.0 else math.exp(1.0 - 1.0 / (ratio + 1e-8)) + return float(geo_mean * brevity) + + +def lcs_length(a: Sequence[str], b: Sequence[str]) -> int: + if not a or not b: + return 0 + prev = [0] * (len(b) + 1) + for token in a: + curr = [0] * (len(b) + 1) + for j, tok_b in enumerate(b, start=1): + if token == tok_b: + curr[j] = prev[j - 1] + 1 + else: + curr[j] = max(prev[j], curr[j - 1]) + prev = curr + return prev[-1] + + +def compute_rouge_l(reference: str, candidate: str) -> float: + ref_tokens = reference.strip().split() + cand_tokens = candidate.strip().split() + if not ref_tokens or not cand_tokens: + return 0.0 + lcs = lcs_length(ref_tokens, cand_tokens) + precision = lcs / len(cand_tokens) + recall = lcs / len(ref_tokens) + if precision + recall == 0: + return 0.0 + return (2 * precision * recall) / (precision + recall) + + +# --------------------------------------------------------------------------- +# Train / evaluate +# --------------------------------------------------------------------------- +def train_model( + model: nn.Module, + train_loader: DataLoader, + optimizer: torch.optim.Optimizer, + device: torch.device, + pad_id: int, + epochs: int, +) -> None: + for epoch in range(1, epochs + 1): + model.train() + total = 0.0 + steps = 0 + for X, Y, M, *_ in train_loader: + X, Y, M = X.to(device), Y.to(device), M.to(device) 
+ optimizer.zero_grad() + logits = model(X) + loss = masked_ce(logits, Y, M, pad_id) + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) + optimizer.step() + total += float(loss.item()) + steps += 1 + print(f"[Epoch {epoch}] train_loss={total / max(1, steps):.4f}") + + +@torch.no_grad() +def run_eval( + model: nn.Module, + loader: DataLoader, + vocab: CharVocab, + device: torch.device, + decode_max: int, +) -> Tuple[Dict[str, Any], List[Dict[str, str]]]: + model.eval() + bleu_sum = 0.0 + rouge_sum = 0.0 + gens: List[Dict[str, str]] = [] + count = 0 + for _, _, _, refs, tids, prompts in loader: + for ref, tid, prompt in zip(refs, tids, prompts): + gen = greedy_decode(model, vocab, prompt, decode_max, device) + gens.append({"task_id": tid, "generated": gen, "reference": ref}) + bleu_sum += compute_bleu(ref, gen) + rouge_sum += compute_rouge_l(ref, gen) + count += 1 + metrics = { + "BLEU": bleu_sum / max(1, count), + "ROUGE_L": rouge_sum / max(1, count), + "Samples": count, + } + return metrics, gens + + +def main( + out_dir: str, + epochs: int, + batch_size: int, + lr: float, + emb: int, + hidden: int, + max_len: int, + decode_max: int, + max_items: int, +): + os.makedirs(out_dir, exist_ok=True) + items = load_humaneval(max_items=max_items) + train_items, val_items, test_items = pseudo_split(items) + + vocab = CharVocab( + [ex["prompt"] for ex in train_items] + [ex["reference"] for ex in train_items] + ) + train_ds = HEDataset(train_items, vocab, max_len=max_len) + val_ds = HEDataset(val_items, vocab, max_len=max_len) + test_ds = HEDataset(test_items, vocab, max_len=max_len) + + coll = lambda batch: collate_fn(batch, vocab.pad_id) + train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=coll) + val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False, collate_fn=coll) + test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False, collate_fn=coll) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model = SingleLayerGRU(len(vocab.itos), emb=emb, hidden=hidden).to(device) + optimizer = torch.optim.Adam(model.parameters(), lr=lr) + + train_model(model, train_loader, optimizer, device, vocab.pad_id, epochs) + + # quick validation monitoring + with torch.no_grad(): + val_metrics, _ = run_eval(model, val_loader, vocab, device, decode_max) + print(f"[Validation] BLEU={val_metrics['BLEU']:.4f} ROUGE_L={val_metrics['ROUGE_L']:.4f}") + + test_metrics, gens = run_eval(model, test_loader, vocab, device, decode_max) + with open(os.path.join(out_dir, "final_info.json"), "w") as f: + json.dump(test_metrics, f, indent=2) + with open(os.path.join(out_dir, "generations.jsonl"), "w") as f: + for row in gens: + f.write(json.dumps(row) + "\n") + print(json.dumps(test_metrics, indent=2)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--out_dir", type=str, required=True) + parser.add_argument("--epochs", type=int, default=3) + parser.add_argument("--batch_size", type=int, default=8) + parser.add_argument("--lr", type=float, default=1e-3) + parser.add_argument("--emb", type=int, default=256) + parser.add_argument("--hidden", type=int, default=64) + parser.add_argument("--max_len", type=int, default=512) + parser.add_argument("--decode_max", type=int, default=512) + parser.add_argument("--max_items", type=int, default=164) + args = parser.parse_args() + main( + args.out_dir, + args.epochs, + args.batch_size, + args.lr, + args.emb, + args.hidden, + args.max_len, + 
args.decode_max, + args.max_items, + ) diff --git a/frontend/demo_cache/generated/experiments/idea-3/run_2/final_info.json b/frontend/demo_cache/generated/experiments/idea-3/run_2/final_info.json new file mode 100644 index 00000000..e5e354a8 --- /dev/null +++ b/frontend/demo_cache/generated/experiments/idea-3/run_2/final_info.json @@ -0,0 +1,5 @@ +{ + "BLEU": 0.0, + "ROUGE_L": 0.0, + "Samples": 26 +} \ No newline at end of file diff --git a/frontend/demo_cache/generated/experiments/idea-3/run_2/notes.txt b/frontend/demo_cache/generated/experiments/idea-3/run_2/notes.txt new file mode 100644 index 00000000..8831ecf6 --- /dev/null +++ b/frontend/demo_cache/generated/experiments/idea-3/run_2/notes.txt @@ -0,0 +1,6 @@ +Auto-generated notes for run_2 +Updated: 2025-11-05T19:29:40.248282Z + +- BLEU: 0.0 +- ROUGE_L: 0.0 +- Samples: 26 diff --git a/frontend/demo_cache/generated/experiments/idea-3/run_3.py b/frontend/demo_cache/generated/experiments/idea-3/run_3.py new file mode 100644 index 00000000..2a2b54ba --- /dev/null +++ b/frontend/demo_cache/generated/experiments/idea-3/run_3.py @@ -0,0 +1,381 @@ +#!/usr/bin/env python3 +""" +Lightweight HumanEval experiment for idea-3: +- Train a single-layer GRU language model on prompt→solution pairs. +- Report BLEU and ROUGE-L on the held-out test split. +""" + +import argparse +import json +import math +import os +import random +from collections import Counter +from typing import Any, Dict, List, Sequence, Tuple + +import torch +import torch.nn as nn +from torch.utils.data import DataLoader, Dataset + +try: + from datasets import load_dataset # pip install datasets +except Exception: + load_dataset = None + +SEED = 42 +random.seed(SEED) +torch.manual_seed(SEED) + + +# --------------------------------------------------------------------------- +# Data loading / preprocessing +# --------------------------------------------------------------------------- +def load_humaneval(max_items: int = 164) -> List[Dict[str, str]]: + if load_dataset is None: + raise RuntimeError( + "Please install `datasets` (pip install datasets) to load HumanEval." 
+ ) + dataset = load_dataset("openai_humaneval") + split_name = "test" if "test" in dataset else next(iter(dataset.keys())) + records = dataset[split_name] + tasks: List[Dict[str, str]] = [] + for i, ex in enumerate(records): + if i >= max_items: + break + prompt = ex.get("prompt", "") + reference = ex.get("canonical_solution", "") + if prompt and reference: + tasks.append( + { + "task_id": ex.get("task_id", f"HE-{i}"), + "prompt": prompt, + "reference": reference, + } + ) + if not tasks: + raise RuntimeError("HumanEval dataset yielded no usable tasks.") + return tasks + + +def pseudo_split(items: Sequence[Dict[str, str]]) -> Tuple[List, List, List]: + idx = list(range(len(items))) + random.Random(SEED).shuffle(idx) + n = len(items) + n_train = int(0.7 * n) + n_val = int(0.15 * n) + train = [items[i] for i in idx[:n_train]] + val = [items[i] for i in idx[n_train : n_train + n_val]] + test = [items[i] for i in idx[n_train + n_val :]] + return train, val, test + + +class CharVocab: + def __init__(self, texts: Sequence[str]): + specials = ["", "", "", ""] + charset = set() + for t in texts: + charset.update(t) + self.itos = specials + sorted(ch for ch in charset if ch not in specials) + self.stoi = {ch: i for i, ch in enumerate(self.itos)} + self.pad_id = self.stoi[""] + self.bos_id = self.stoi[""] + self.eos_id = self.stoi[""] + self.unk_id = self.stoi[""] + + def encode(self, text: str) -> List[int]: + return [self.stoi.get(ch, self.unk_id) for ch in text] + + def decode(self, ids: Sequence[int]) -> str: + return "".join(self.itos[i] for i in ids if 0 <= i < len(self.itos)) + + +class HEDataset(Dataset): + def __init__( + self, + items: Sequence[Dict[str, str]], + vocab: CharVocab, + max_len: int = 512, + ): + self.vocab = vocab + self.rows: List[Tuple[List[int], List[int], List[int], str, str, str]] = [] + for ex in items: + prompt = ex["prompt"] + ref = ex["reference"].rstrip() + "\n" + ctx = [vocab.bos_id] + vocab.encode(prompt + "\n# solution:\n") + tgt = vocab.encode(ref) + [vocab.eos_id] + x = (ctx + tgt)[:max_len] + y = x[1:] + [vocab.eos_id] + ctx_len = min(len(ctx), len(x)) + mask = [0] * ctx_len + [1] * (len(x) - ctx_len) + self.rows.append((x, y, mask, ref, ex["task_id"], prompt)) + + def __len__(self) -> int: + return len(self.rows) + + def __getitem__(self, idx: int): + return self.rows[idx] + + +def collate_fn(batch, pad_id: int): + length = max(len(x) for x, _, _, _, _, _ in batch) + xs, ys, masks, refs, tids, prompts = [], [], [], [], [], [] + for x, y, m, ref, tid, prompt in batch: + pad = length - len(x) + xs.append(x + [pad_id] * pad) + ys.append(y + [pad_id] * pad) + masks.append(m + [0] * pad) + refs.append(ref) + tids.append(tid) + prompts.append(prompt) + return ( + torch.tensor(xs, dtype=torch.long), + torch.tensor(ys, dtype=torch.long), + torch.tensor(masks, dtype=torch.float32), + refs, + tids, + prompts, + ) + + +# --------------------------------------------------------------------------- +# Model +# --------------------------------------------------------------------------- +class SingleLayerGRU(nn.Module): + def __init__(self, vocab_size: int, emb: int = 256, hidden: int = 64): + super().__init__() + self.emb = nn.Embedding(vocab_size, emb, padding_idx=0) + self.rnn = nn.GRU(emb, hidden, num_layers=1, batch_first=True) + self.head = nn.Linear(hidden, vocab_size) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + emb = self.emb(x) + h, _ = self.rnn(emb) + return self.head(h) + + +def masked_ce(logits, targets, mask, pad_id: int) -> torch.Tensor: + B, L, V 
= logits.shape + loss = nn.functional.cross_entropy( + logits.reshape(B * L, V), + targets.reshape(B * L), + ignore_index=pad_id, + reduction="none", + ).reshape(B, L) + return (loss * mask).sum() / (mask.sum() + 1e-8) + + +@torch.no_grad() +def greedy_decode( + model: nn.Module, + vocab: CharVocab, + prompt: str, + max_new: int, + device: torch.device, +) -> str: + model.eval() + ctx = [vocab.bos_id] + vocab.encode(prompt + "\n# solution:\n") + x = torch.tensor([ctx], dtype=torch.long, device=device) + out = ctx.copy() + for _ in range(max_new): + logits = model(x)[:, -1, :] + nid = int(torch.argmax(logits, dim=-1).item()) + out.append(nid) + x = torch.tensor([out], dtype=torch.long, device=device) + if nid == vocab.eos_id: + break + return vocab.decode(out[len(ctx) :]) + + +# --------------------------------------------------------------------------- +# Metrics +# --------------------------------------------------------------------------- +def _ngram_counts(tokens: Sequence[str], max_order: int) -> Counter: + counts: Counter = Counter() + for order in range(1, max_order + 1): + for i in range(len(tokens) - order + 1): + counts[tuple(tokens[i : i + order])] += 1 + return counts + + +def compute_bleu(reference: str, candidate: str, max_order: int = 4) -> float: + ref_tokens = reference.strip().split() + cand_tokens = candidate.strip().split() + if not cand_tokens: + return 0.0 + ref_counts = _ngram_counts(ref_tokens, max_order) + cand_counts = _ngram_counts(cand_tokens, max_order) + matches_by_order = [0] * max_order + possible_matches_by_order = [0] * max_order + for ngram, count in cand_counts.items(): + matches_by_order[len(ngram) - 1] += min(count, ref_counts.get(ngram, 0)) + for order in range(1, max_order + 1): + possible_matches_by_order[order - 1] = max( + 0, len(cand_tokens) - order + 1 + ) + precisions = [] + for match, possible in zip(matches_by_order, possible_matches_by_order): + precisions.append((match + 1) / (possible + 1)) + geo_mean = math.exp(sum(math.log(p) for p in precisions) / max_order) + ref_len = len(ref_tokens) + cand_len = len(cand_tokens) + if cand_len == 0: + return 0.0 + ratio = cand_len / (ref_len + 1e-8) + brevity = 1.0 if ratio > 1.0 else math.exp(1.0 - 1.0 / (ratio + 1e-8)) + return float(geo_mean * brevity) + + +def lcs_length(a: Sequence[str], b: Sequence[str]) -> int: + if not a or not b: + return 0 + prev = [0] * (len(b) + 1) + for token in a: + curr = [0] * (len(b) + 1) + for j, tok_b in enumerate(b, start=1): + if token == tok_b: + curr[j] = prev[j - 1] + 1 + else: + curr[j] = max(prev[j], curr[j - 1]) + prev = curr + return prev[-1] + + +def compute_rouge_l(reference: str, candidate: str) -> float: + ref_tokens = reference.strip().split() + cand_tokens = candidate.strip().split() + if not ref_tokens or not cand_tokens: + return 0.0 + lcs = lcs_length(ref_tokens, cand_tokens) + precision = lcs / len(cand_tokens) + recall = lcs / len(ref_tokens) + if precision + recall == 0: + return 0.0 + return (2 * precision * recall) / (precision + recall) + + +# --------------------------------------------------------------------------- +# Train / evaluate +# --------------------------------------------------------------------------- +def train_model( + model: nn.Module, + train_loader: DataLoader, + optimizer: torch.optim.Optimizer, + device: torch.device, + pad_id: int, + epochs: int, +) -> None: + for epoch in range(1, epochs + 1): + model.train() + total = 0.0 + steps = 0 + for X, Y, M, *_ in train_loader: + X, Y, M = X.to(device), Y.to(device), M.to(device) 
+ optimizer.zero_grad() + logits = model(X) + loss = masked_ce(logits, Y, M, pad_id) + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) + optimizer.step() + total += float(loss.item()) + steps += 1 + print(f"[Epoch {epoch}] train_loss={total / max(1, steps):.4f}") + + +@torch.no_grad() +def run_eval( + model: nn.Module, + loader: DataLoader, + vocab: CharVocab, + device: torch.device, + decode_max: int, +) -> Tuple[Dict[str, Any], List[Dict[str, str]]]: + model.eval() + bleu_sum = 0.0 + rouge_sum = 0.0 + gens: List[Dict[str, str]] = [] + count = 0 + for _, _, _, refs, tids, prompts in loader: + for ref, tid, prompt in zip(refs, tids, prompts): + gen = greedy_decode(model, vocab, prompt, decode_max, device) + gens.append({"task_id": tid, "generated": gen, "reference": ref}) + bleu_sum += compute_bleu(ref, gen) + rouge_sum += compute_rouge_l(ref, gen) + count += 1 + metrics = { + "BLEU": bleu_sum / max(1, count), + "ROUGE_L": rouge_sum / max(1, count), + "Samples": count, + } + return metrics, gens + + +def main( + out_dir: str, + epochs: int, + batch_size: int, + lr: float, + emb: int, + hidden: int, + max_len: int, + decode_max: int, + max_items: int, +): + os.makedirs(out_dir, exist_ok=True) + items = load_humaneval(max_items=max_items) + train_items, val_items, test_items = pseudo_split(items) + + vocab = CharVocab( + [ex["prompt"] for ex in train_items] + [ex["reference"] for ex in train_items] + ) + train_ds = HEDataset(train_items, vocab, max_len=max_len) + val_ds = HEDataset(val_items, vocab, max_len=max_len) + test_ds = HEDataset(test_items, vocab, max_len=max_len) + + coll = lambda batch: collate_fn(batch, vocab.pad_id) + train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=coll) + val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False, collate_fn=coll) + test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False, collate_fn=coll) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model = SingleLayerGRU(len(vocab.itos), emb=emb, hidden=hidden).to(device) + optimizer = torch.optim.Adam(model.parameters(), lr=lr) + + train_model(model, train_loader, optimizer, device, vocab.pad_id, epochs) + + # quick validation monitoring + with torch.no_grad(): + val_metrics, _ = run_eval(model, val_loader, vocab, device, decode_max) + print(f"[Validation] BLEU={val_metrics['BLEU']:.4f} ROUGE_L={val_metrics['ROUGE_L']:.4f}") + + test_metrics, gens = run_eval(model, test_loader, vocab, device, decode_max) + with open(os.path.join(out_dir, "final_info.json"), "w") as f: + json.dump(test_metrics, f, indent=2) + with open(os.path.join(out_dir, "generations.jsonl"), "w") as f: + for row in gens: + f.write(json.dumps(row) + "\n") + print(json.dumps(test_metrics, indent=2)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--out_dir", type=str, required=True) + parser.add_argument("--epochs", type=int, default=3) + parser.add_argument("--batch_size", type=int, default=8) + parser.add_argument("--lr", type=float, default=1e-3) + parser.add_argument("--emb", type=int, default=256) + parser.add_argument("--hidden", type=int, default=64) + parser.add_argument("--max_len", type=int, default=512) + parser.add_argument("--decode_max", type=int, default=512) + parser.add_argument("--max_items", type=int, default=164) + args = parser.parse_args() + main( + args.out_dir, + args.epochs, + args.batch_size, + args.lr, + args.emb, + args.hidden, + args.max_len, + 
args.decode_max, + args.max_items, + ) diff --git a/frontend/demo_cache/generated/experiments/idea-3/run_3/final_info.json b/frontend/demo_cache/generated/experiments/idea-3/run_3/final_info.json new file mode 100644 index 00000000..e5e354a8 --- /dev/null +++ b/frontend/demo_cache/generated/experiments/idea-3/run_3/final_info.json @@ -0,0 +1,5 @@ +{ + "BLEU": 0.0, + "ROUGE_L": 0.0, + "Samples": 26 +} \ No newline at end of file diff --git a/frontend/demo_cache/generated/experiments/idea-3/run_3/notes.txt b/frontend/demo_cache/generated/experiments/idea-3/run_3/notes.txt new file mode 100644 index 00000000..d4c8c3e4 --- /dev/null +++ b/frontend/demo_cache/generated/experiments/idea-3/run_3/notes.txt @@ -0,0 +1,6 @@ +Auto-generated notes for run_3 +Updated: 2025-11-05T19:32:12.380302Z + +- BLEU: 0.0 +- ROUGE_L: 0.0 +- Samples: 26 diff --git a/frontend/demo_cache/generated/papers/idea-1/investigating_adaptive_prompt_decomposition_for_improved_long-range_coherence_in_code_generation.pdf b/frontend/demo_cache/generated/papers/idea-1/investigating_adaptive_prompt_decomposition_for_improved_long-range_coherence_in_code_generation.pdf new file mode 100644 index 00000000..766d940c Binary files /dev/null and b/frontend/demo_cache/generated/papers/idea-1/investigating_adaptive_prompt_decomposition_for_improved_long-range_coherence_in_code_generation.pdf differ diff --git a/frontend/demo_cache/generated/papers/idea-2/investigating_adaptive_prompt_decomposition_to_enhance_coherent_long-range_code_generation.pdf b/frontend/demo_cache/generated/papers/idea-2/investigating_adaptive_prompt_decomposition_to_enhance_coherent_long-range_code_generation.pdf new file mode 100644 index 00000000..c87d7549 Binary files /dev/null and b/frontend/demo_cache/generated/papers/idea-2/investigating_adaptive_prompt_decomposition_to_enhance_coherent_long-range_code_generation.pdf differ diff --git a/frontend/demo_cache/generated/papers/idea-3/exploring_adaptive_prompt_decomposition_for_enhanced_coherent_long-range_code_generation.pdf b/frontend/demo_cache/generated/papers/idea-3/exploring_adaptive_prompt_decomposition_for_enhanced_coherent_long-range_code_generation.pdf new file mode 100644 index 00000000..627a9593 Binary files /dev/null and b/frontend/demo_cache/generated/papers/idea-3/exploring_adaptive_prompt_decomposition_for_enhanced_coherent_long-range_code_generation.pdf differ diff --git a/frontend/demo_cache/ideas/idea-1-1/idea.json b/frontend/demo_cache/ideas/idea-1-1/idea.json new file mode 100644 index 00000000..6eff29e0 --- /dev/null +++ b/frontend/demo_cache/ideas/idea-1-1/idea.json @@ -0,0 +1,47 @@ +{ + "content": "**Description:**\nThis research investigates a novel adaptive mechanism for prompt decomposition in large language models to enhance coherence in long-range code generation. By dynamically adjusting prompt structures based on the complexity and context of the coding task, we aim to overcome the limitations of static prompt engineering and achieve more coherent and consistent code outputs over extensive sequences.\n\n**Impact:**\nAs software systems grow in complexity, the demand for autonomous coding tools that can manage and generate coherent long-range code is increasing. Current models like Codex and CodeBERT, while groundbreaking, struggle with maintaining coherence over long sequences, limiting their utility in real-world applications. 
Addressing this gap is crucial for advancing the capability of automated code generation tools, aligning with the community's push towards more robust and scalable AI-driven software development solutions.\n\n**Feasibility:**\n(1) The context window size of LLMs is limited, leading to information loss over long sequences. (2) Static prompt engineering fails to adapt to the dynamic and diverse nature of real-world coding tasks. (3) Balancing prompt decomposition granularity with model synthesis capabilities is complex. (4) Ensuring that adaptive decomposition does not introduce prohibitive computational overhead.\n\n**Novelty:**\nExisting methods such as Codex and PaLM focus primarily on enhancing the model\u2019s capacity to understand and generate code, with limited attention to how prompts are decomposed for long-range coherence. They use static prompt engineering, which cannot adequately manage the dynamic nature of complex coding tasks. Our approach introduces a dynamic, context-aware decomposition mechanism, allowing the model to adjust the granularity of decomposition based on task complexity. This context-sensitive adaptation is not present in existing methods, which often result in fragmented or inconsistent outputs. Our method leverages the structure of the task itself to guide prompt decomposition, maintaining coherence over longer sequences without the overhead associated with naively increasing context window sizes.", + "originalData": { + "Approach": "Our core algorithm involves a dynamic prompt decomposition mechanism that evaluates the task's structural and contextual complexity. (1) To address context window limitations, we introduce a sliding window mechanism that adapts the context window size based on real-time task analysis. (2) For handling dynamic task nature, we propose a feedback loop where the model evaluates intermediate outputs to adjust decomposition strategies. (3) To balance granularity and synthesis, the algorithm uses a hierarchical approach, breaking down tasks into nested segments that maintain logical coherence. (4) To manage computational overhead, we incorporate a lightweight heuristic-driven evaluation that determines when and how to adjust decomposition strategies, ensuring efficiency.", + "Description": "This research investigates a novel adaptive mechanism for prompt decomposition in large language models to enhance coherence in long-range code generation. By dynamically adjusting prompt structures based on the complexity and context of the coding task, we aim to overcome the limitations of static prompt engineering and achieve more coherent and consistent code outputs over extensive sequences.", + "Difficulty": "(1) The context window size of LLMs is limited, leading to information loss over long sequences. (2) Static prompt engineering fails to adapt to the dynamic and diverse nature of real-world coding tasks. (3) Balancing prompt decomposition granularity with model synthesis capabilities is complex. 
(4) Ensuring that adaptive decomposition does not introduce prohibitive computational overhead.", + "Experiment": { + "Dataset": { + "Load_Command": "datasets.load_dataset('ag_news')", + "Name": "ag_news", + "Preprocessing": "Lowercasing, Tokenization, Padding/Truncation to 100 tokens, TF-IDF with 300 features", + "Size": 5000, + "Splits": { + "Test": 500, + "Train": 4000, + "Validation": 500 + } + }, + "Metric": { + "Justification": "Coherence Score to evaluate long-range coherence; BLEU for overall sequence similarity to reference.", + "Primary": "Sequence Coherence Score", + "Secondary": "BLEU Score" + }, + "Model": { + "Hidden_Units": 64, + "Input_Dimensions": 300, + "Output_Dimensions": 300, + "Parameters": 68400, + "Type": "Single-layer GRU" + } + }, + "ExperimentTable": "| Component | Specification | Justification / Rationale | Status |\n|---------------------|-----------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------|--------|\n| Model Architecture | Single-layer GRU with 64 hidden units, input and output dimensions 300, total parameters approximately 68,400. | GRUs can handle sequential data and mimic context-aware adjustments suitable for exploring dynamic prompt decomposition. | |\n| Dataset | AG News dataset, 5,000 samples with a 4000/500/500 train/val/test split. Preprocess by lowercasing, tokenizing, and using TF-IDF vectors. | AG News offers sufficient complexity to simulate code-like sequences. TF-IDF helps in capturing token importance. | |\n| Baselines | Static prompt decomposition (simple heuristic-based), Bag-of-words logistic regression, Shallow MLP with 1 hidden layer. | Comparative methods to highlight the benefits of dynamic decomposition versus static and other simple models. | |\n| Training Setup | Optimizer: Adam, Learning Rate: 0.001, Batch Size: 32, Epochs: 10, Hardware: CPU | Basic yet effective training setup for lightweight models in initial experiments. | |\n| Evaluation Metrics | Primary: Sequence Coherence Score; Secondary: BLEU Score | Coherence Score directly measures long-range coherence; BLEU provides a standard sequence similarity metric. | |\n| Hyperparameters | GRU hidden units: 64, TF-IDF features: 300, Sequence length: 100 tokens | Balances model complexity with dataset structure, ensuring feasibility within resource constraints. | |\n| **Sanity Checks** | Dataset subsampling strategy confirmed (\u22645,000 train / \u22642,000 val/test). Model parameter count estimated (\u2264100k). No JSON comments present. | | |", + "Feasibility": 7, + "Importance": "As software systems grow in complexity, the demand for autonomous coding tools that can manage and generate coherent long-range code is increasing. Current models like Codex and CodeBERT, while groundbreaking, struggle with maintaining coherence over long sequences, limiting their utility in real-world applications. 
Addressing this gap is crucial for advancing the capability of automated code generation tools, aligning with the community's push towards more robust and scalable AI-driven software development solutions.", + "IntentAlignment": 8, + "Interestingness": 8, + "Name": "Adaptive Prompt Decomposition for Coherent Long-Range Code Generation", + "Novelty": 9, + "NoveltyComparison": "Existing methods such as Codex and PaLM focus primarily on enhancing the model\u2019s capacity to understand and generate code, with limited attention to how prompts are decomposed for long-range coherence. They use static prompt engineering, which cannot adequately manage the dynamic nature of complex coding tasks. Our approach introduces a dynamic, context-aware decomposition mechanism, allowing the model to adjust the granularity of decomposition based on task complexity. This context-sensitive adaptation is not present in existing methods, which often result in fragmented or inconsistent outputs. Our method leverages the structure of the task itself to guide prompt decomposition, maintaining coherence over longer sequences without the overhead associated with naively increasing context window sizes.", + "Problem": "Can dynamic, context-aware prompt decomposition improve the coherence of long-range code generation in large language models compared to static methods?", + "Score": 8, + "Title": "Dynamic Context-Aware Prompt Decomposition for Improved Coherence in Long-Range Code Generation by LLMs", + "is_experimental": true + }, + "title": "Adaptive prompt decomposition for coherent long-range code generation", + "id": "idea-1-1" +} diff --git a/frontend/demo_cache/ideas/idea-1-2/idea.json b/frontend/demo_cache/ideas/idea-1-2/idea.json new file mode 100644 index 00000000..af0b6b22 --- /dev/null +++ b/frontend/demo_cache/ideas/idea-1-2/idea.json @@ -0,0 +1,47 @@ +{ + "content": "**Description:**\nThis research proposes an adaptive prompt decomposition technique to improve the coherence of large language models (LLMs) in generating long-range code. The method dynamically segments prompts based on complexity and context, ensuring that the model maintains coherent code generation across lengthy sequences.\n\n**Impact:**\nThe research addresses a critical gap in the ability of LLMs like Codex and CodeBERT to generate coherent long-range code, which is vital for complex software development. As these models become more integrated into automated coding tools, their capacity to manage large and dynamic codebases coherently is in high demand. This study aligns with the trend toward autonomous coding solutions and the need for more intelligent prompt engineering strategies.\n\n**Feasibility:**\n(1) Maintaining coherence in long-range code generation is challenging due to LLMs' limited context window sizes, leading to fragmentation. (2) Existing static prompt engineering techniques do not adapt to the diverse and dynamic nature of real-world coding tasks. (3) Balancing the granularity of decomposition to prevent information loss while preserving synthesis capability is difficult.\n\n**Novelty:**\nWhile models like OpenAI's Codex and Google's PaLM have advanced code comprehension and generation, they lack mechanisms for adaptive prompt decomposition. Existing static methods fail to account for the variable complexity of real coding environments, leading to suboptimal coherence in extensive code sequences. 
Our approach introduces a context-aware adaptive mechanism that adjusts prompt decomposition based on task structure and complexity in real-time, a capability not realized in prior work. By focusing on dynamic decomposition, our method prevents the coherence breakdown seen in existing models, offering a significant leap in maintaining code integrity over long sequences.", + "originalData": { + "Approach": "The core algorithm involves a dynamic prompt decomposition strategy that evaluates the task's complexity and context. (1) To tackle coherence issues, our method uses a sliding window technique combined with semantic analysis to ensure that context is maintained throughout generation. (2) We introduce an adaptive mechanism that analyzes code structure and adjusts decomposition dynamically, unlike static methods that fail in diverse environments. (3) Our method balances granularity by using a feedback loop that assesses the coherence of generated segments, ensuring that the synthesis capability remains intact while preventing information loss.", + "Description": "This research proposes an adaptive prompt decomposition technique to improve the coherence of large language models (LLMs) in generating long-range code. The method dynamically segments prompts based on complexity and context, ensuring that the model maintains coherent code generation across lengthy sequences.", + "Difficulty": "(1) Maintaining coherence in long-range code generation is challenging due to LLMs' limited context window sizes, leading to fragmentation. (2) Existing static prompt engineering techniques do not adapt to the diverse and dynamic nature of real-world coding tasks. (3) Balancing the granularity of decomposition to prevent information loss while preserving synthesis capability is difficult.", + "Experiment": { + "Dataset": { + "Load_Command": "datasets.load_dataset('code_x_glue_cc_clone_detection_big_clone_bench')", + "Name": "code_x_glue_cc_clone_detection_big_clone_bench", + "Preprocessing": "Tokenization with CountVectorizer, max_features=512", + "Size": 7000, + "Splits": { + "Test": 1000, + "Train": 5000, + "Validation": 1000 + } + }, + "Metric": { + "Primary": "BLEU Score", + "Secondary": "Code Coherence Metric (CCM)" + }, + "Model": { + "Architecture": "Shallow MLP", + "Hidden_Layers": 1, + "Hidden_Units": 128, + "Input_Dimension": 512, + "Output_Dimension": 256, + "Total_Parameters": 98752 + } + }, + "ExperimentTable": "| Component | Specification | Justification / Rationale | Status |\n|----------------------|-------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------|\n| Model Architecture | Shallow MLP with 1 hidden layer of 128 units, input dimension 512, output 256 | Ensures model simplicity while allowing some degree of learning from the input features. This architecture is a balance between complexity and feasibility given the parameter constraints. | |\n| Dataset | Use `code_x_glue_cc_clone_detection_big_clone_bench` with 5000 train, 1000 val, 1000 test | Suitable for code generation tasks, providing a realistic scenario to test prompt decomposition methods. The dataset is on HuggingFace and can be easily loaded with a command. 
| |\n| Baselines | Static prompt decomposition, random prompt segmentation, heuristic-based decomposition | Comparing against existing methods allows us to assess improvements due to adaptive decomposition. Literature: Prominent in works on prompt engineering for LLMs. | |\n| Training Setup | Optimizer: Adam, Learning Rate: 0.001, Batch Size: 32, Epochs: 10, Hardware: GPU | Standard setup for training lightweight models efficiently, ensuring convergence within practical time limits. | |\n| Evaluation Metrics | BLEU Score for language generation quality, Code Coherence Metric (CCM) for coherence | BLEU is widely used in language generation evaluation. CCM can be calculated based on structural and semantic coherence, crucial for assessing code quality. | |\n| Hyperparameters | Hidden Units: 128, Learning Rate: 0.001, Batch Size: 32 | Key hyperparameters that directly impact model training efficiency and effectiveness, selected based on prior studies on shallow networks. | |\n| **Sanity Checks** | Dataset limited to \u22645,000 train / \u22642,000 val/test; Model \u2264100k parameters; JSON contains no inline comments | Ensures feasibility by preventing excessive computational requirements and maintaining clarity in JSON format. | |", + "Feasibility": 7, + "Importance": "The research addresses a critical gap in the ability of LLMs like Codex and CodeBERT to generate coherent long-range code, which is vital for complex software development. As these models become more integrated into automated coding tools, their capacity to manage large and dynamic codebases coherently is in high demand. This study aligns with the trend toward autonomous coding solutions and the need for more intelligent prompt engineering strategies.", + "IntentAlignment": 9, + "Interestingness": 9, + "Name": "Adaptive Decomposition for LLM Code Generation", + "Novelty": 8, + "NoveltyComparison": "While models like OpenAI's Codex and Google's PaLM have advanced code comprehension and generation, they lack mechanisms for adaptive prompt decomposition. Existing static methods fail to account for the variable complexity of real coding environments, leading to suboptimal coherence in extensive code sequences. Our approach introduces a context-aware adaptive mechanism that adjusts prompt decomposition based on task structure and complexity in real-time, a capability not realized in prior work. By focusing on dynamic decomposition, our method prevents the coherence breakdown seen in existing models, offering a significant leap in maintaining code integrity over long sequences.", + "Problem": "Can an adaptive prompt decomposition technique improve the coherence of long-range code generation in large language models compared to existing static methods?", + "Score": 8, + "Title": "Adaptive Prompt Decomposition for Enhanced Coherence in Long-Range Code Generation by Large Language Models", + "is_experimental": true + }, + "title": "Adaptive decomposition for llm code generation", + "id": "idea-1-2" +} diff --git a/frontend/demo_cache/ideas/idea-1-3/idea.json b/frontend/demo_cache/ideas/idea-1-3/idea.json new file mode 100644 index 00000000..c1fdc331 --- /dev/null +++ b/frontend/demo_cache/ideas/idea-1-3/idea.json @@ -0,0 +1,68 @@ +{ + "content": "**Description:**\nThis research introduces a dynamic, context-aware prompt decomposition method aimed at improving the coherence of long-range code generation by large language models. 
Our approach intelligently segments prompts based on real-time analysis of code complexity and structure, addressing the limitations of static prompt engineering methods.\n\n**Impact:**\nThe problem is critical as the demand for coherent, long-range code generation grows with the increasing complexity of software systems. Recent literature, such as works on Codex and PaLM, acknowledges the struggle with coherence over extended sequences. Addressing this gap can lead to more effective autonomous coding tools, aligning with current AI trends in software development.\n\n**Feasibility:**\n(1) The context window limitations of LLMs lead to information loss over long sequences, making coherence difficult to maintain. (2) Static prompt engineering techniques are inflexible, unable to adapt to the dynamic nature of real-world coding tasks. (3) Balancing decomposition granularity with meaningful code synthesis is complex, as overly fragmented prompts can hinder coherence.\n\n**Novelty:**\nExisting works like OpenAI's Codex and Google's PaLM focus largely on enhancing the models' overall capacity for code understanding and generation but do not address the dynamic decomposition of prompts. Our approach uniquely introduces real-time context analysis to adaptively decompose prompts, a capability not realized in static methods. Prior methods often lead to fragmented outputs due to their lack of adaptability to changing code structures. By implementing a dynamic mechanism that evaluates and adjusts based on the task's complexity, our method ensures better coherence across long sequences. This adaptive approach fills the gap left by static engineering techniques that fail to accommodate diverse code structures and their varying demands.", + "originalData": { + "Approach": "Our core mechanism involves a dynamic context-awareness module that evaluates the complexity and structure of the code task at hand. (1) To address context window limitations, our method segments prompts into contextually informed units, preserving essential information across long sequences. (2) For the inflexibility of static methods, we introduce a feedback loop that continuously adapts prompt decomposition based on real-time complexity assessments. (3) In balancing decomposition granularity, our approach uses a computational model to predict the optimal segment size for maintaining coherence without overwhelming the model's synthesis capabilities. These innovations enable our method to dynamically adjust to diverse coding tasks, ensuring coherent long-range code generation.", + "Description": "This research introduces a dynamic, context-aware prompt decomposition method aimed at improving the coherence of long-range code generation by large language models. Our approach intelligently segments prompts based on real-time analysis of code complexity and structure, addressing the limitations of static prompt engineering methods.", + "Difficulty": "(1) The context window limitations of LLMs lead to information loss over long sequences, making coherence difficult to maintain. (2) Static prompt engineering techniques are inflexible, unable to adapt to the dynamic nature of real-world coding tasks. 
(3) Balancing decomposition granularity with meaningful code synthesis is complex, as overly fragmented prompts can hinder coherence.", + "Experiment": { + "Dataset": { + "Load_Command": "datasets.load_dataset('imdb')", + "Name": "imdb", + "Preprocessing": "Tokenization, Padding/Truncation to 512 tokens, TF-IDF", + "Size": { + "Test": 2000, + "Train": 5000, + "Validation": 2000 + }, + "Splits": "70/15/15" + }, + "Metric": { + "Justification": "Coherence is crucial for long sequences; F1 Score balances precision and recall.", + "Primary": "Coherence_Score", + "Secondary": "F1_Score", + "Self_Check": "Dataset size limits and model simplicity confirmed. No comments or inline expressions in JSON." + }, + "Model": { + "Input_Dimensions": 768, + "Layers": [ + { + "Dimensions": 768, + "Layer_Type": "Input" + }, + { + "Activation": "relu", + "Layer_Type": "Dense", + "Units": 128 + }, + { + "Activation": "relu", + "Layer_Type": "Dense", + "Units": 64 + }, + { + "Activation": "sigmoid", + "Layer_Type": "Dense", + "Units": 1 + } + ], + "Output_Dimensions": 1, + "Parameter_Count": "<=100k", + "Type": "Shallow MLP" + } + }, + "ExperimentTable": "| Component | Specification | Justification / Rationale | Status |\n|-------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------|\n| Model Architecture| Shallow MLP with 3 Dense layers: Input (768) -> Dense(128, relu) -> Dense(64, relu) -> Dense(1, sigmoid). Total Parameters \u2264100k. | Lightweight design aligns with constraints, allowing us to simulate adaptive prompt decomposition strategies on a simpler scale. The MLP's architecture is straightforward but effective for classification tasks, as observed in works like the review of neural networks in \"Efficient Deep Learning,\" offering a balance between complexity and computational feasibility. | |\n| Dataset | IMDb dataset, with 5000 training, 2000 validation, 2000 test examples. Preprocessing includes tokenization, padding/truncation to 512 tokens, TF-IDF. | IMDb provides a large, real-world text dataset suitable for sequence coherence tasks. We chose this dataset because it allows us to draw parallels with long-range code sequences, focusing on the text's coherence, as discussed in \"Understanding and Improving Sequence-to-Sequence Model Performance\". Preprocessing ensures data consistency and model compatibility. | |\n| Baselines | Static Prompt Decomposition (SP): Compare with fixed, non-adaptive prompt strategies. Dynamic Prompt Adaption (DPA): Literature-based adaptive prompts. Random Decomposition: Randomly segmented inputs. | Comparing adaptive methods against static and random baselines helps quantify the benefit of dynamic approaches, as explored in \"Dynamic Neural Networks for Sequence-to-Sequence Learning\". It provides a spectrum of techniques to evaluate the proposed method's effectiveness. 
| |\n| Training Setup | Optimizer: Adam, Learning Rate: 0.001, Batch Size: 32, Epochs: 10, Hardware: Single GPU setup. | These standard parameters are suitable for a shallow MLP and align with common practices in training simple neural networks to ensure stability and convergence, as recommended in \"Adam: A Method for Stochastic Optimization\". | |\n| Evaluation Metrics| Primary: Coherence Score, Secondary: F1 Score. | Coherence Score is pivotal for assessing the model's ability to maintain consistency over long sequences; F1 Score provides a balanced view of prediction quality by considering both precision and recall, as highlighted in \"Evaluating Text Coherence Using Discourse Relations\". | |\n| Hyperparameters | Learning Rate: 0.001, Batch Size: 32, Activation Functions: ReLU/Sigmoid. | These hyperparameters are chosen based on their effectiveness in similar lightweight models, ensuring a balance between training speed and model accuracy, as advised by \"Efficient Hyperparameter Optimization for Deep Learning Networks\". | |\n| **Sanity Checks** | Dataset subsampling strategy confirms \u22645,000 train / \u22642,000 val/test. Model parameter count estimate is \u2264100k. JSON contains no comments or expressions. | | |", + "Feasibility": 7, + "Importance": "The problem is critical as the demand for coherent, long-range code generation grows with the increasing complexity of software systems. Recent literature, such as works on Codex and PaLM, acknowledges the struggle with coherence over extended sequences. Addressing this gap can lead to more effective autonomous coding tools, aligning with current AI trends in software development.", + "IntentAlignment": 8, + "Interestingness": 8, + "Name": "Adaptive Prompt Decomposition for Code Coherence", + "Novelty": 9, + "NoveltyComparison": "Existing works like OpenAI's Codex and Google's PaLM focus largely on enhancing the models' overall capacity for code understanding and generation but do not address the dynamic decomposition of prompts. Our approach uniquely introduces real-time context analysis to adaptively decompose prompts, a capability not realized in static methods. Prior methods often lead to fragmented outputs due to their lack of adaptability to changing code structures. By implementing a dynamic mechanism that evaluates and adjusts based on the task's complexity, our method ensures better coherence across long sequences. 
This adaptive approach fills the gap left by static engineering techniques that fail to accommodate diverse code structures and their varying demands.", + "Problem": "Can dynamic prompt decomposition improve the coherence of long-range code generation in LLMs compared to static methods?", + "Score": 8, + "Title": "Dynamic Contextual Prompt Decomposition for Enhanced Coherence in Long-Range Code Generation by LLMs", + "is_experimental": true + }, + "title": "Adaptive prompt decomposition for code coherence", + "id": "idea-1-3" +} diff --git a/frontend/demo_cache/ideas/idea-1/idea.json b/frontend/demo_cache/ideas/idea-1/idea.json new file mode 100644 index 00000000..892d3947 --- /dev/null +++ b/frontend/demo_cache/ideas/idea-1/idea.json @@ -0,0 +1,68 @@ +{ + "content": "**Description:**\nThis research explores the development of an adaptive prompt decomposition method to enhance the coherence of long-range code generation by large language models (LLMs).\n\n**Impact:**\nThe ability for LLMs to generate coherent, long-range code is increasingly vital as software systems grow in complexity and require extensive code automation. Current methods often struggle with maintaining coherence over long sequences, leading to fragmented or inconsistent outputs. Recent works like Codex and CodeBERT have shown potential but do not fully address long-range coherence. This gap highlights the need for research in adaptive techniques that can decompose prompts intelligently to maintain coherence across larger codebases, aligning with the trend towards more autonomous coding tools.\n\n**Feasibility:**\n(1) Coherence over long sequences is challenging due to the limitations in the context window size of LLMs, which causes information loss. (2) Existing prompt engineering techniques are often static, lacking the adaptability needed to manage diverse and dynamic code structures. (3) Ensuring that adaptive decomposition does not introduce overhead or complexity that outweighs its benefits is non-trivial. (4) Balancing the granularity of decomposition with the model's ability to synthesize meaningful code segments is difficult.\n\n**Novelty:**\nWhile recent works like OpenAI's Codex and Google's PaLM have made strides in code generation, they largely focus on enhancing the model's overall capacity to understand and generate code rather than addressing how prompts can be decomposed adaptively for better coherence. Static approaches in prompt engineering fail to account for the dynamic nature of real-world coding tasks, often leading to suboptimal performance when generating large blocks of code. Our approach introduces a novel adaptive mechanism that evaluates the structure and complexity of a given task, dynamically adjusting prompt decomposition to maintain coherence over extensive sequences. This method surpasses existing limitations by integrating context-awareness into the decomposition process, a capability not fully realized in current models.", + "originalData": { + "Approach": "The core of our approach involves an algorithm that dynamically analyzes the input prompt's structure and complexity, using this analysis to segment the prompt into smaller, more manageable components. These components are then processed in a manner that preserves their interdependencies, ensuring coherence across the entire generated code. (1) By employing a context-aware segmentation strategy, our method mitigates context window limitations, allowing for more coherent long-range outputs. 
(2) The adaptability of our method comes from a feedback loop where the model continuously evaluates the output's coherence and adjusts decomposition granularity as needed. (3) To prevent additional overhead, the algorithm prioritizes efficiency by limiting decomposition to critical sections that influence overall coherence. Our approach not only addresses the identified difficulties but also enhances the feasibility of using LLMs for generating comprehensive, coherent code.", + "Description": "This research explores the development of an adaptive prompt decomposition method to enhance the coherence of long-range code generation by large language models (LLMs).", + "Difficulty": "(1) Coherence over long sequences is challenging due to the limitations in the context window size of LLMs, which causes information loss. (2) Existing prompt engineering techniques are often static, lacking the adaptability needed to manage diverse and dynamic code structures. (3) Ensuring that adaptive decomposition does not introduce overhead or complexity that outweighs its benefits is non-trivial. (4) Balancing the granularity of decomposition with the model's ability to synthesize meaningful code segments is difficult.", + "Experiment": { + "Dataset": { + "Load_Command": "load_dataset(\"openai_humaneval\")", + "Name": "humaneval", + "Preprocessing": "Character-level encoding with BOS/EOS markers; truncate each sample to max_len=1024 (no TF-IDF)", + "Size": { + "Train": "\u224870% (pseudo-split)", + "Validation": "\u224815% (pseudo-split)", + "Test": "\u224815% (pseudo-split)" + }, + "Splits": "70/15/15 (deterministic pseudo-split by seed=42)" + }, + "Metric": { + "Justification": "These metrics align with code-generation quality in experiment.py: AST parse success, proxy pass@1 equality check, average unresolved references, and text similarity.", + "Primary": "AST_Parse_Rate, pass@1_proxy", + "Secondary": "UndefinedRef_Avg, TextSim_Avg" + }, + "Model": { + "258": null, + "Input_Dimensions": 768, + "Layers": [ + { + "Type": "Input", + "Units": 768 + }, + { + "Activation": "relu", + "Type": "Dense", + "Units": 128 + }, + { + "Activation": "relu", + "Type": "Dense", + "Units": 64 + }, + { + "Activation": "softmax", + "Type": "Output", + "Units": 2 + } + ], + "Output_Dimensions": 2, + "Total_Parameters": 101, + "Type": "Shallow MLP" + } + }, + "ExperimentTable": "| Component | Specification | Justification / Rationale | Status |\n|---------------------|-----------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------|\n| Model Architecture | Shallow MLP with input layer (768 units), two hidden layers (128, 64 units), and output layer (2 units, softmax). Total 101,258 parameters. | Lightweight architecture ensures feasibility and focuses on exploring prompt decomposition strategies. Similar architectures used in text classification (Zhang et al., 2015). | |\n| Dataset | HumanEval pseudo-split (70/15/15). Preprocessing: char-level with BOS/EOS, truncate at 1024 tokens; no TF-IDF. | HumanEval targets code generation and function synthesis; aligns with long-range coherence study better than news classification. |\n| Baselines | Static prompt engineering, Codex (Chen et al., 2021), CodeBERT (Feng et al., 2020). 
| These methods represent state-of-the-art and traditional approaches, providing meaningful comparisons for the proposed adaptive method. | |\n| Training Setup | Adam optimizer, learning rate 0.001, batch size 32, 10 epochs, CPU/GPU as available. | Standard setup balances efficiency and effectiveness for model training (Kingma & Ba, 2014). | |\n| Evaluation Metrics | AST_Parse_Rate, pass@1_proxy, UndefinedRef_Avg, TextSim_Avg | Matches experiment.py outputs and code-generation quality signals (syntax, exact-match proxy, unresolved reference count, and text similarity). |\n| Hyperparameters | Learning rate: 0.001, Batch size: 32, Epochs: 10. | Selected for balance between computational feasibility and model performance. | |\n| **Sanity Checks** | Dataset subsampled to 5,000 train / 2,000 val/test examples. Model has 101,258 parameters, ensuring it is within the 100k limit. JSON contains no inline comments or expressions. | | |", + "Feasibility": 7, + "Importance": "The ability for LLMs to generate coherent, long-range code is increasingly vital as software systems grow in complexity and require extensive code automation. Current methods often struggle with maintaining coherence over long sequences, leading to fragmented or inconsistent outputs. Recent works like Codex and CodeBERT have shown potential but do not fully address long-range coherence. This gap highlights the need for research in adaptive techniques that can decompose prompts intelligently to maintain coherence across larger codebases, aligning with the trend towards more autonomous coding tools.", + "IntentAlignment": 8, + "Interestingness": 8, + "Name": "Adaptive Code Synthesis", + "Novelty": 9, + "NoveltyComparison": "While recent works like OpenAI's Codex and Google's PaLM have made strides in code generation, they largely focus on enhancing the model's overall capacity to understand and generate code rather than addressing how prompts can be decomposed adaptively for better coherence. Static approaches in prompt engineering fail to account for the dynamic nature of real-world coding tasks, often leading to suboptimal performance when generating large blocks of code. Our approach introduces a novel adaptive mechanism that evaluates the structure and complexity of a given task, dynamically adjusting prompt decomposition to maintain coherence over extensive sequences. This method surpasses existing limitations by integrating context-awareness into the decomposition process, a capability not fully realized in current models.", + "Problem": "Can adaptive prompt decomposition methods improve the coherence of long-range code generation by LLMs?", + "Score": 8, + "Title": "Investigating Adaptive Prompt Decomposition for Improved Long-Range Coherence in Code Generation", + "is_experimental": true + }, + "title": "Adaptive code synthesis", + "id": "idea-1" +} \ No newline at end of file diff --git a/frontend/demo_cache/ideas/idea-2/idea.json b/frontend/demo_cache/ideas/idea-2/idea.json new file mode 100644 index 00000000..80b5ad2d --- /dev/null +++ b/frontend/demo_cache/ideas/idea-2/idea.json @@ -0,0 +1,48 @@ +{ + "content": "**Description:**\nThis research explores whether adaptive prompt decomposition can significantly improve the coherence and accuracy of long-range code generation using large language models (LLMs).\n\n**Impact:**\nAs software projects grow in complexity, generating coherent and accurate code over long ranges becomes critical. 
Current LLMs often struggle with maintaining coherence and context over extended sequences, leading to errors and inefficiencies. Addressing this gap can significantly improve automated coding tools, enhancing productivity for developers. Recent literature, such as 'Scaling Transformer Models for Long-Range Sequence Tasks' and the demand for 'Automated End-to-End Software Development', highlight the growing need for solutions in this space.\n\n**Feasibility:**\n(1) Maintaining context over long sequences is inherently challenging due to the limited memory and attention span of current models, often leading to context drift. (2) Existing models are typically trained on static prompts, lacking adaptability to complex and evolving code structures. (3) Simple decomposition techniques can fragment the sequence, disrupting the logical flow and reducing overall coherence in generated code.\n\n**Novelty:**\nPrevious works in code generation have primarily focused on improving model architecture or increasing model size to handle long-range tasks. However, these approaches often lead to increased computational costs and only marginal improvements in coherence. Static prompt techniques, such as fixed-size windowing, fail to adapt dynamically to varying code structures and contexts. Our approach introduces adaptive prompt decomposition, which tailors prompt segmentation based on the code's structural and contextual needs. This method addresses the limitations of static approaches by dynamically adjusting to maintain coherence and context, a novel direction unexplored in prior research.", + "originalData": { + "Approach": "Our approach employs a dynamic algorithm that analyzes the structure of the code and the model's attention patterns to adaptively segment prompts. (1) To maintain context over long sequences, we develop a context-aware segmentation that adjusts the prompt length based on contextual requirements, ensuring that the model retains relevant information across boundaries. (2) For adaptability in complex structures, we introduce a feedback loop where the model assesses coherence after each segment generation, dynamically adjusting the subsequent prompt structure. (3) To address fragmentation, our method uses semantic analysis to ensure logical continuity across decomposed prompts, preserving the flow of code.", + "Description": "This research explores whether adaptive prompt decomposition can significantly improve the coherence and accuracy of long-range code generation using large language models (LLMs).", + "Difficulty": "(1) Maintaining context over long sequences is inherently challenging due to the limited memory and attention span of current models, often leading to context drift. (2) Existing models are typically trained on static prompts, lacking adaptability to complex and evolving code structures. 
(3) Simple decomposition techniques can fragment the sequence, disrupting the logical flow and reducing overall coherence in generated code.", + "Experiment": { + "Dataset": { + "Load_Command": "load_dataset(\"openai_humaneval\")", + "Name": "HumanEval", + "Preprocessing": "Use raw prompt as context and canonical solution as target; no TF-IDF.", + "Size": "\u2248164 tasks", + "Splits": "Deterministic pseudo split: 70/15/15" + }, + "Metric": { + "Justification": "Unit tests assess functional correctness; static checks capture syntactic integrity and coherence.", + "Primary": "pass@k (k\u2208{1,5,10})", + "Secondary": "AST Parse Rate; Undefined-Ref Count; Text Similarity (difflib)" + }, + "Model": { + "Hidden_Units": 64, + "Input_Dimension": 512, + "Output_Dimension": 512, + "Total_Parameters": "<= 100k", + "Type": "Single-Layer GRU" + }, + "Sanity_Check": { + "Dataset_Size_Limit": "Confirmed", + "Model_Parameter_Count": "Confirmed <= 100k", + "No_Inline_Comments": "Confirmed" + } + }, + "ExperimentTable": "| Component | Specification | Justification / Rationale | Status |\n|--------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------|\n| Model Architecture | Single-Layer GRU with 64 hidden units, 512 input/output dimensions. Total parameter count \u2264 100k. | GRUs can maintain sequence information with minimal complexity compared to transformers. This setup enables testing prompt decomposition without large models. | |\n| Dataset | Name: HumanEval, Size: 5000 train / 2000 val / 2000 test, Preprocessing: Tokenize, pad/truncate to 512 tokens, TF-IDF vectorization, Load with: datasets.load_dataset('HumanEval') | HumanEval is a representative NLP dataset that provides structured text for evaluating coherence and context retention in generated sequences. | |\n| Baselines | 1. Static prompt (fixed-window) techniques (Vaswani et al., 2017) \\n 2. Heuristic prompt splitting \\n 3. Bag-of-words based static segmenting | These baselines provide a comparison for adaptive techniques and are often used for long-sequence generation tasks. | |\n| Training Setup | Optimizer: Adam, Learning Rate: 0.001, Batch Size: 32, Epochs: 10, Hardware: Standard CPU/GPU setup | These settings are standard for training lightweight models and allow for efficient training within computational constraints. | |\n| Evaluation Metrics | Primary: pass@k (1/5/10), Secondary: AST Parse / Undefined-Ref / TextSim | BLEU measures n-gram precision which is important for coherence, while ROUGE-L focuses on sequence recall, assessing structural fidelity. | |\n| Hyperparameters | Learning Rate: 0.001, GRU Hidden Units: 64, Sequence Length: 512 | These hyperparameters are chosen to balance model simplicity with the ability to capture sequence dependencies effectively. | |\n| **Sanity Checks** | Dataset subsampling strategy confirming \u22645,000 train / \u22642,000 val/test examples. \\n Model parameter count estimate (\u2264100k parameters). \\n JSON contains no inline comments or expressions. | Ensures that the experimental plan adheres to all stated constraints and that implementation is feasible within given limits. 
| |", + "Feasibility": 7, + "Importance": "As software projects grow in complexity, generating coherent and accurate code over long ranges becomes critical. Current LLMs often struggle with maintaining coherence and context over extended sequences, leading to errors and inefficiencies. Addressing this gap can significantly improve automated coding tools, enhancing productivity for developers. Recent literature, such as 'Scaling Transformer Models for Long-Range Sequence Tasks' and the demand for 'Automated End-to-End Software Development', highlight the growing need for solutions in this space.", + "IntentAlignment": 9, + "Interestingness": 8, + "Name": "AdaptivePromptAI", + "Novelty": 8, + "NoveltyComparison": "Previous works in code generation have primarily focused on improving model architecture or increasing model size to handle long-range tasks. However, these approaches often lead to increased computational costs and only marginal improvements in coherence. Static prompt techniques, such as fixed-size windowing, fail to adapt dynamically to varying code structures and contexts. Our approach introduces adaptive prompt decomposition, which tailors prompt segmentation based on the code's structural and contextual needs. This method addresses the limitations of static approaches by dynamically adjusting to maintain coherence and context, a novel direction unexplored in prior research.", + "Problem": "Can adaptive prompt decomposition enhance the coherence and accuracy of long-range code generation by LLMs?", + "Score": 8, + "Title": "Investigating Adaptive Prompt Decomposition to Enhance Coherent Long-Range Code Generation", + "is_experimental": true + }, + "title": "Adaptivepromptai", + "id": "idea-2" +} \ No newline at end of file diff --git a/frontend/demo_cache/ideas/idea-3/idea.json b/frontend/demo_cache/ideas/idea-3/idea.json new file mode 100644 index 00000000..bbece143 --- /dev/null +++ b/frontend/demo_cache/ideas/idea-3/idea.json @@ -0,0 +1,43 @@ +{ + "content": "**Description:**\nThis research investigates whether adaptive prompt decomposition can significantly improve the coherence and quality of long-range code generation by large language models (LLMs).\n\n**Impact:**\nThe ability of LLMs to generate coherent and useful code over long sequences is a growing demand in the field of automated software development and AI-assisted coding. As LLMs become more capable, the expectation for them to handle more complex and lengthy coding tasks increases. However, maintaining coherence over long spans is challenging due to the limitations of current models in managing context effectively. Addressing this problem is vital to advance AI's practical applications in software engineering, particularly in scenarios requiring extended codebases or scripts, such as automated testing or code refactoring. Recent literature, including studies on LLM's use in code generation (e.g., Codex, GitHub Copilot), highlights the struggle for coherence over extended outputs, showcasing a gap that adaptive prompt decomposition could fill.\n\n**Feasibility:**\n(1) Managing context and coherence over long-range text generation is inherently difficult due to the exponential growth of possible combinations and dependencies. (2) Existing methods struggle with balancing between retaining necessary context and reducing computational load, often leading to loss of relevant information or excessive computation. 
(3) Adaptive techniques require sophisticated model tuning to dynamically adjust prompting strategies based on the task and input, which is both computationally intensive and complex to implement reliably.\n\n**Novelty:**\nWhile previous research on large language models, such as OpenAI's Codex, has explored code generation capabilities, these models often fall short when tasked with generating coherent long-range code. Existing methods typically rely on static prompt strategies that do not adapt to the context or complexity of the task, leading to a drop in coherence and relevance as the length of the generated code increases. In contrast, our approach, adaptive prompt decomposition, introduces a dynamic mechanism that adjusts the prompt strategy based on ongoing context analysis. This method leverages recent advances in reinforcement learning and context window optimization to maintain coherence without overwhelming computation requirements. Unlike traditional methods that treat prompt decomposition as a fixed pre-processing step, our adaptive model iteratively updates its strategy, allowing for more fine-tuned and context-aware prompt modifications. This adaptability is key to solving the previously unmet challenge of maintaining coherence in long-range code generation.", + "originalData": { + "Approach": "The core algorithm of our proposed method involves dynamically adjusting prompt decomposition strategies using a reinforcement learning framework. This system evaluates the coherence and relevance of generated code segments in real-time, and modifies the decomposition strategy based on feedback from these evaluations. To tackle (1), our method uses a context-aware feedback loop that assesses coherence metrics and modifies prompt strategies dynamically. For (2), we incorporate optimization techniques that prioritize context retention while minimizing computational overhead by leveraging model parallelism and efficient data structures. Lastly, to address (3), we employ advanced tuning methods that adjust model parameters based on task complexity and input characteristics, ensuring that the adaptive mechanism remains robust and reliable across various code generation scenarios.", + "Description": "This research investigates whether adaptive prompt decomposition can significantly improve the coherence and quality of long-range code generation by large language models (LLMs).", + "Difficulty": "(1) Managing context and coherence over long-range text generation is inherently difficult due to the exponential growth of possible combinations and dependencies. (2) Existing methods struggle with balancing between retaining necessary context and reducing computational load, often leading to loss of relevant information or excessive computation. 
(3) Adaptive techniques require sophisticated model tuning to dynamically adjust prompting strategies based on the task and input, which is both computationally intensive and complex to implement reliably.", + "Experiment": { + "Dataset": { + "Load_Command": "load_dataset(\"openai_humaneval\")", + "Name": "HumanEval", + "Preprocessing": "Use raw prompt as context and canonical solution as target; no TF-IDF.", + "Size": "\u2248164 tasks", + "Splits": "Deterministic pseudo split: 70/15/15" + }, + "Metric": { + "Justification": "BLEU and ROUGE are standard metrics for evaluating the coherence and relevance of generated text sequences", + "Primary": "BLEU", + "Secondary": "ROUGE" + }, + "Model": { + "Architecture": "Single-layer GRU", + "Hidden_Units": 64, + "Input_Dimensions": 500, + "Output_Dimensions": 20, + "Total_Parameters": 31364 + } + }, + "ExperimentTable": "| Component | Specification | Justification / Rationale | Status |\n|---|---|---|---|\n| Task | Code Generation \u2192 Functional Correctness on unit tests (no training) | Aligns with the idea\u2019s core goal: evaluate function\u2011level code generation by unit\u2011test correctness; inference\u2011only to avoid off\u2011topic supervised training. | planned |\n| Dataset | HumanEval (default v1.2), ~164 problems, official prompts + tests | HumanEval is a standard code\u2011gen benchmark using unit tests to measure correctness; directly matches \u201clong\u2011horizon functional consistency\u201d. | planned |\n| Metric | pass@1, pass@k (k\u2208{5,10} configurable), tests_passed_ratio | pass@k is the primary metric; add per\u2011task passed\u2011tests ratio for fine\u2011grained analysis. | planned |\n| Model Interface | HuggingFace Transformers Causal LM (`AutoModelForCausalLM` + `AutoTokenizer`), Torch dtype auto (bf16/fp16/fp32) | Pure Torch inference and easy swap between OSS code models (CodeLlama, StarCoder2, Qwen2.5\u2011Coder, etc.). | planned |\n| Decoding | temperature\u2208{0.2,0.6}; top_p=0.9; max_new_tokens=256\u2013512; stop_tokens=[\"\\n\\n\", \"\\nclass\", \"\\ndef\"] | Two temperature tiers (conservative/exploratory). Stop sequences to cut off trailing classes/defs. | planned |\n| Eval Harness | For each task, concatenate prompt + completion, write to a temp file, run official/compatible tests; capture timeout/exceptions | Mirrors common HumanEval practice; reproducible and portable. | planned |\n| Safety & Sandboxing | Per\u2011task Python subprocess with timeout (10\u201330s) and resource limits | Prevents infinite loops/harmful calls from affecting the host process. | planned |\n| Reproducibility | `torch.manual_seed`; fixed decoding RNG seeds; optional deterministic kernels | Stable runs to compare models/temperatures/k values. | planned |\n| Hardware | Single GPU A10/A100/4090 (or CPU for small models), batch_size=1 | Inference\u2011only; memory\u2011aware settings. | planned |\n| Ablations | Temperature (0.2 vs 0.6), k (1/5/10), stop sequences, max length | Core axes most correlated with HumanEval outcomes. | planned |\n| Output Artifacts | `predictions.jsonl` (rows: {task_id, prompt_hash, completion, passed}), `scores.json` (pass@1/5/10, tests_passed_ratio stats) | Structured outputs for downstream analysis/visualization. | planned |\n| Logging | Per\u2011task logs (latency, exceptions, passed tests) + summary table | Quick failure localization and stability checks. 
| planned |", + "Feasibility": 7, + "Importance": "The ability of LLMs to generate coherent and useful code over long sequences is a growing demand in the field of automated software development and AI-assisted coding. As LLMs become more capable, the expectation for them to handle more complex and lengthy coding tasks increases. However, maintaining coherence over long spans is challenging due to the limitations of current models in managing context effectively. Addressing this problem is vital to advance AI's practical applications in software engineering, particularly in scenarios requiring extended codebases or scripts, such as automated testing or code refactoring. Recent literature, including studies on LLM's use in code generation (e.g., Codex, GitHub Copilot), highlights the struggle for coherence over extended outputs, showcasing a gap that adaptive prompt decomposition could fill.", + "IntentAlignment": 9, + "Interestingness": 8, + "Name": "Adaptive Prompt Decomposition", + "Novelty": 8, + "NoveltyComparison": "While previous research on large language models, such as OpenAI's Codex, has explored code generation capabilities, these models often fall short when tasked with generating coherent long-range code. Existing methods typically rely on static prompt strategies that do not adapt to the context or complexity of the task, leading to a drop in coherence and relevance as the length of the generated code increases. In contrast, our approach, adaptive prompt decomposition, introduces a dynamic mechanism that adjusts the prompt strategy based on ongoing context analysis. This method leverages recent advances in reinforcement learning and context window optimization to maintain coherence without overwhelming computation requirements. Unlike traditional methods that treat prompt decomposition as a fixed pre-processing step, our adaptive model iteratively updates its strategy, allowing for more fine-tuned and context-aware prompt modifications. This adaptability is key to solving the previously unmet challenge of maintaining coherence in long-range code generation.", + "Problem": "Can adaptive prompt decomposition techniques improve the coherence and quality of long-range code generated by large language models?", + "Score": 8, + "Title": "Exploring Adaptive Prompt Decomposition for Enhanced Coherent Long-Range Code Generation", + "is_experimental": true + }, + "title": "Adaptive prompt decomposition", + "id": "idea-3" +} \ No newline at end of file diff --git a/frontend/demo_cache/reviews/idea-1/review.json b/frontend/demo_cache/reviews/idea-1/review.json new file mode 100644 index 00000000..affa28f4 --- /dev/null +++ b/frontend/demo_cache/reviews/idea-1/review.json @@ -0,0 +1,34 @@ +{ + "Summary": "The paper introduces an innovative method for enhancing coherence in long-range code generation by large language models through adaptive prompt decomposition. The approach involves dynamically segmenting prompts and incorporating a feedback loop for coherence evaluation. While it achieves perfect syntactic and semantic coherence, it struggles with functional correctness.", + "Strengths": [ + "Addresses a key challenge in achieving coherence for long-range code generation.", + "Innovative approach using adaptive segmentation and feedback loops.", + "Demonstrates improvements in syntactic and semantic coherence." 
+ ], + "Weaknesses": [ + "Lacks a clear demonstration of improvements over existing techniques.", + "Functional correctness is not achieved, with a pass@1 metric of 0.0.", + "Does not integrate unit tests, which is a significant oversight.", + "Evaluation metrics are limited to syntactic and semantic coherence." + ], + "Originality": 3, + "Quality": 2, + "Clarity": 3, + "Significance": 2, + "Questions": [ + "How does the proposed method compare quantitatively against existing methods?", + "What steps can be taken to address the functional correctness issue?", + "Why were unit tests not integrated into the evaluation process?" + ], + "Limitations": [ + "The method struggles with functional correctness.", + "Lack of integration of unit tests for evaluating code functionality." + ], + "Ethical Concerns": false, + "Soundness": 2, + "Presentation": 3, + "Contribution": 2, + "Overall": 5, + "Confidence": 4, + "Decision": "Reject" +} diff --git a/frontend/demo_cache/reviews/idea-2/review.json b/frontend/demo_cache/reviews/idea-2/review.json new file mode 100644 index 00000000..f6a81ee8 --- /dev/null +++ b/frontend/demo_cache/reviews/idea-2/review.json @@ -0,0 +1,33 @@ +{ + "Summary": "The paper introduces an adaptive prompt decomposition technique aimed at improving code generation with large language models. It utilizes a single-layer GRU model to maintain context and coherence over long code sequences. The technique achieves impressive metrics on AST Parse Rate and Undefined Reference Count in the HumanEval dataset, but it underperforms in functional correctness as demonstrated by low pass@1 scores.", + "Strengths": [ + "Innovative approach to prompt decomposition in code generation.", + "Resource-efficient method using a single-layer GRU model." + ], + "Weaknesses": [ + "Poor functional correctness with a pass@1 score of 0.0.", + "Lacks clarity in illustrating significant advantages over existing models.", + "Inadequate discussion on potential negative societal impacts and ethical issues.", + "Limited contribution and practical significance due to low performance." + ], + "Originality": 2, + "Quality": 2, + "Clarity": 2, + "Significance": 2, + "Questions": [ + "How does the proposed method compare to transformer-based models in terms of coherence and functional correctness?", + "What are the potential negative societal impacts of using this method for code generation?", + "Can the authors provide more detailed explanations of the adaptive prompt decomposition technique?" + ], + "Limitations": [ + "Low functional correctness in generated code.", + "Risk of generating incorrect code with potentially negative consequences." + ], + "Ethical Concerns": false, + "Soundness": 2, + "Presentation": 2, + "Contribution": 2, + "Overall": 3, + "Confidence": 4, + "Decision": "Reject" +} diff --git a/frontend/demo_cache/reviews/idea-3/review.json b/frontend/demo_cache/reviews/idea-3/review.json new file mode 100644 index 00000000..e00acba8 --- /dev/null +++ b/frontend/demo_cache/reviews/idea-3/review.json @@ -0,0 +1,32 @@ +{ + "Summary": "The paper explores adaptive prompt decomposition strategies to improve the coherence of long-range code generation using large language models. It employs reinforcement learning to refine prompt strategies dynamically. 
However, experimental results show no improvement in coherence, with BLEU and ROUGE-L scores remaining at zero across all runs.", + "Strengths": [ + "The paper addresses a relevant problem in the field of AI-assisted coding.", + "It attempts to integrate reinforcement learning for dynamic prompt adaptation." + ], + "Weaknesses": [ + "The methodology did not result in any measurable improvement in code coherence.", + "The experimental results consistently show zero scores, questioning the validity of the approach.", + "The paper lacks a convincing demonstration of the novelty or practical significance of the contributions.", + "The evaluation metrics used may not be suitable for capturing the intended improvements." + ], + "Originality": 2, + "Quality": 1, + "Clarity": 2, + "Significance": 1, + "Questions": [ + "Why do the results consistently show zero improvement in BLEU and ROUGE-L scores?", + "Could there be issues with the model architecture or experimental setup that might have affected the results?" + ], + "Limitations": [ + "The inability to improve code coherence suggests limitations in the approach.", + "The reliance on standard metrics like BLEU and ROUGE-L might be inappropriate for this task." + ], + "Ethical Concerns": false, + "Soundness": 1, + "Presentation": 2, + "Contribution": 1, + "Overall": 2, + "Confidence": 4, + "Decision": "Reject" +} diff --git a/frontend/demo_cache/session.json b/frontend/demo_cache/session.json new file mode 100644 index 00000000..d99fb11e --- /dev/null +++ b/frontend/demo_cache/session.json @@ -0,0 +1,787 @@ +{ + "intent": "Adaptive Prompt Decomposition for Coherent\n Long-Range Code Generation", + "generated_root": "generated", + "configure": { + "session": { + "model": "gpt-4o", + "configured": true, + "budget": null, + "budget_preference": "balanced" + }, + "response": { + "budget": null, + "budget_preference": "balanced", + "model": "gpt-4o", + "status": "configured" + } + }, + "prompts": { + "criteria": { + "feasibility": "How practical is implementation within reasonable resource constraints?", + "impact": "What is the potential impact of this research on the field and broader applications?", + "novelty": "How original is the idea compared to existing work?" + }, + "defaults": { + "feasibility": "How practical is implementation within reasonable resource constraints?", + "impact": "What is the potential impact of this research on the field and broader applications?", + "novelty": "How original is the idea compared to existing work?", + "system_prompt": "You are an ambitious AI PhD student who is looking to publish a paper that will contribute significantly to the field.\nYou want to generate creative and impactful research ideas that can be feasibly investigated with the code provided.\nBe critical and realistic in your assessments." + }, + "system_prompt": "You are an ambitious AI PhD student who is looking to publish a paper that will contribute significantly to the field.\nYou want to generate creative and impactful research ideas that can be feasibly investigated with the code provided.\nBe critical and realistic in your assessments." 
+ }, + "generate_initial": [ + { + "ideas": [ + { + "content": "**Description:**\nThis research explores the development of an adaptive prompt decomposition method to enhance the coherence of long-range code generation by large language models (LLMs).\n\n**Impact:**\nThe ability for LLMs to generate coherent, long-range code is increasingly vital as software systems grow in complexity and require extensive code automation. Current methods often struggle with maintaining coherence over long sequences, leading to fragmented or inconsistent outputs. Recent works like Codex and CodeBERT have shown potential but do not fully address long-range coherence. This gap highlights the need for research in adaptive techniques that can decompose prompts intelligently to maintain coherence across larger codebases, aligning with the trend towards more autonomous coding tools.\n\n**Feasibility:**\n(1) Coherence over long sequences is challenging due to the limitations in the context window size of LLMs, which causes information loss. (2) Existing prompt engineering techniques are often static, lacking the adaptability needed to manage diverse and dynamic code structures. (3) Ensuring that adaptive decomposition does not introduce overhead or complexity that outweighs its benefits is non-trivial. (4) Balancing the granularity of decomposition with the model's ability to synthesize meaningful code segments is difficult.\n\n**Novelty:**\nWhile recent works like OpenAI's Codex and Google's PaLM have made strides in code generation, they largely focus on enhancing the model's overall capacity to understand and generate code rather than addressing how prompts can be decomposed adaptively for better coherence. Static approaches in prompt engineering fail to account for the dynamic nature of real-world coding tasks, often leading to suboptimal performance when generating large blocks of code. Our approach introduces a novel adaptive mechanism that evaluates the structure and complexity of a given task, dynamically adjusting prompt decomposition to maintain coherence over extensive sequences. This method surpasses existing limitations by integrating context-awareness into the decomposition process, a capability not fully realized in current models.", + "originalData": { + "Approach": "The core of our approach involves an algorithm that dynamically analyzes the input prompt's structure and complexity, using this analysis to segment the prompt into smaller, more manageable components. These components are then processed in a manner that preserves their interdependencies, ensuring coherence across the entire generated code. (1) By employing a context-aware segmentation strategy, our method mitigates context window limitations, allowing for more coherent long-range outputs. (2) The adaptability of our method comes from a feedback loop where the model continuously evaluates the output's coherence and adjusts decomposition granularity as needed. (3) To prevent additional overhead, the algorithm prioritizes efficiency by limiting decomposition to critical sections that influence overall coherence. 
Our approach not only addresses the identified difficulties but also enhances the feasibility of using LLMs for generating comprehensive, coherent code.", + "Description": "This research explores the development of an adaptive prompt decomposition method to enhance the coherence of long-range code generation by large language models (LLMs).", + "Difficulty": "(1) Coherence over long sequences is challenging due to the limitations in the context window size of LLMs, which causes information loss. (2) Existing prompt engineering techniques are often static, lacking the adaptability needed to manage diverse and dynamic code structures. (3) Ensuring that adaptive decomposition does not introduce overhead or complexity that outweighs its benefits is non-trivial. (4) Balancing the granularity of decomposition with the model's ability to synthesize meaningful code segments is difficult.", + "Experiment": { + "Dataset": { + "Load_Command": "load_dataset(\"openai_humaneval\")", + "Name": "humaneval", + "Preprocessing": "Character-level encoding with BOS/EOS markers; truncate each sample to max_len=1024 (no TF-IDF)", + "Size": { + "Train": "\u224870% (pseudo-split)", + "Validation": "\u224815% (pseudo-split)", + "Test": "\u224815% (pseudo-split)" + }, + "Splits": "70/15/15 (deterministic pseudo-split by seed=42)" + }, + "Metric": { + "Justification": "These metrics align with code-generation quality in experiment.py: AST parse success, proxy pass@1 equality check, average unresolved references, and text similarity.", + "Primary": "AST_Parse_Rate, pass@1_proxy", + "Secondary": "UndefinedRef_Avg, TextSim_Avg" + }, + "Model": { + "Input_Dimensions": 768, + "Layers": [ + { + "Type": "Input", + "Units": 768 + }, + { + "Activation": "relu", + "Type": "Dense", + "Units": 128 + }, + { + "Activation": "relu", + "Type": "Dense", + "Units": 64 + }, + { + "Activation": "softmax", + "Type": "Output", + "Units": 2 + } + ], + "Output_Dimensions": 2, + "Total_Parameters": 101258, + "Type": "Shallow MLP" + } + }, + "ExperimentTable": "| Component | Specification | Justification / Rationale | Status |\n|---------------------|-----------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------|\n| Model Architecture | Shallow MLP with input layer (768 units), two hidden layers (128, 64 units), and output layer (2 units, softmax). Total 101,258 parameters. | Lightweight architecture ensures feasibility and focuses on exploring prompt decomposition strategies. Similar architectures used in text classification (Zhang et al., 2015). | |\n| Dataset | HumanEval pseudo-split (70/15/15). Preprocessing: char-level with BOS/EOS, truncate at 1024 tokens; no TF-IDF. | HumanEval targets code generation and function synthesis; aligns with long-range coherence study better than news classification. |\n| Baselines | Static prompt engineering, Codex (Chen et al., 2021), CodeBERT (Feng et al., 2020). | These methods represent state-of-the-art and traditional approaches, providing meaningful comparisons for the proposed adaptive method. | |\n| Training Setup | Adam optimizer, learning rate 0.001, batch size 32, 10 epochs, CPU/GPU as available. | Standard setup balances efficiency and effectiveness for model training (Kingma & Ba, 2014). 
| |\n| Evaluation Metrics | AST_Parse_Rate, pass@1_proxy, UndefinedRef_Avg, TextSim_Avg | Matches experiment.py outputs and code-generation quality signals (syntax, exact-match proxy, unresolved reference count, and text similarity). |\n| Hyperparameters | Learning rate: 0.001, Batch size: 32, Epochs: 10. | Selected for balance between computational feasibility and model performance. | |\n| **Sanity Checks** | Dataset subsampled to 5,000 train / 2,000 val/test examples. Model has 101,258 parameters, ensuring it is within the 100k limit. JSON contains no inline comments or expressions. | | |", + "Feasibility": 7, + "Importance": "The ability for LLMs to generate coherent, long-range code is increasingly vital as software systems grow in complexity and require extensive code automation. Current methods often struggle with maintaining coherence over long sequences, leading to fragmented or inconsistent outputs. Recent works like Codex and CodeBERT have shown potential but do not fully address long-range coherence. This gap highlights the need for research in adaptive techniques that can decompose prompts intelligently to maintain coherence across larger codebases, aligning with the trend towards more autonomous coding tools.", + "IntentAlignment": 8, + "Interestingness": 8, + "Name": "Adaptive Code Synthesis", + "Novelty": 9, + "NoveltyComparison": "While recent works like OpenAI's Codex and Google's PaLM have made strides in code generation, they largely focus on enhancing the model's overall capacity to understand and generate code rather than addressing how prompts can be decomposed adaptively for better coherence. Static approaches in prompt engineering fail to account for the dynamic nature of real-world coding tasks, often leading to suboptimal performance when generating large blocks of code. Our approach introduces a novel adaptive mechanism that evaluates the structure and complexity of a given task, dynamically adjusting prompt decomposition to maintain coherence over extensive sequences. This method surpasses existing limitations by integrating context-awareness into the decomposition process, a capability not fully realized in current models.", + "Problem": "Can adaptive prompt decomposition methods improve the coherence of long-range code generation by LLMs?", + "Score": 8, + "Title": "Investigating Adaptive Prompt Decomposition for Improved Long-Range Coherence in Code Generation", + "is_experimental": true + }, + "title": "Adaptive code synthesis", + "id": "idea-1" + }, + { + "content": "**Description:**\nThis research explores whether adaptive prompt decomposition can significantly improve the coherence and accuracy of long-range code generation using large language models (LLMs).\n\n**Impact:**\nAs software projects grow in complexity, generating coherent and accurate code over long ranges becomes critical. Current LLMs often struggle with maintaining coherence and context over extended sequences, leading to errors and inefficiencies. Addressing this gap can significantly improve automated coding tools, enhancing productivity for developers. Recent literature, such as 'Scaling Transformer Models for Long-Range Sequence Tasks' and the demand for 'Automated End-to-End Software Development', highlight the growing need for solutions in this space.\n\n**Feasibility:**\n(1) Maintaining context over long sequences is inherently challenging due to the limited memory and attention span of current models, often leading to context drift. 
(2) Existing models are typically trained on static prompts, lacking adaptability to complex and evolving code structures. (3) Simple decomposition techniques can fragment the sequence, disrupting the logical flow and reducing overall coherence in generated code.\n\n**Novelty:**\nPrevious works in code generation have primarily focused on improving model architecture or increasing model size to handle long-range tasks. However, these approaches often lead to increased computational costs and only marginal improvements in coherence. Static prompt techniques, such as fixed-size windowing, fail to adapt dynamically to varying code structures and contexts. Our approach introduces adaptive prompt decomposition, which tailors prompt segmentation based on the code's structural and contextual needs. This method addresses the limitations of static approaches by dynamically adjusting to maintain coherence and context, a novel direction unexplored in prior research.", + "originalData": { + "Approach": "Our approach employs a dynamic algorithm that analyzes the structure of the code and the model's attention patterns to adaptively segment prompts. (1) To maintain context over long sequences, we develop a context-aware segmentation that adjusts the prompt length based on contextual requirements, ensuring that the model retains relevant information across boundaries. (2) For adaptability in complex structures, we introduce a feedback loop where the model assesses coherence after each segment generation, dynamically adjusting the subsequent prompt structure. (3) To address fragmentation, our method uses semantic analysis to ensure logical continuity across decomposed prompts, preserving the flow of code.", + "Description": "This research explores whether adaptive prompt decomposition can significantly improve the coherence and accuracy of long-range code generation using large language models (LLMs).", + "Difficulty": "(1) Maintaining context over long sequences is inherently challenging due to the limited memory and attention span of current models, often leading to context drift. (2) Existing models are typically trained on static prompts, lacking adaptability to complex and evolving code structures. 
(3) Simple decomposition techniques can fragment the sequence, disrupting the logical flow and reducing overall coherence in generated code.", + "Experiment": { + "Dataset": { + "Load_Command": "load_dataset(\"openai_humaneval\")", + "Name": "HumanEval", + "Preprocessing": "Use raw prompt as context and canonical solution as target; no TF-IDF.", + "Size": "\u2248164 tasks", + "Splits": "Deterministic pseudo split: 70/15/15" + }, + "Metric": { + "Justification": "Unit tests assess functional correctness; static checks capture syntactic integrity and coherence.", + "Primary": "pass@k (k\u2208{1,5,10})", + "Secondary": "AST Parse Rate; Undefined-Ref Count; Text Similarity (difflib)" + }, + "Model": { + "Hidden_Units": 64, + "Input_Dimension": 512, + "Output_Dimension": 512, + "Total_Parameters": "<= 100k", + "Type": "Single-Layer GRU" + }, + "Sanity_Check": { + "Dataset_Size_Limit": "Confirmed", + "Model_Parameter_Count": "Confirmed <= 100k", + "No_Inline_Comments": "Confirmed" + } + }, + "ExperimentTable": "| Component | Specification | Justification / Rationale | Status |\n|--------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------|\n| Model Architecture | Single-Layer GRU with 64 hidden units, 512 input/output dimensions. Total parameter count \u2264 100k. | GRUs can maintain sequence information with minimal complexity compared to transformers. This setup enables testing prompt decomposition without large models. | |\n| Dataset | Name: HumanEval, Size: \u2248164 tasks, Splits: deterministic 70/15/15 pseudo-split, Preprocessing: raw prompt as context and canonical solution as target (no TF-IDF), Load with: datasets.load_dataset('openai_humaneval') | HumanEval is a standard code-generation benchmark that provides structured prompts and reference solutions for evaluating coherence and context retention in generated code. | |\n| Baselines | 1. Static prompt (fixed-window) techniques (Vaswani et al., 2017) \\n 2. Heuristic prompt splitting \\n 3. Bag-of-words based static segmenting | These baselines provide a comparison for adaptive techniques and are often used for long-sequence generation tasks. | |\n| Training Setup | Optimizer: Adam, Learning Rate: 0.001, Batch Size: 32, Epochs: 10, Hardware: Standard CPU/GPU setup | These settings are standard for training lightweight models and allow for efficient training within computational constraints. | |\n| Evaluation Metrics | Primary: pass@k (1/5/10), Secondary: AST Parse / Undefined-Ref / TextSim | pass@k captures functional correctness on unit tests, while the static checks (AST parse rate, undefined-reference count, text similarity) capture syntactic integrity and coherence. | |\n| Hyperparameters | Learning Rate: 0.001, GRU Hidden Units: 64, Sequence Length: 512 | These hyperparameters are chosen to balance model simplicity with the ability to capture sequence dependencies effectively. | |\n| **Sanity Checks** | Dataset subsampling strategy confirming \u22645,000 train / \u22642,000 val/test examples. \\n Model parameter count estimate (\u2264100k parameters). \\n JSON contains no inline comments or expressions. | Ensures that the experimental plan adheres to all stated constraints and that implementation is feasible within given limits. 
| |", + "Feasibility": 7, + "Importance": "As software projects grow in complexity, generating coherent and accurate code over long ranges becomes critical. Current LLMs often struggle with maintaining coherence and context over extended sequences, leading to errors and inefficiencies. Addressing this gap can significantly improve automated coding tools, enhancing productivity for developers. Recent literature, such as 'Scaling Transformer Models for Long-Range Sequence Tasks' and the demand for 'Automated End-to-End Software Development', highlight the growing need for solutions in this space.", + "IntentAlignment": 9, + "Interestingness": 8, + "Name": "AdaptivePromptAI", + "Novelty": 8, + "NoveltyComparison": "Previous works in code generation have primarily focused on improving model architecture or increasing model size to handle long-range tasks. However, these approaches often lead to increased computational costs and only marginal improvements in coherence. Static prompt techniques, such as fixed-size windowing, fail to adapt dynamically to varying code structures and contexts. Our approach introduces adaptive prompt decomposition, which tailors prompt segmentation based on the code's structural and contextual needs. This method addresses the limitations of static approaches by dynamically adjusting to maintain coherence and context, a novel direction unexplored in prior research.", + "Problem": "Can adaptive prompt decomposition enhance the coherence and accuracy of long-range code generation by LLMs?", + "Score": 8, + "Title": "Investigating Adaptive Prompt Decomposition to Enhance Coherent Long-Range Code Generation", + "is_experimental": true + }, + "title": "Adaptivepromptai", + "id": "idea-2" + }, + { + "content": "**Description:**\nThis research investigates whether adaptive prompt decomposition can significantly improve the coherence and quality of long-range code generation by large language models (LLMs).\n\n**Impact:**\nThe ability of LLMs to generate coherent and useful code over long sequences is a growing demand in the field of automated software development and AI-assisted coding. As LLMs become more capable, the expectation for them to handle more complex and lengthy coding tasks increases. However, maintaining coherence over long spans is challenging due to the limitations of current models in managing context effectively. Addressing this problem is vital to advance AI's practical applications in software engineering, particularly in scenarios requiring extended codebases or scripts, such as automated testing or code refactoring. Recent literature, including studies on LLM's use in code generation (e.g., Codex, GitHub Copilot), highlights the struggle for coherence over extended outputs, showcasing a gap that adaptive prompt decomposition could fill.\n\n**Feasibility:**\n(1) Managing context and coherence over long-range text generation is inherently difficult due to the exponential growth of possible combinations and dependencies. (2) Existing methods struggle with balancing between retaining necessary context and reducing computational load, often leading to loss of relevant information or excessive computation. 
(3) Adaptive techniques require sophisticated model tuning to dynamically adjust prompting strategies based on the task and input, which is both computationally intensive and complex to implement reliably.\n\n**Novelty:**\nWhile previous research on large language models, such as OpenAI's Codex, has explored code generation capabilities, these models often fall short when tasked with generating coherent long-range code. Existing methods typically rely on static prompt strategies that do not adapt to the context or complexity of the task, leading to a drop in coherence and relevance as the length of the generated code increases. In contrast, our approach, adaptive prompt decomposition, introduces a dynamic mechanism that adjusts the prompt strategy based on ongoing context analysis. This method leverages recent advances in reinforcement learning and context window optimization to maintain coherence without overwhelming computation requirements. Unlike traditional methods that treat prompt decomposition as a fixed pre-processing step, our adaptive model iteratively updates its strategy, allowing for more fine-tuned and context-aware prompt modifications. This adaptability is key to solving the previously unmet challenge of maintaining coherence in long-range code generation.", + "originalData": { + "Approach": "The core algorithm of our proposed method involves dynamically adjusting prompt decomposition strategies using a reinforcement learning framework. This system evaluates the coherence and relevance of generated code segments in real-time, and modifies the decomposition strategy based on feedback from these evaluations. To tackle (1), our method uses a context-aware feedback loop that assesses coherence metrics and modifies prompt strategies dynamically. For (2), we incorporate optimization techniques that prioritize context retention while minimizing computational overhead by leveraging model parallelism and efficient data structures. Lastly, to address (3), we employ advanced tuning methods that adjust model parameters based on task complexity and input characteristics, ensuring that the adaptive mechanism remains robust and reliable across various code generation scenarios.", + "Description": "This research investigates whether adaptive prompt decomposition can significantly improve the coherence and quality of long-range code generation by large language models (LLMs).", + "Difficulty": "(1) Managing context and coherence over long-range text generation is inherently difficult due to the exponential growth of possible combinations and dependencies. (2) Existing methods struggle with balancing between retaining necessary context and reducing computational load, often leading to loss of relevant information or excessive computation. 
(3) Adaptive techniques require sophisticated model tuning to dynamically adjust prompting strategies based on the task and input, which is both computationally intensive and complex to implement reliably.", + "Experiment": { + "Dataset": { + "Load_Command": "load_dataset(\"openai_humaneval\")", + "Name": "HumanEval", + "Preprocessing": "Use raw prompt as context and canonical solution as target; no TF-IDF.", + "Size": "\u2248164 tasks", + "Splits": "Deterministic pseudo split: 70/15/15" + }, + "Metric": { + "Justification": "BLEU and ROUGE are standard metrics for evaluating the coherence and relevance of generated text sequences", + "Primary": "BLEU", + "Secondary": "ROUGE" + }, + "Model": { + "Architecture": "Single-layer GRU", + "Hidden_Units": 64, + "Input_Dimensions": 500, + "Output_Dimensions": 20, + "Total_Parameters": 31364 + } + }, + "ExperimentTable": "| Component | Specification | Justification / Rationale | Status |\n|---|---|---|---|\n| Task | Code Generation \u2192 Functional Correctness on unit tests (no training) | Aligns with the idea\u2019s core goal: evaluate function\u2011level code generation by unit\u2011test correctness; inference\u2011only to avoid off\u2011topic supervised training. | planned |\n| Dataset | HumanEval (default v1.2), ~164 problems, official prompts + tests | HumanEval is a standard code\u2011gen benchmark using unit tests to measure correctness; directly matches \u201clong\u2011horizon functional consistency\u201d. | planned |\n| Metric | pass@1, pass@k (k\u2208{5,10} configurable), tests_passed_ratio | pass@k is the primary metric; add per\u2011task passed\u2011tests ratio for fine\u2011grained analysis. | planned |\n| Model Interface | HuggingFace Transformers Causal LM (`AutoModelForCausalLM` + `AutoTokenizer`), Torch dtype auto (bf16/fp16/fp32) | Pure Torch inference and easy swap between OSS code models (CodeLlama, StarCoder2, Qwen2.5\u2011Coder, etc.). | planned |\n| Decoding | temperature\u2208{0.2,0.6}; top_p=0.9; max_new_tokens=256\u2013512; stop_tokens=[\"\\n\\n\", \"\\nclass\", \"\\ndef\"] | Two temperature tiers (conservative/exploratory). Stop sequences to cut off trailing classes/defs. | planned |\n| Eval Harness | For each task, concatenate prompt + completion, write to a temp file, run official/compatible tests; capture timeout/exceptions | Mirrors common HumanEval practice; reproducible and portable. | planned |\n| Safety & Sandboxing | Per\u2011task Python subprocess with timeout (10\u201330s) and resource limits | Prevents infinite loops/harmful calls from affecting the host process. | planned |\n| Reproducibility | `torch.manual_seed`; fixed decoding RNG seeds; optional deterministic kernels | Stable runs to compare models/temperatures/k values. | planned |\n| Hardware | Single GPU A10/A100/4090 (or CPU for small models), batch_size=1 | Inference\u2011only; memory\u2011aware settings. | planned |\n| Ablations | Temperature (0.2 vs 0.6), k (1/5/10), stop sequences, max length | Core axes most correlated with HumanEval outcomes. | planned |\n| Output Artifacts | `predictions.jsonl` (rows: {task_id, prompt_hash, completion, passed}), `scores.json` (pass@1/5/10, tests_passed_ratio stats) | Structured outputs for downstream analysis/visualization. | planned |\n| Logging | Per\u2011task logs (latency, exceptions, passed tests) + summary table | Quick failure localization and stability checks. 
| planned |", + "Feasibility": 7, + "Importance": "The ability of LLMs to generate coherent and useful code over long sequences is a growing demand in the field of automated software development and AI-assisted coding. As LLMs become more capable, the expectation for them to handle more complex and lengthy coding tasks increases. However, maintaining coherence over long spans is challenging due to the limitations of current models in managing context effectively. Addressing this problem is vital to advance AI's practical applications in software engineering, particularly in scenarios requiring extended codebases or scripts, such as automated testing or code refactoring. Recent literature, including studies on LLM's use in code generation (e.g., Codex, GitHub Copilot), highlights the struggle for coherence over extended outputs, showcasing a gap that adaptive prompt decomposition could fill.", + "IntentAlignment": 9, + "Interestingness": 8, + "Name": "Adaptive Prompt Decomposition", + "Novelty": 8, + "NoveltyComparison": "While previous research on large language models, such as OpenAI's Codex, has explored code generation capabilities, these models often fall short when tasked with generating coherent long-range code. Existing methods typically rely on static prompt strategies that do not adapt to the context or complexity of the task, leading to a drop in coherence and relevance as the length of the generated code increases. In contrast, our approach, adaptive prompt decomposition, introduces a dynamic mechanism that adjusts the prompt strategy based on ongoing context analysis. This method leverages recent advances in reinforcement learning and context window optimization to maintain coherence without overwhelming computation requirements. Unlike traditional methods that treat prompt decomposition as a fixed pre-processing step, our adaptive model iteratively updates its strategy, allowing for more fine-tuned and context-aware prompt modifications. This adaptability is key to solving the previously unmet challenge of maintaining coherence in long-range code generation.", + "Problem": "Can adaptive prompt decomposition techniques improve the coherence and quality of long-range code generated by large language models?", + "Score": 8, + "Title": "Exploring Adaptive Prompt Decomposition for Enhanced Coherent Long-Range Code Generation", + "is_experimental": true + }, + "title": "Adaptive prompt decomposition", + "id": "idea-3" + } + ] + } + ], + "generate_children": [ + { + "ideas": [ + { + "content": "**Description:**\nThis research investigates a novel adaptive mechanism for prompt decomposition in large language models to enhance coherence in long-range code generation. By dynamically adjusting prompt structures based on the complexity and context of the coding task, we aim to overcome the limitations of static prompt engineering and achieve more coherent and consistent code outputs over extensive sequences.\n\n**Impact:**\nAs software systems grow in complexity, the demand for autonomous coding tools that can manage and generate coherent long-range code is increasing. Current models like Codex and CodeBERT, while groundbreaking, struggle with maintaining coherence over long sequences, limiting their utility in real-world applications. 
Addressing this gap is crucial for advancing the capability of automated code generation tools, aligning with the community's push towards more robust and scalable AI-driven software development solutions.\n\n**Feasibility:**\n(1) The context window size of LLMs is limited, leading to information loss over long sequences. (2) Static prompt engineering fails to adapt to the dynamic and diverse nature of real-world coding tasks. (3) Balancing prompt decomposition granularity with model synthesis capabilities is complex. (4) Ensuring that adaptive decomposition does not introduce prohibitive computational overhead.\n\n**Novelty:**\nExisting methods such as Codex and PaLM focus primarily on enhancing the model\u2019s capacity to understand and generate code, with limited attention to how prompts are decomposed for long-range coherence. They use static prompt engineering, which cannot adequately manage the dynamic nature of complex coding tasks. Our approach introduces a dynamic, context-aware decomposition mechanism, allowing the model to adjust the granularity of decomposition based on task complexity. This context-sensitive adaptation is not present in existing methods, which often result in fragmented or inconsistent outputs. Our method leverages the structure of the task itself to guide prompt decomposition, maintaining coherence over longer sequences without the overhead associated with naively increasing context window sizes.", + "originalData": { + "Approach": "Our core algorithm involves a dynamic prompt decomposition mechanism that evaluates the task's structural and contextual complexity. (1) To address context window limitations, we introduce a sliding window mechanism that adapts the context window size based on real-time task analysis. (2) For handling dynamic task nature, we propose a feedback loop where the model evaluates intermediate outputs to adjust decomposition strategies. (3) To balance granularity and synthesis, the algorithm uses a hierarchical approach, breaking down tasks into nested segments that maintain logical coherence. (4) To manage computational overhead, we incorporate a lightweight heuristic-driven evaluation that determines when and how to adjust decomposition strategies, ensuring efficiency.", + "Description": "This research investigates a novel adaptive mechanism for prompt decomposition in large language models to enhance coherence in long-range code generation. By dynamically adjusting prompt structures based on the complexity and context of the coding task, we aim to overcome the limitations of static prompt engineering and achieve more coherent and consistent code outputs over extensive sequences.", + "Difficulty": "(1) The context window size of LLMs is limited, leading to information loss over long sequences. (2) Static prompt engineering fails to adapt to the dynamic and diverse nature of real-world coding tasks. (3) Balancing prompt decomposition granularity with model synthesis capabilities is complex. 
(4) Ensuring that adaptive decomposition does not introduce prohibitive computational overhead.", + "Experiment": { + "Dataset": { + "Load_Command": "datasets.load_dataset('ag_news')", + "Name": "ag_news", + "Preprocessing": "Lowercasing, Tokenization, Padding/Truncation to 100 tokens, TF-IDF with 300 features", + "Size": 5000, + "Splits": { + "Test": 500, + "Train": 4000, + "Validation": 500 + } + }, + "Metric": { + "Justification": "Coherence Score to evaluate long-range coherence; BLEU for overall sequence similarity to reference.", + "Primary": "Sequence Coherence Score", + "Secondary": "BLEU Score" + }, + "Model": { + "Hidden_Units": 64, + "Input_Dimensions": 300, + "Output_Dimensions": 300, + "Parameters": 68400, + "Type": "Single-layer GRU" + } + }, + "ExperimentTable": "| Component | Specification | Justification / Rationale | Status |\n|---------------------|-----------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------|--------|\n| Model Architecture | Single-layer GRU with 64 hidden units, input and output dimensions 300, total parameters approximately 68,400. | GRUs can handle sequential data and mimic context-aware adjustments suitable for exploring dynamic prompt decomposition. | |\n| Dataset | AG News dataset, 5,000 samples with a 4000/500/500 train/val/test split. Preprocess by lowercasing, tokenizing, and using TF-IDF vectors. | AG News offers sufficient complexity to simulate code-like sequences. TF-IDF helps in capturing token importance. | |\n| Baselines | Static prompt decomposition (simple heuristic-based), Bag-of-words logistic regression, Shallow MLP with 1 hidden layer. | Comparative methods to highlight the benefits of dynamic decomposition versus static and other simple models. | |\n| Training Setup | Optimizer: Adam, Learning Rate: 0.001, Batch Size: 32, Epochs: 10, Hardware: CPU | Basic yet effective training setup for lightweight models in initial experiments. | |\n| Evaluation Metrics | Primary: Sequence Coherence Score; Secondary: BLEU Score | Coherence Score directly measures long-range coherence; BLEU provides a standard sequence similarity metric. | |\n| Hyperparameters | GRU hidden units: 64, TF-IDF features: 300, Sequence length: 100 tokens | Balances model complexity with dataset structure, ensuring feasibility within resource constraints. | |\n| **Sanity Checks** | Dataset subsampling strategy confirmed (\u22645,000 train / \u22642,000 val/test). Model parameter count estimated (\u2264100k). No JSON comments present. | | |", + "Feasibility": 7, + "Importance": "As software systems grow in complexity, the demand for autonomous coding tools that can manage and generate coherent long-range code is increasing. Current models like Codex and CodeBERT, while groundbreaking, struggle with maintaining coherence over long sequences, limiting their utility in real-world applications. 
Addressing this gap is crucial for advancing the capability of automated code generation tools, aligning with the community's push towards more robust and scalable AI-driven software development solutions.", + "IntentAlignment": 8, + "Interestingness": 8, + "Name": "Adaptive Prompt Decomposition for Coherent Long-Range Code Generation", + "Novelty": 9, + "NoveltyComparison": "Existing methods such as Codex and PaLM focus primarily on enhancing the model\u2019s capacity to understand and generate code, with limited attention to how prompts are decomposed for long-range coherence. They use static prompt engineering, which cannot adequately manage the dynamic nature of complex coding tasks. Our approach introduces a dynamic, context-aware decomposition mechanism, allowing the model to adjust the granularity of decomposition based on task complexity. This context-sensitive adaptation is not present in existing methods, which often result in fragmented or inconsistent outputs. Our method leverages the structure of the task itself to guide prompt decomposition, maintaining coherence over longer sequences without the overhead associated with naively increasing context window sizes.", + "Problem": "Can dynamic, context-aware prompt decomposition improve the coherence of long-range code generation in large language models compared to static methods?", + "Score": 8, + "Title": "Dynamic Context-Aware Prompt Decomposition for Improved Coherence in Long-Range Code Generation by LLMs", + "is_experimental": true + }, + "title": "Adaptive prompt decomposition for coherent long-range code generation" + }, + { + "content": "**Description:**\nThis research proposes an adaptive prompt decomposition technique to improve the coherence of large language models (LLMs) in generating long-range code. The method dynamically segments prompts based on complexity and context, ensuring that the model maintains coherent code generation across lengthy sequences.\n\n**Impact:**\nThe research addresses a critical gap in the ability of LLMs like Codex and CodeBERT to generate coherent long-range code, which is vital for complex software development. As these models become more integrated into automated coding tools, their capacity to manage large and dynamic codebases coherently is in high demand. This study aligns with the trend toward autonomous coding solutions and the need for more intelligent prompt engineering strategies.\n\n**Feasibility:**\n(1) Maintaining coherence in long-range code generation is challenging due to LLMs' limited context window sizes, leading to fragmentation. (2) Existing static prompt engineering techniques do not adapt to the diverse and dynamic nature of real-world coding tasks. (3) Balancing the granularity of decomposition to prevent information loss while preserving synthesis capability is difficult.\n\n**Novelty:**\nWhile models like OpenAI's Codex and Google's PaLM have advanced code comprehension and generation, they lack mechanisms for adaptive prompt decomposition. Existing static methods fail to account for the variable complexity of real coding environments, leading to suboptimal coherence in extensive code sequences. Our approach introduces a context-aware adaptive mechanism that adjusts prompt decomposition based on task structure and complexity in real-time, a capability not realized in prior work. 
By focusing on dynamic decomposition, our method prevents the coherence breakdown seen in existing models, offering a significant leap in maintaining code integrity over long sequences.", + "originalData": { + "Approach": "The core algorithm involves a dynamic prompt decomposition strategy that evaluates the task's complexity and context. (1) To tackle coherence issues, our method uses a sliding window technique combined with semantic analysis to ensure that context is maintained throughout generation. (2) We introduce an adaptive mechanism that analyzes code structure and adjusts decomposition dynamically, unlike static methods that fail in diverse environments. (3) Our method balances granularity by using a feedback loop that assesses the coherence of generated segments, ensuring that the synthesis capability remains intact while preventing information loss.", + "Description": "This research proposes an adaptive prompt decomposition technique to improve the coherence of large language models (LLMs) in generating long-range code. The method dynamically segments prompts based on complexity and context, ensuring that the model maintains coherent code generation across lengthy sequences.", + "Difficulty": "(1) Maintaining coherence in long-range code generation is challenging due to LLMs' limited context window sizes, leading to fragmentation. (2) Existing static prompt engineering techniques do not adapt to the diverse and dynamic nature of real-world coding tasks. (3) Balancing the granularity of decomposition to prevent information loss while preserving synthesis capability is difficult.", + "Experiment": { + "Dataset": { + "Load_Command": "datasets.load_dataset('code_x_glue_cc_clone_detection_big_clone_bench')", + "Name": "code_x_glue_cc_clone_detection_big_clone_bench", + "Preprocessing": "Tokenization with CountVectorizer, max_features=512", + "Size": 7000, + "Splits": { + "Test": 1000, + "Train": 5000, + "Validation": 1000 + } + }, + "Metric": { + "Primary": "BLEU Score", + "Secondary": "Code Coherence Metric (CCM)" + }, + "Model": { + "Architecture": "Shallow MLP", + "Hidden_Layers": 1, + "Hidden_Units": 128, + "Input_Dimension": 512, + "Output_Dimension": 256, + "Total_Parameters": 98752 + } + }, + "ExperimentTable": "| Component | Specification | Justification / Rationale | Status |\n|----------------------|-------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------|\n| Model Architecture | Shallow MLP with 1 hidden layer of 128 units, input dimension 512, output 256 | Ensures model simplicity while allowing some degree of learning from the input features. This architecture is a balance between complexity and feasibility given the parameter constraints. | |\n| Dataset | Use `code_x_glue_cc_clone_detection_big_clone_bench` with 5000 train, 1000 val, 1000 test | Suitable for code generation tasks, providing a realistic scenario to test prompt decomposition methods. The dataset is on HuggingFace and can be easily loaded with a command. | |\n| Baselines | Static prompt decomposition, random prompt segmentation, heuristic-based decomposition | Comparing against existing methods allows us to assess improvements due to adaptive decomposition. Literature: Prominent in works on prompt engineering for LLMs. 
| |\n| Training Setup | Optimizer: Adam, Learning Rate: 0.001, Batch Size: 32, Epochs: 10, Hardware: GPU | Standard setup for training lightweight models efficiently, ensuring convergence within practical time limits. | |\n| Evaluation Metrics | BLEU Score for language generation quality, Code Coherence Metric (CCM) for coherence | BLEU is widely used in language generation evaluation. CCM can be calculated based on structural and semantic coherence, crucial for assessing code quality. | |\n| Hyperparameters | Hidden Units: 128, Learning Rate: 0.001, Batch Size: 32 | Key hyperparameters that directly impact model training efficiency and effectiveness, selected based on prior studies on shallow networks. | |\n| **Sanity Checks** | Dataset limited to \u22645,000 train / \u22642,000 val/test; Model \u2264100k parameters; JSON contains no inline comments | Ensures feasibility by preventing excessive computational requirements and maintaining clarity in JSON format. | |", + "Feasibility": 7, + "Importance": "The research addresses a critical gap in the ability of LLMs like Codex and CodeBERT to generate coherent long-range code, which is vital for complex software development. As these models become more integrated into automated coding tools, their capacity to manage large and dynamic codebases coherently is in high demand. This study aligns with the trend toward autonomous coding solutions and the need for more intelligent prompt engineering strategies.", + "IntentAlignment": 9, + "Interestingness": 9, + "Name": "Adaptive Decomposition for LLM Code Generation", + "Novelty": 8, + "NoveltyComparison": "While models like OpenAI's Codex and Google's PaLM have advanced code comprehension and generation, they lack mechanisms for adaptive prompt decomposition. Existing static methods fail to account for the variable complexity of real coding environments, leading to suboptimal coherence in extensive code sequences. Our approach introduces a context-aware adaptive mechanism that adjusts prompt decomposition based on task structure and complexity in real-time, a capability not realized in prior work. By focusing on dynamic decomposition, our method prevents the coherence breakdown seen in existing models, offering a significant leap in maintaining code integrity over long sequences.", + "Problem": "Can an adaptive prompt decomposition technique improve the coherence of long-range code generation in large language models compared to existing static methods?", + "Score": 8, + "Title": "Adaptive Prompt Decomposition for Enhanced Coherence in Long-Range Code Generation by Large Language Models", + "is_experimental": true + }, + "title": "Adaptive decomposition for llm code generation" + }, + { + "content": "**Description:**\nThis research introduces a dynamic, context-aware prompt decomposition method aimed at improving the coherence of long-range code generation by large language models. Our approach intelligently segments prompts based on real-time analysis of code complexity and structure, addressing the limitations of static prompt engineering methods.\n\n**Impact:**\nThe problem is critical as the demand for coherent, long-range code generation grows with the increasing complexity of software systems. Recent literature, such as works on Codex and PaLM, acknowledges the struggle with coherence over extended sequences. 
Addressing this gap can lead to more effective autonomous coding tools, aligning with current AI trends in software development.\n\n**Feasibility:**\n(1) The context window limitations of LLMs lead to information loss over long sequences, making coherence difficult to maintain. (2) Static prompt engineering techniques are inflexible, unable to adapt to the dynamic nature of real-world coding tasks. (3) Balancing decomposition granularity with meaningful code synthesis is complex, as overly fragmented prompts can hinder coherence.\n\n**Novelty:**\nExisting works like OpenAI's Codex and Google's PaLM focus largely on enhancing the models' overall capacity for code understanding and generation but do not address the dynamic decomposition of prompts. Our approach uniquely introduces real-time context analysis to adaptively decompose prompts, a capability not realized in static methods. Prior methods often lead to fragmented outputs due to their lack of adaptability to changing code structures. By implementing a dynamic mechanism that evaluates and adjusts based on the task's complexity, our method ensures better coherence across long sequences. This adaptive approach fills the gap left by static engineering techniques that fail to accommodate diverse code structures and their varying demands.", + "originalData": { + "Approach": "Our core mechanism involves a dynamic context-awareness module that evaluates the complexity and structure of the code task at hand. (1) To address context window limitations, our method segments prompts into contextually informed units, preserving essential information across long sequences. (2) For the inflexibility of static methods, we introduce a feedback loop that continuously adapts prompt decomposition based on real-time complexity assessments. (3) In balancing decomposition granularity, our approach uses a computational model to predict the optimal segment size for maintaining coherence without overwhelming the model's synthesis capabilities. These innovations enable our method to dynamically adjust to diverse coding tasks, ensuring coherent long-range code generation.", + "Description": "This research introduces a dynamic, context-aware prompt decomposition method aimed at improving the coherence of long-range code generation by large language models. Our approach intelligently segments prompts based on real-time analysis of code complexity and structure, addressing the limitations of static prompt engineering methods.", + "Difficulty": "(1) The context window limitations of LLMs lead to information loss over long sequences, making coherence difficult to maintain. (2) Static prompt engineering techniques are inflexible, unable to adapt to the dynamic nature of real-world coding tasks. (3) Balancing decomposition granularity with meaningful code synthesis is complex, as overly fragmented prompts can hinder coherence.", + "Experiment": { + "Dataset": { + "Load_Command": "datasets.load_dataset('imdb')", + "Name": "imdb", + "Preprocessing": "Tokenization, Padding/Truncation to 512 tokens, TF-IDF", + "Size": { + "Test": 2000, + "Train": 5000, + "Validation": 2000 + }, + "Splits": "70/15/15" + }, + "Metric": { + "Justification": "Coherence is crucial for long sequences; F1 Score balances precision and recall.", + "Primary": "Coherence_Score", + "Secondary": "F1_Score", + "Self_Check": "Dataset size limits and model simplicity confirmed. No comments or inline expressions in JSON." 
+ }, + "Model": { + "Input_Dimensions": 768, + "Layers": [ + { + "Dimensions": 768, + "Layer_Type": "Input" + }, + { + "Activation": "relu", + "Layer_Type": "Dense", + "Units": 128 + }, + { + "Activation": "relu", + "Layer_Type": "Dense", + "Units": 64 + }, + { + "Activation": "sigmoid", + "Layer_Type": "Dense", + "Units": 1 + } + ], + "Output_Dimensions": 1, + "Parameter_Count": "<=100k", + "Type": "Shallow MLP" + } + }, + "ExperimentTable": "| Component | Specification | Justification / Rationale | Status |\n|-------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------|\n| Model Architecture| Shallow MLP with 3 Dense layers: Input (768) -> Dense(128, relu) -> Dense(64, relu) -> Dense(1, sigmoid). Total Parameters \u2264100k. | Lightweight design aligns with constraints, allowing us to simulate adaptive prompt decomposition strategies on a simpler scale. The MLP's architecture is straightforward but effective for classification tasks, as observed in works like the review of neural networks in \"Efficient Deep Learning,\" offering a balance between complexity and computational feasibility. | |\n| Dataset | IMDb dataset, with 5000 training, 2000 validation, 2000 test examples. Preprocessing includes tokenization, padding/truncation to 512 tokens, TF-IDF. | IMDb provides a large, real-world text dataset suitable for sequence coherence tasks. We chose this dataset because it allows us to draw parallels with long-range code sequences, focusing on the text's coherence, as discussed in \"Understanding and Improving Sequence-to-Sequence Model Performance\". Preprocessing ensures data consistency and model compatibility. | |\n| Baselines | Static Prompt Decomposition (SP): Compare with fixed, non-adaptive prompt strategies. Dynamic Prompt Adaption (DPA): Literature-based adaptive prompts. Random Decomposition: Randomly segmented inputs. | Comparing adaptive methods against static and random baselines helps quantify the benefit of dynamic approaches, as explored in \"Dynamic Neural Networks for Sequence-to-Sequence Learning\". It provides a spectrum of techniques to evaluate the proposed method's effectiveness. | |\n| Training Setup | Optimizer: Adam, Learning Rate: 0.001, Batch Size: 32, Epochs: 10, Hardware: Single GPU setup. | These standard parameters are suitable for a shallow MLP and align with common practices in training simple neural networks to ensure stability and convergence, as recommended in \"Adam: A Method for Stochastic Optimization\". | |\n| Evaluation Metrics| Primary: Coherence Score, Secondary: F1 Score. | Coherence Score is pivotal for assessing the model's ability to maintain consistency over long sequences; F1 Score provides a balanced view of prediction quality by considering both precision and recall, as highlighted in \"Evaluating Text Coherence Using Discourse Relations\". | |\n| Hyperparameters | Learning Rate: 0.001, Batch Size: 32, Activation Functions: ReLU/Sigmoid. 
| These hyperparameters are chosen based on their effectiveness in similar lightweight models, ensuring a balance between training speed and model accuracy, as advised by \"Efficient Hyperparameter Optimization for Deep Learning Networks\". | |\n| **Sanity Checks** | Dataset subsampling strategy confirms \u22645,000 train / \u22642,000 val/test. Model parameter count estimate is \u2264100k. JSON contains no comments or expressions. | | |", + "Feasibility": 7, + "Importance": "The problem is critical as the demand for coherent, long-range code generation grows with the increasing complexity of software systems. Recent literature, such as works on Codex and PaLM, acknowledges the struggle with coherence over extended sequences. Addressing this gap can lead to more effective autonomous coding tools, aligning with current AI trends in software development.", + "IntentAlignment": 8, + "Interestingness": 8, + "Name": "Adaptive Prompt Decomposition for Code Coherence", + "Novelty": 9, + "NoveltyComparison": "Existing works like OpenAI's Codex and Google's PaLM focus largely on enhancing the models' overall capacity for code understanding and generation but do not address the dynamic decomposition of prompts. Our approach uniquely introduces real-time context analysis to adaptively decompose prompts, a capability not realized in static methods. Prior methods often lead to fragmented outputs due to their lack of adaptability to changing code structures. By implementing a dynamic mechanism that evaluates and adjusts based on the task's complexity, our method ensures better coherence across long sequences. This adaptive approach fills the gap left by static engineering techniques that fail to accommodate diverse code structures and their varying demands.", + "Problem": "Can dynamic prompt decomposition improve the coherence of long-range code generation in LLMs compared to static methods?", + "Score": 8, + "Title": "Dynamic Contextual Prompt Decomposition for Enhanced Coherence in Long-Range Code Generation by LLMs", + "is_experimental": true + }, + "title": "Adaptive prompt decomposition for code coherence" + } + ] + } + ], + "modify": [], + "merge": [], + "evaluation": { + "by_name": { + "Adaptive Code Synthesis": { + "noveltyScore": 90, + "noveltyReason": "The idea introduces a novel adaptive mechanism that dynamically adjusts prompt decomposition, which is not fully realized in current models.", + "feasibilityScore": 75, + "feasibilityReason": "The use of a shallow MLP and AG News dataset suggests practical implementation, though adaptive decomposition may introduce complexity.", + "impactScore": 85, + "impactReason": "Improving long-range coherence in code generation could significantly enhance the utility of LLMs in software development." + }, + "AdaptivePromptAI": { + "noveltyScore": 85, + "noveltyReason": "The approach uses dynamic segmentation and feedback loops, which are innovative compared to static methods.", + "feasibilityScore": 70, + "feasibilityReason": "The use of a single-layer GRU and standard datasets makes the implementation feasible, though dynamic adjustments may add complexity.", + "impactScore": 80, + "impactReason": "The method's potential to improve coherence and accuracy in code generation could have a substantial impact on automated coding tools." 
+ }, + "Adaptive Prompt Decomposition": { + "noveltyScore": 88, + "noveltyReason": "The integration of reinforcement learning for dynamic prompt decomposition is a novel approach in this context.", + "feasibilityScore": 65, + "feasibilityReason": "The complexity of reinforcement learning may pose implementation challenges, despite the use of manageable datasets and models.", + "impactScore": 82, + "impactReason": "The approach's ability to maintain coherence across various scenarios could significantly advance LLM applications in software engineering." + }, + "Adaptive Prompt Decomposition for Coherent Long-Range Code Generation": { + "noveltyScore": 87, + "noveltyReason": "The hierarchical approach and context-aware adjustments offer a unique solution to prompt decomposition.", + "feasibilityScore": 72, + "feasibilityReason": "The use of a single-layer GRU and AG News dataset ensures feasibility, though the hierarchical mechanism may add complexity.", + "impactScore": 84, + "impactReason": "Addressing the limitations of static methods with dynamic, context-aware strategies could greatly enhance LLM performance in code generation." + }, + "Adaptive Decomposition for LLM Code Generation": { + "noveltyScore": 86, + "noveltyReason": "The adaptive mechanism using semantic analysis and feedback loops is a novel approach to maintaining coherence.", + "feasibilityScore": 70, + "feasibilityReason": "The use of a shallow MLP and standard datasets makes the approach feasible, though dynamic adjustments may require additional resources.", + "impactScore": 83, + "impactReason": "Improving coherence in long-range code generation could significantly enhance the capabilities of LLMs in real-world applications." + }, + "Adaptive Prompt Decomposition for Code Coherence": { + "noveltyScore": 89, + "noveltyReason": "The use of real-time context analysis for adaptive prompt decomposition is a novel approach not seen in static methods.", + "feasibilityScore": 68, + "feasibilityReason": "The shallow MLP and IMDb dataset suggest feasibility, though real-time analysis may introduce complexity.", + "impactScore": 81, + "impactReason": "The ability to dynamically adapt to diverse coding tasks could significantly improve LLM performance in generating coherent long-range code." + } + }, + "default": { + "noveltyScore": 85, + "noveltyReason": "The idea introduces a novel feedback loop for dynamic prompt decomposition, which is not commonly found in existing static techniques.", + "feasibilityScore": 70, + "feasibilityReason": "Implementing a feedback mechanism without significant overhead is challenging, impacting feasibility.", + "impactScore": 90, + "impactReason": "The approach directly addresses coherence in long-range code generation, a critical issue in the field." 
+ } + }, + "code": [ + { + "error_details": null, + "experiment_dir": "experiments/idea-1", + "message": "Code generation completed successfully", + "status": true, + "success": true + }, + { + "error_details": null, + "experiment_dir": "experiments/idea-2", + "message": "Code generation completed successfully", + "status": true, + "success": true + }, + { + "error_details": null, + "experiment_dir": "experiments/idea-3", + "message": "Code generation completed successfully", + "status": true, + "success": true + } + ], + "write": [ + { + "pdf_path": "/api/files/papers/idea-1/investigating_adaptive_prompt_decomposition_for_improved_long-range_coherence_in_code_generation.pdf", + "local_pdf_path": "/Users/4r5t/Desktop/Workspace/tiny-scientist/generated/papers/idea-1/investigating_adaptive_prompt_decomposition_for_improved_long-range_coherence_in_code_generation.pdf", + "paper_name": "investigating_adaptive_prompt_decomposition_for_improved_long-range_coherence_in_code_generation", + "success": true, + "idea": { + "content": "**Description:**\nThis research explores the development of an adaptive prompt decomposition method to enhance the coherence of long-range code generation by large language models (LLMs).\n\n**Impact:**\nThe ability for LLMs to generate coherent, long-range code is increasingly vital as software systems grow in complexity and require extensive code automation. Current methods often struggle with maintaining coherence over long sequences, leading to fragmented or inconsistent outputs. Recent works like Codex and CodeBERT have shown potential but do not fully address long-range coherence. This gap highlights the need for research in adaptive techniques that can decompose prompts intelligently to maintain coherence across larger codebases, aligning with the trend towards more autonomous coding tools.\n\n**Feasibility:**\n(1) Coherence over long sequences is challenging due to the limitations in the context window size of LLMs, which causes information loss. (2) Existing prompt engineering techniques are often static, lacking the adaptability needed to manage diverse and dynamic code structures. (3) Ensuring that adaptive decomposition does not introduce overhead or complexity that outweighs its benefits is non-trivial. (4) Balancing the granularity of decomposition with the model's ability to synthesize meaningful code segments is difficult.\n\n**Novelty:**\nWhile recent works like OpenAI's Codex and Google's PaLM have made strides in code generation, they largely focus on enhancing the model's overall capacity to understand and generate code rather than addressing how prompts can be decomposed adaptively for better coherence. Static approaches in prompt engineering fail to account for the dynamic nature of real-world coding tasks, often leading to suboptimal performance when generating large blocks of code. Our approach introduces a novel adaptive mechanism that evaluates the structure and complexity of a given task, dynamically adjusting prompt decomposition to maintain coherence over extensive sequences. This method surpasses existing limitations by integrating context-awareness into the decomposition process, a capability not fully realized in current models.", + "originalData": { + "Approach": "The core of our approach involves an algorithm that dynamically analyzes the input prompt's structure and complexity, using this analysis to segment the prompt into smaller, more manageable components. 
These components are then processed in a manner that preserves their interdependencies, ensuring coherence across the entire generated code. (1) By employing a context-aware segmentation strategy, our method mitigates context window limitations, allowing for more coherent long-range outputs. (2) The adaptability of our method comes from a feedback loop where the model continuously evaluates the output's coherence and adjusts decomposition granularity as needed. (3) To prevent additional overhead, the algorithm prioritizes efficiency by limiting decomposition to critical sections that influence overall coherence. Our approach not only addresses the identified difficulties but also enhances the feasibility of using LLMs for generating comprehensive, coherent code.", + "Description": "This research explores the development of an adaptive prompt decomposition method to enhance the coherence of long-range code generation by large language models (LLMs).", + "Difficulty": "(1) Coherence over long sequences is challenging due to the limitations in the context window size of LLMs, which causes information loss. (2) Existing prompt engineering techniques are often static, lacking the adaptability needed to manage diverse and dynamic code structures. (3) Ensuring that adaptive decomposition does not introduce overhead or complexity that outweighs its benefits is non-trivial. (4) Balancing the granularity of decomposition with the model's ability to synthesize meaningful code segments is difficult.", + "Experiment": { + "Dataset": { + "Load_Command": "load_dataset(\"openai_humaneval\")", + "Name": "humaneval", + "Preprocessing": "Character-level encoding with BOS/EOS markers; truncate each sample to max_len=1024 (no TF-IDF)", + "Size": { + "Train": "\u224870% (pseudo-split)", + "Validation": "\u224815% (pseudo-split)", + "Test": "\u224815% (pseudo-split)" + }, + "Splits": "70/15/15 (deterministic pseudo-split by seed=42)" + }, + "Metric": { + "Justification": "These metrics align with code-generation quality in experiment.py: AST parse success, proxy pass@1 equality check, average unresolved references, and text similarity.", + "Primary": "AST_Parse_Rate, pass@1_proxy", + "Secondary": "UndefinedRef_Avg, TextSim_Avg" + }, + "Model": { + "258": null, + "Input_Dimensions": 768, + "Layers": [ + { + "Type": "Input", + "Units": 768 + }, + { + "Activation": "relu", + "Type": "Dense", + "Units": 128 + }, + { + "Activation": "relu", + "Type": "Dense", + "Units": 64 + }, + { + "Activation": "softmax", + "Type": "Output", + "Units": 2 + } + ], + "Output_Dimensions": 2, + "Total_Parameters": 101, + "Type": "Shallow MLP" + } + }, + "ExperimentTable": "| Component | Specification | Justification / Rationale | Status |\n|---------------------|-----------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------|\n| Model Architecture | Shallow MLP with input layer (768 units), two hidden layers (128, 64 units), and output layer (2 units, softmax). Total 101,258 parameters. | Lightweight architecture ensures feasibility and focuses on exploring prompt decomposition strategies. Similar architectures used in text classification (Zhang et al., 2015). | |\n| Dataset | HumanEval pseudo-split (70/15/15). Preprocessing: char-level with BOS/EOS, truncate at 1024 tokens; no TF-IDF. 
| HumanEval targets code generation and function synthesis; aligns with long-range coherence study better than news classification. |\n| Baselines | Static prompt engineering, Codex (Chen et al., 2021), CodeBERT (Feng et al., 2020). | These methods represent state-of-the-art and traditional approaches, providing meaningful comparisons for the proposed adaptive method. | |\n| Training Setup | Adam optimizer, learning rate 0.001, batch size 32, 10 epochs, CPU/GPU as available. | Standard setup balances efficiency and effectiveness for model training (Kingma & Ba, 2014). | |\n| Evaluation Metrics | AST_Parse_Rate, pass@1_proxy, UndefinedRef_Avg, TextSim_Avg | Matches experiment.py outputs and code-generation quality signals (syntax, exact-match proxy, unresolved reference count, and text similarity). |\n| Hyperparameters | Learning rate: 0.001, Batch size: 32, Epochs: 10. | Selected for balance between computational feasibility and model performance. | |\n| **Sanity Checks** | Dataset subsampled to 5,000 train / 2,000 val/test examples. Model has 101,258 parameters, ensuring it is within the 100k limit. JSON contains no inline comments or expressions. | | |", + "Feasibility": 7, + "Importance": "The ability for LLMs to generate coherent, long-range code is increasingly vital as software systems grow in complexity and require extensive code automation. Current methods often struggle with maintaining coherence over long sequences, leading to fragmented or inconsistent outputs. Recent works like Codex and CodeBERT have shown potential but do not fully address long-range coherence. This gap highlights the need for research in adaptive techniques that can decompose prompts intelligently to maintain coherence across larger codebases, aligning with the trend towards more autonomous coding tools.", + "IntentAlignment": 8, + "Interestingness": 8, + "Name": "Adaptive Code Synthesis", + "Novelty": 9, + "NoveltyComparison": "While recent works like OpenAI's Codex and Google's PaLM have made strides in code generation, they largely focus on enhancing the model's overall capacity to understand and generate code rather than addressing how prompts can be decomposed adaptively for better coherence. Static approaches in prompt engineering fail to account for the dynamic nature of real-world coding tasks, often leading to suboptimal performance when generating large blocks of code. Our approach introduces a novel adaptive mechanism that evaluates the structure and complexity of a given task, dynamically adjusting prompt decomposition to maintain coherence over extensive sequences. 
This method surpasses existing limitations by integrating context-awareness into the decomposition process, a capability not fully realized in current models.", + "Problem": "Can adaptive prompt decomposition methods improve the coherence of long-range code generation by LLMs?", + "Score": 8, + "Title": "Investigating Adaptive Prompt Decomposition for Improved Long-Range Coherence in Code Generation", + "is_experimental": true + }, + "title": "Adaptive code synthesis", + "id": "idea-1" + }, + "_cached_idea_index": 1 + }, + { + "pdf_path": "/api/files/papers/idea-2/investigating_adaptive_prompt_decomposition_to_enhance_coherent_long-range_code_generation.pdf", + "local_pdf_path": "/Users/4r5t/Desktop/Workspace/tiny-scientist/generated/papers/idea-2/investigating_adaptive_prompt_decomposition_to_enhance_coherent_long-range_code_generation.pdf", + "paper_name": "investigating_adaptive_prompt_decomposition_to_enhance_coherent_long-range_code_generation", + "success": true, + "idea": { + "content": "**Description:**\nThis research explores whether adaptive prompt decomposition can significantly improve the coherence and accuracy of long-range code generation using large language models (LLMs).\n\n**Impact:**\nAs software projects grow in complexity, generating coherent and accurate code over long ranges becomes critical. Current LLMs often struggle with maintaining coherence and context over extended sequences, leading to errors and inefficiencies. Addressing this gap can significantly improve automated coding tools, enhancing productivity for developers. Recent literature, such as 'Scaling Transformer Models for Long-Range Sequence Tasks' and the demand for 'Automated End-to-End Software Development', highlight the growing need for solutions in this space.\n\n**Feasibility:**\n(1) Maintaining context over long sequences is inherently challenging due to the limited memory and attention span of current models, often leading to context drift. (2) Existing models are typically trained on static prompts, lacking adaptability to complex and evolving code structures. (3) Simple decomposition techniques can fragment the sequence, disrupting the logical flow and reducing overall coherence in generated code.\n\n**Novelty:**\nPrevious works in code generation have primarily focused on improving model architecture or increasing model size to handle long-range tasks. However, these approaches often lead to increased computational costs and only marginal improvements in coherence. Static prompt techniques, such as fixed-size windowing, fail to adapt dynamically to varying code structures and contexts. Our approach introduces adaptive prompt decomposition, which tailors prompt segmentation based on the code's structural and contextual needs. This method addresses the limitations of static approaches by dynamically adjusting to maintain coherence and context, a novel direction unexplored in prior research.", + "originalData": { + "Approach": "Our approach employs a dynamic algorithm that analyzes the structure of the code and the model's attention patterns to adaptively segment prompts. (1) To maintain context over long sequences, we develop a context-aware segmentation that adjusts the prompt length based on contextual requirements, ensuring that the model retains relevant information across boundaries. (2) For adaptability in complex structures, we introduce a feedback loop where the model assesses coherence after each segment generation, dynamically adjusting the subsequent prompt structure. 
(3) To address fragmentation, our method uses semantic analysis to ensure logical continuity across decomposed prompts, preserving the flow of code.", + "Description": "This research explores whether adaptive prompt decomposition can significantly improve the coherence and accuracy of long-range code generation using large language models (LLMs).", + "Difficulty": "(1) Maintaining context over long sequences is inherently challenging due to the limited memory and attention span of current models, often leading to context drift. (2) Existing models are typically trained on static prompts, lacking adaptability to complex and evolving code structures. (3) Simple decomposition techniques can fragment the sequence, disrupting the logical flow and reducing overall coherence in generated code.", + "Experiment": { + "Dataset": { + "Load_Command": "load_dataset(\"openai_humaneval\")", + "Name": "HumanEval", + "Preprocessing": "Use raw prompt as context and canonical solution as target; no TF-IDF.", + "Size": "\u2248164 tasks", + "Splits": "Deterministic pseudo split: 70/15/15" + }, + "Metric": { + "Justification": "Unit tests assess functional correctness; static checks capture syntactic integrity and coherence.", + "Primary": "pass@k (k\u2208{1,5,10})", + "Secondary": "AST Parse Rate; Undefined-Ref Count; Text Similarity (difflib)" + }, + "Model": { + "Hidden_Units": 64, + "Input_Dimension": 512, + "Output_Dimension": 512, + "Total_Parameters": "<= 100k", + "Type": "Single-Layer GRU" + }, + "Sanity_Check": { + "Dataset_Size_Limit": "Confirmed", + "Model_Parameter_Count": "Confirmed <= 100k", + "No_Inline_Comments": "Confirmed" + } + }, + "ExperimentTable": "| Component | Specification | Justification / Rationale | Status |\n|--------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------|\n| Model Architecture | Single-Layer GRU with 64 hidden units, 512 input/output dimensions. Total parameter count \u2264 100k. | GRUs can maintain sequence information with minimal complexity compared to transformers. This setup enables testing prompt decomposition without large models. | |\n| Dataset | Name: HumanEval, Size: 5000 train / 2000 val / 2000 test, Preprocessing: Tokenize, pad/truncate to 512 tokens, TF-IDF vectorization, Load with: datasets.load_dataset('HumanEval') | HumanEval is a representative NLP dataset that provides structured text for evaluating coherence and context retention in generated sequences. | |\n| Baselines | 1. Static prompt (fixed-window) techniques (Vaswani et al., 2017) \\n 2. Heuristic prompt splitting \\n 3. Bag-of-words based static segmenting | These baselines provide a comparison for adaptive techniques and are often used for long-sequence generation tasks. | |\n| Training Setup | Optimizer: Adam, Learning Rate: 0.001, Batch Size: 32, Epochs: 10, Hardware: Standard CPU/GPU setup | These settings are standard for training lightweight models and allow for efficient training within computational constraints. | |\n| Evaluation Metrics | Primary: pass@k (1/5/10), Secondary: AST Parse / Undefined-Ref / TextSim | BLEU measures n-gram precision which is important for coherence, while ROUGE-L focuses on sequence recall, assessing structural fidelity. 
| |\n| Hyperparameters | Learning Rate: 0.001, GRU Hidden Units: 64, Sequence Length: 512 | These hyperparameters are chosen to balance model simplicity with the ability to capture sequence dependencies effectively. | |\n| **Sanity Checks** | Dataset subsampling strategy confirming \u22645,000 train / \u22642,000 val/test examples. \\n Model parameter count estimate (\u2264100k parameters). \\n JSON contains no inline comments or expressions. | Ensures that the experimental plan adheres to all stated constraints and that implementation is feasible within given limits. | |", + "Feasibility": 7, + "Importance": "As software projects grow in complexity, generating coherent and accurate code over long ranges becomes critical. Current LLMs often struggle with maintaining coherence and context over extended sequences, leading to errors and inefficiencies. Addressing this gap can significantly improve automated coding tools, enhancing productivity for developers. Recent literature, such as 'Scaling Transformer Models for Long-Range Sequence Tasks' and the demand for 'Automated End-to-End Software Development', highlight the growing need for solutions in this space.", + "IntentAlignment": 9, + "Interestingness": 8, + "Name": "AdaptivePromptAI", + "Novelty": 8, + "NoveltyComparison": "Previous works in code generation have primarily focused on improving model architecture or increasing model size to handle long-range tasks. However, these approaches often lead to increased computational costs and only marginal improvements in coherence. Static prompt techniques, such as fixed-size windowing, fail to adapt dynamically to varying code structures and contexts. Our approach introduces adaptive prompt decomposition, which tailors prompt segmentation based on the code's structural and contextual needs. This method addresses the limitations of static approaches by dynamically adjusting to maintain coherence and context, a novel direction unexplored in prior research.", + "Problem": "Can adaptive prompt decomposition enhance the coherence and accuracy of long-range code generation by LLMs?", + "Score": 8, + "Title": "Investigating Adaptive Prompt Decomposition to Enhance Coherent Long-Range Code Generation", + "is_experimental": true + }, + "title": "Adaptivepromptai", + "id": "idea-2" + }, + "_cached_idea_index": 2 + }, + { + "pdf_path": "/api/files/papers/idea-3/exploring_adaptive_prompt_decomposition_for_enhanced_coherent_long-range_code_generation.pdf", + "local_pdf_path": "/Users/4r5t/Desktop/Workspace/tiny-scientist/generated/papers/idea-3/exploring_adaptive_prompt_decomposition_for_enhanced_coherent_long-range_code_generation.pdf", + "paper_name": "exploring_adaptive_prompt_decomposition_for_enhanced_coherent_long-range_code_generation", + "success": true, + "idea": { + "content": "**Description:**\nThis research investigates whether adaptive prompt decomposition can significantly improve the coherence and quality of long-range code generation by large language models (LLMs).\n\n**Impact:**\nThe ability of LLMs to generate coherent and useful code over long sequences is a growing demand in the field of automated software development and AI-assisted coding. As LLMs become more capable, the expectation for them to handle more complex and lengthy coding tasks increases. However, maintaining coherence over long spans is challenging due to the limitations of current models in managing context effectively. 
Addressing this problem is vital to advance AI's practical applications in software engineering, particularly in scenarios requiring extended codebases or scripts, such as automated testing or code refactoring. Recent literature, including studies on LLM's use in code generation (e.g., Codex, GitHub Copilot), highlights the struggle for coherence over extended outputs, showcasing a gap that adaptive prompt decomposition could fill.\n\n**Feasibility:**\n(1) Managing context and coherence over long-range text generation is inherently difficult due to the exponential growth of possible combinations and dependencies. (2) Existing methods struggle with balancing between retaining necessary context and reducing computational load, often leading to loss of relevant information or excessive computation. (3) Adaptive techniques require sophisticated model tuning to dynamically adjust prompting strategies based on the task and input, which is both computationally intensive and complex to implement reliably.\n\n**Novelty:**\nWhile previous research on large language models, such as OpenAI's Codex, has explored code generation capabilities, these models often fall short when tasked with generating coherent long-range code. Existing methods typically rely on static prompt strategies that do not adapt to the context or complexity of the task, leading to a drop in coherence and relevance as the length of the generated code increases. In contrast, our approach, adaptive prompt decomposition, introduces a dynamic mechanism that adjusts the prompt strategy based on ongoing context analysis. This method leverages recent advances in reinforcement learning and context window optimization to maintain coherence without overwhelming computation requirements. Unlike traditional methods that treat prompt decomposition as a fixed pre-processing step, our adaptive model iteratively updates its strategy, allowing for more fine-tuned and context-aware prompt modifications. This adaptability is key to solving the previously unmet challenge of maintaining coherence in long-range code generation.", + "originalData": { + "Approach": "The core algorithm of our proposed method involves dynamically adjusting prompt decomposition strategies using a reinforcement learning framework. This system evaluates the coherence and relevance of generated code segments in real-time, and modifies the decomposition strategy based on feedback from these evaluations. To tackle (1), our method uses a context-aware feedback loop that assesses coherence metrics and modifies prompt strategies dynamically. For (2), we incorporate optimization techniques that prioritize context retention while minimizing computational overhead by leveraging model parallelism and efficient data structures. Lastly, to address (3), we employ advanced tuning methods that adjust model parameters based on task complexity and input characteristics, ensuring that the adaptive mechanism remains robust and reliable across various code generation scenarios.", + "Description": "This research investigates whether adaptive prompt decomposition can significantly improve the coherence and quality of long-range code generation by large language models (LLMs).", + "Difficulty": "(1) Managing context and coherence over long-range text generation is inherently difficult due to the exponential growth of possible combinations and dependencies. 
(2) Existing methods struggle with balancing between retaining necessary context and reducing computational load, often leading to loss of relevant information or excessive computation. (3) Adaptive techniques require sophisticated model tuning to dynamically adjust prompting strategies based on the task and input, which is both computationally intensive and complex to implement reliably.", + "Experiment": { + "Dataset": { + "Load_Command": "load_dataset(\"openai_humaneval\")", + "Name": "HumanEval", + "Preprocessing": "Use raw prompt as context and canonical solution as target; no TF-IDF.", + "Size": "\u2248164 tasks", + "Splits": "Deterministic pseudo split: 70/15/15" + }, + "Metric": { + "Justification": "BLEU and ROUGE are standard metrics for evaluating the coherence and relevance of generated text sequences", + "Primary": "BLEU", + "Secondary": "ROUGE" + }, + "Model": { + "Architecture": "Single-layer GRU", + "Hidden_Units": 64, + "Input_Dimensions": 500, + "Output_Dimensions": 20, + "Total_Parameters": 31364 + } + }, + "ExperimentTable": "| Component | Specification | Justification / Rationale | Status |\n|---|---|---|---|\n| Task | Code Generation \u2192 Functional Correctness on unit tests (no training) | Aligns with the idea\u2019s core goal: evaluate function\u2011level code generation by unit\u2011test correctness; inference\u2011only to avoid off\u2011topic supervised training. | planned |\n| Dataset | HumanEval (default v1.2), ~164 problems, official prompts + tests | HumanEval is a standard code\u2011gen benchmark using unit tests to measure correctness; directly matches \u201clong\u2011horizon functional consistency\u201d. | planned |\n| Metric | pass@1, pass@k (k\u2208{5,10} configurable), tests_passed_ratio | pass@k is the primary metric; add per\u2011task passed\u2011tests ratio for fine\u2011grained analysis. | planned |\n| Model Interface | HuggingFace Transformers Causal LM (`AutoModelForCausalLM` + `AutoTokenizer`), Torch dtype auto (bf16/fp16/fp32) | Pure Torch inference and easy swap between OSS code models (CodeLlama, StarCoder2, Qwen2.5\u2011Coder, etc.). | planned |\n| Decoding | temperature\u2208{0.2,0.6}; top_p=0.9; max_new_tokens=256\u2013512; stop_tokens=[\"\\n\\n\", \"\\nclass\", \"\\ndef\"] | Two temperature tiers (conservative/exploratory). Stop sequences to cut off trailing classes/defs. | planned |\n| Eval Harness | For each task, concatenate prompt + completion, write to a temp file, run official/compatible tests; capture timeout/exceptions | Mirrors common HumanEval practice; reproducible and portable. | planned |\n| Safety & Sandboxing | Per\u2011task Python subprocess with timeout (10\u201330s) and resource limits | Prevents infinite loops/harmful calls from affecting the host process. | planned |\n| Reproducibility | `torch.manual_seed`; fixed decoding RNG seeds; optional deterministic kernels | Stable runs to compare models/temperatures/k values. | planned |\n| Hardware | Single GPU A10/A100/4090 (or CPU for small models), batch_size=1 | Inference\u2011only; memory\u2011aware settings. | planned |\n| Ablations | Temperature (0.2 vs 0.6), k (1/5/10), stop sequences, max length | Core axes most correlated with HumanEval outcomes. | planned |\n| Output Artifacts | `predictions.jsonl` (rows: {task_id, prompt_hash, completion, passed}), `scores.json` (pass@1/5/10, tests_passed_ratio stats) | Structured outputs for downstream analysis/visualization. 
| planned |\n| Logging | Per\u2011task logs (latency, exceptions, passed tests) + summary table | Quick failure localization and stability checks. | planned |", + "Feasibility": 7, + "Importance": "The ability of LLMs to generate coherent and useful code over long sequences is a growing demand in the field of automated software development and AI-assisted coding. As LLMs become more capable, the expectation for them to handle more complex and lengthy coding tasks increases. However, maintaining coherence over long spans is challenging due to the limitations of current models in managing context effectively. Addressing this problem is vital to advance AI's practical applications in software engineering, particularly in scenarios requiring extended codebases or scripts, such as automated testing or code refactoring. Recent literature, including studies on LLM's use in code generation (e.g., Codex, GitHub Copilot), highlights the struggle for coherence over extended outputs, showcasing a gap that adaptive prompt decomposition could fill.", + "IntentAlignment": 9, + "Interestingness": 8, + "Name": "Adaptive Prompt Decomposition", + "Novelty": 8, + "NoveltyComparison": "While previous research on large language models, such as OpenAI's Codex, has explored code generation capabilities, these models often fall short when tasked with generating coherent long-range code. Existing methods typically rely on static prompt strategies that do not adapt to the context or complexity of the task, leading to a drop in coherence and relevance as the length of the generated code increases. In contrast, our approach, adaptive prompt decomposition, introduces a dynamic mechanism that adjusts the prompt strategy based on ongoing context analysis. This method leverages recent advances in reinforcement learning and context window optimization to maintain coherence without overwhelming computation requirements. Unlike traditional methods that treat prompt decomposition as a fixed pre-processing step, our adaptive model iteratively updates its strategy, allowing for more fine-tuned and context-aware prompt modifications. This adaptability is key to solving the previously unmet challenge of maintaining coherence in long-range code generation.", + "Problem": "Can adaptive prompt decomposition techniques improve the coherence and quality of long-range code generated by large language models?", + "Score": 8, + "Title": "Exploring Adaptive Prompt Decomposition for Enhanced Coherent Long-Range Code Generation", + "is_experimental": true + }, + "title": "Adaptive prompt decomposition", + "id": "idea-3" + }, + "_cached_idea_index": 3 + } + ], + "review": [ + { + "pdf_path": "/api/files/papers/idea-1/investigating_adaptive_prompt_decomposition_for_improved_long-range_coherence_in_code_generation.pdf", + "review": { + "Summary": "The paper introduces an innovative method for enhancing coherence in long-range code generation by large language models through adaptive prompt decomposition. The approach involves dynamically segmenting prompts and incorporating a feedback loop for coherence evaluation. While it achieves perfect syntactic and semantic coherence, it struggles with functional correctness.", + "Strengths": [ + "Addresses a key challenge in achieving coherence for long-range code generation.", + "Innovative approach using adaptive segmentation and feedback loops.", + "Demonstrates improvements in syntactic and semantic coherence." 
+ ], + "Weaknesses": [ + "Lacks a clear demonstration of improvements over existing techniques.", + "Functional correctness is not achieved, with a pass@1 metric of 0.0.", + "Does not integrate unit tests, which is a significant oversight.", + "Evaluation metrics are limited to syntactic and semantic coherence." + ], + "Originality": 3, + "Quality": 2, + "Clarity": 3, + "Significance": 2, + "Questions": [ + "How does the proposed method compare quantitatively against existing methods?", + "What steps can be taken to address the functional correctness issue?", + "Why were unit tests not integrated into the evaluation process?" + ], + "Limitations": [ + "The method struggles with functional correctness.", + "Lack of integration of unit tests for evaluating code functionality." + ], + "Ethical Concerns": false, + "Soundness": 2, + "Presentation": 3, + "Contribution": 2, + "Overall": 5, + "Confidence": 4, + "Decision": "Reject" + }, + "success": true, + "_cached_idea_index": 1 + }, + { + "pdf_path": "/api/files/papers/idea-2/investigating_adaptive_prompt_decomposition_to_enhance_coherent_long-range_code_generation.pdf", + "review": { + "Summary": "The paper introduces an adaptive prompt decomposition technique aimed at improving code generation with large language models. It utilizes a single-layer GRU model to maintain context and coherence over long code sequences. The technique achieves impressive metrics on AST Parse Rate and Undefined Reference Count in the HumanEval dataset, but it underperforms in functional correctness as demonstrated by low pass@1 scores.", + "Strengths": [ + "Innovative approach to prompt decomposition in code generation.", + "Resource-efficient method using a single-layer GRU model." + ], + "Weaknesses": [ + "Poor functional correctness with a pass@1 score of 0.0.", + "Lacks clarity in illustrating significant advantages over existing models.", + "Inadequate discussion on potential negative societal impacts and ethical issues.", + "Limited contribution and practical significance due to low performance." + ], + "Originality": 2, + "Quality": 2, + "Clarity": 2, + "Significance": 2, + "Questions": [ + "How does the proposed method compare to transformer-based models in terms of coherence and functional correctness?", + "What are the potential negative societal impacts of using this method for code generation?", + "Can the authors provide more detailed explanations of the adaptive prompt decomposition technique?" + ], + "Limitations": [ + "Low functional correctness in generated code.", + "Risk of generating incorrect code with potentially negative consequences." + ], + "Ethical Concerns": false, + "Soundness": 2, + "Presentation": 2, + "Contribution": 2, + "Overall": 3, + "Confidence": 4, + "Decision": "Reject" + }, + "success": true, + "_cached_idea_index": 2 + }, + { + "pdf_path": "/api/files/papers/idea-3/exploring_adaptive_prompt_decomposition_for_enhanced_coherent_long-range_code_generation.pdf", + "review": { + "Summary": "The paper explores adaptive prompt decomposition strategies to improve the coherence of long-range code generation using large language models. It employs reinforcement learning to refine prompt strategies dynamically. However, experimental results show no improvement in coherence, with BLEU and ROUGE-L scores remaining at zero across all runs.", + "Strengths": [ + "The paper addresses a relevant problem in the field of AI-assisted coding.", + "It attempts to integrate reinforcement learning for dynamic prompt adaptation." 
+ ], + "Weaknesses": [ + "The methodology did not result in any measurable improvement in code coherence.", + "The experimental results consistently show zero scores, questioning the validity of the approach.", + "The paper lacks a convincing demonstration of the novelty or practical significance of the contributions.", + "The evaluation metrics used may not be suitable for capturing the intended improvements." + ], + "Originality": 2, + "Quality": 1, + "Clarity": 2, + "Significance": 1, + "Questions": [ + "Why do the results consistently show zero improvement in BLEU and ROUGE-L scores?", + "Could there be issues with the model architecture or experimental setup that might have affected the results?" + ], + "Limitations": [ + "The inability to improve code coherence suggests limitations in the approach.", + "The reliance on standard metrics like BLEU and ROUGE-L might be inappropriate for this task." + ], + "Ethical Concerns": false, + "Soundness": 1, + "Presentation": 2, + "Contribution": 1, + "Overall": 2, + "Confidence": 4, + "Decision": "Reject" + }, + "success": true, + "_cached_idea_index": 3 + } + ], + "logs": { + "configure": [ + "[record] configure completed" + ], + "prompts": [ + "[record] fetched prompts snapshot" + ], + "generate_initial": [ + "[record] generated 3 initial ideas" + ], + "evaluate": [ + "[record] evaluated 3 ideas", + "[record] evaluated 6 ideas" + ], + "generate_children": [ + "[record] generated 3 child ideas from idea-1" + ], + "code": [ + "[record] generated code for idea-1", + "[record] copied experiment artefacts from /Users/4r5t/Desktop/Workspace/tiny-scientist/generated/experiments -> /Users/4r5t/Desktop/Workspace/tiny-scientist/frontend/demo_cache/generated/experiments/idea-1", + "[record] generated code for idea-2", + "[record] copied experiment artefacts from /Users/4r5t/Desktop/Workspace/tiny-scientist/generated/experiments -> /Users/4r5t/Desktop/Workspace/tiny-scientist/frontend/demo_cache/generated/experiments/idea-2", + "[record] generated code for idea-3", + "[record] copied experiment artefacts from /Users/4r5t/Desktop/Workspace/tiny-scientist/generated/experiments -> /Users/4r5t/Desktop/Workspace/tiny-scientist/frontend/demo_cache/generated/experiments/idea-3" + ], + "write": [ + "[record] generated paper for idea-1", + "[record] captured paper artefact at /Users/4r5t/Desktop/Workspace/tiny-scientist/frontend/demo_cache/generated/papers/idea-1/investigating_adaptive_prompt_decomposition_for_improved_long-range_coherence_in_code_generation.pdf", + "[record] generated paper for idea-2", + "[record] captured paper artefact at /Users/4r5t/Desktop/Workspace/tiny-scientist/frontend/demo_cache/generated/papers/idea-2/investigating_adaptive_prompt_decomposition_to_enhance_coherent_long-range_code_generation.pdf", + "[record] generated paper for idea-3", + "[record] captured paper artefact at /Users/4r5t/Desktop/Workspace/tiny-scientist/frontend/demo_cache/generated/papers/idea-3/exploring_adaptive_prompt_decomposition_for_enhanced_coherent_long-range_code_generation.pdf", + "[record] regenerated paper for idea-1", + "[record] regenerated paper for idea-2", + "[record] regenerated paper for idea-3" + ], + "review": [ + "[record] regenerated review for idea-1", + "[record] regenerated review for idea-2", + "[record] regenerated review for idea-3" + ] + } +} diff --git a/frontend/package.json b/frontend/package.json index bd5d8fa2..076fb00d 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -2,7 +2,6 @@ "name": "hypo-eval", "version": 
"0.1.0", "private": true, - "proxy": "http://localhost:5000", "dependencies": { "@monaco-editor/react": "^4.7.0", "@testing-library/jest-dom": "^5.17.0", diff --git a/frontend/src/components/LogDisplay.jsx b/frontend/src/components/LogDisplay.jsx index 5d22c2a4..a7054bcb 100644 --- a/frontend/src/components/LogDisplay.jsx +++ b/frontend/src/components/LogDisplay.jsx @@ -1,15 +1,55 @@ -import React, { useState, useEffect, useRef } from 'react'; +import React, { useState, useEffect, useMemo, useRef } from 'react'; import { io } from 'socket.io-client'; +const resolveSocketBaseURL = (isDemo) => { + if (typeof window === 'undefined') { + if (isDemo) { + return process.env.REACT_APP_DEMO_SOCKET_BASE_URL || 'http://localhost:5001'; + } + return process.env.REACT_APP_SOCKET_BASE_URL || 'http://localhost:5000'; + } + + if (isDemo && process.env.REACT_APP_DEMO_SOCKET_BASE_URL) { + return process.env.REACT_APP_DEMO_SOCKET_BASE_URL; + } + if (!isDemo && process.env.REACT_APP_SOCKET_BASE_URL) { + return process.env.REACT_APP_SOCKET_BASE_URL; + } + + const { protocol, hostname, port } = window.location; + + if (port === '3000') { + const backendPort = isDemo + ? process.env.REACT_APP_DEMO_SOCKET_PORT || '5001' + : process.env.REACT_APP_SOCKET_PORT || '5000'; + return `${protocol}//${hostname}:${backendPort}`; + } + + return `${protocol}//${hostname}${port ? `:${port}` : ''}`; +}; + const LogDisplay = ({ isVisible, onToggle }) => { const [logs, setLogs] = useState([]); const [isConnected, setIsConnected] = useState(false); const socketRef = useRef(null); const logsEndRef = useRef(null); + const isDemoPage = + typeof window !== 'undefined' && + (window.location.pathname === '/demo' || window.location.pathname.startsWith('/demo/')); + const socketPath = + (isDemoPage ? process.env.REACT_APP_DEMO_SOCKET_PATH : process.env.REACT_APP_SOCKET_PATH) || + (isDemoPage ? 
'/demo/socket.io' : '/socket.io'); + const socketUrl = useMemo(() => resolveSocketBaseURL(isDemoPage), [isDemoPage]); useEffect(() => { - // Always maintain socket connection when component mounts - socketRef.current = io('http://localhost:5000'); + socketRef.current = io(socketUrl, { + path: socketPath, + transports: ['websocket', 'polling'], + reconnection: true, + reconnectionDelay: 2000, + reconnectionDelayMax: 6000, + timeout: 10000, + }); socketRef.current.on('connect', () => { setIsConnected(true); @@ -26,8 +66,9 @@ const LogDisplay = ({ isVisible, onToggle }) => { return () => { socketRef.current?.disconnect(); + socketRef.current = null; }; - }, []); + }, [socketUrl, socketPath]); useEffect(() => { // Auto-scroll to bottom when new logs arrive diff --git a/frontend/src/components/TopNav.jsx b/frontend/src/components/TopNav.jsx index 047deb06..e59c55fc 100644 --- a/frontend/src/components/TopNav.jsx +++ b/frontend/src/components/TopNav.jsx @@ -6,6 +6,27 @@ import React from 'react'; * @param {Function} setShowTree */ const TopNav = ({ currentView, setCurrentView, showCodeView = false }) => { + const FALLBACK_DEMO_INTENT = + 'Adaptive Prompt Decomposition for Coherent Long-Range Code Generation'; + const demoIntentPrefill = + process.env.REACT_APP_DEMO_INTENT || FALLBACK_DEMO_INTENT; + const demoTarget = process.env.REACT_APP_DEMO_URL || '/demo'; + const liveTarget = process.env.REACT_APP_LIVE_URL || '/'; + const resolveHref = (target) => { + if (/^https?:\/\//i.test(target)) { + return target; + } + if (target.startsWith('/')) { + return `${window.location.origin}${target}`; + } + return `${window.location.origin}/${target}`; + }; + const isDemoLocation = + typeof window !== 'undefined' && + (window.location.pathname === '/demo' || + window.location.pathname.startsWith('/demo/')); + const modeButtonLabel = isDemoLocation ? 'Live Mode' : 'Demo Mode'; + /* ---------- SVG 图标 ---------- */ const overviewIcon = ( { - {/* GitHub Logo */} - e.target.style.color = '#fff'} - onMouseLeave={(e) => e.target.style.color = '#A0AEC0'} - > - {githubIcon} - +
+ {/* Demo Mode Button */} + + + {/* GitHub Logo */} + (e.target.style.color = '#fff')} + onMouseLeave={(e) => (e.target.style.color = '#A0AEC0')} + > + {githubIcon} + +
); }; diff --git a/frontend/src/components/TreePlotVisualization.jsx b/frontend/src/components/TreePlotVisualization.jsx index dafc82dc..360cf43b 100644 --- a/frontend/src/components/TreePlotVisualization.jsx +++ b/frontend/src/components/TreePlotVisualization.jsx @@ -8,6 +8,110 @@ import IdeaCard from './IdeaCard'; import IdeaFactorsAndScoresCard from './IdeaFactorsAndScoresCard'; import LogDisplay from './LogDisplay'; +const normalizeExperimentDir = (dir) => { + if (!dir) return 'experiments'; + let sanitized = dir.trim(); + if (!sanitized) return 'experiments'; + sanitized = sanitized.replace(/\\/g, '/'); + sanitized = sanitized.replace(/^\.?\/+/, ''); + if (sanitized.startsWith('generated/')) { + sanitized = sanitized.slice('generated/'.length); + } + return sanitized || 'experiments'; +}; + +const buildExperimentFileUrl = (dir, fileName, apiBase = '/api') => { + const safeDir = normalizeExperimentDir(dir); + const encodedDir = safeDir + .split('/') + .filter(Boolean) + .map(encodeURIComponent) + .join('/'); + return `${apiBase}/files/${encodedDir}/${encodeURIComponent(fileName)}`; +}; + +const DEFAULT_RUN_DISCOVERY_LIMIT = 10; + +const fetchTextFile = async (dir, filePath, apiBase = '/api') => { + const url = buildExperimentFileUrl(dir, filePath, apiBase); + try { + console.log(`Attempting to fetch ${filePath} from ${url}`); + const response = await fetch(url, { credentials: 'include' }); + if (!response.ok) { + console.log(`Fetch skipped for ${filePath}: ${response.status} ${response.statusText}`); + return null; + } + const data = await response.json(); + if (typeof data?.content === 'string') { + return data.content; + } + } catch (err) { + console.error(`Error fetching ${filePath}:`, err); + } + return null; +}; + +const getRunLabel = (runName, index = 0) => { + if (!runName) { + return `Run ${index + 1}`; + } + if (runName.startsWith('run_')) { + const parts = runName.split('_'); + if (parts.length > 1 && parts[1]) { + return `Run ${parts[1]}`; + } + } + return runName; +}; + +const getFileIcon = (fileName = '') => { + if (fileName.endsWith('.py')) return '🐍'; + if (fileName.endsWith('.json')) return '📊'; + if (fileName.endsWith('.txt')) return '📄'; + return '📝'; +}; + +const normalizeCodeResult = (data) => { + if (!data || typeof data !== 'object') { + return data; + } + const safeDir = data.experiment_dir ? normalizeExperimentDir(data.experiment_dir) : null; + return { ...data, experiment_dir: safeDir }; +}; + +const formatMetricValue = (value) => { + if (typeof value === 'number' && Number.isFinite(value)) { + const fixed = value.toFixed(4); + return parseFloat(fixed).toString(); + } + return String(value); +}; + +const sanitizeS2ApiKey = (key) => (typeof key === 'string' ? 
key.trim() : ''); + +const getStoredS2ApiKey = () => { + if (typeof window === 'undefined') return ''; + try { + return window.localStorage.getItem('s2_api_key') || ''; + } catch (err) { + console.error('Failed to read Semantic Scholar key from storage:', err); + return ''; + } +}; + +const persistS2ApiKey = (key) => { + if (typeof window === 'undefined') return; + try { + if (key) { + window.localStorage.setItem('s2_api_key', key); + } else { + window.localStorage.removeItem('s2_api_key'); + } + } catch (err) { + console.error('Failed to persist Semantic Scholar key:', err); + } +}; + // Helper components defined outside the main component to preserve state const ContextAndGenerateCard = ({ @@ -398,7 +502,7 @@ const TreePlotVisualization = () => { const [isAddingCustom, setIsAddingCustom] = useState(false); const [customIdea, setCustomIdea] = useState({ title: '', content: '' }); // *** 新增:用于主界面模型选择和api-key输入 - const [selectedModel, setSelectedModel] = useState('deepseek-chat'); + const [selectedModel, setSelectedModel] = useState('gpt-4o'); const [apiKey, setApiKey] = useState(''); const [isConfigured, setIsConfigured] = useState(false); const [configError, setConfigError] = useState(''); @@ -438,7 +542,7 @@ const TreePlotVisualization = () => { const [codeFileName, setCodeFileName] = useState('experiment.py'); const [activeCodeTab, setActiveCodeTab] = useState('experiment.py'); const [experimentFiles, setExperimentFiles] = useState({}); - const [consoleOutput, setConsoleOutput] = useState(''); + const [experimentFileList, setExperimentFileList] = useState([]); const [experimentRuns, setExperimentRuns] = useState([]); const [isRunningExperiment, setIsRunningExperiment] = useState(false); const [pdfComments, setPdfComments] = useState([]); @@ -449,6 +553,79 @@ const TreePlotVisualization = () => { const [isReviewing, setIsReviewing] = useState(false); const [rightPanelTab, setRightPanelTab] = useState('comments'); // 'comments' or 'review' + const isDemoRoute = + typeof window !== 'undefined' && + (window.location.pathname === '/demo' || + window.location.pathname.startsWith('/demo/')); + const FALLBACK_DEMO_INTENT = + 'Adaptive Prompt Decomposition for Coherent Long-Range Code Generation'; + const demoDefaultIntent = + (typeof process !== 'undefined' && process.env.REACT_APP_DEMO_INTENT) || + FALLBACK_DEMO_INTENT; + const apiBase = isDemoRoute ? 
'/demo/api' : '/api'; + const fileBase = `${apiBase}/files`; + + const buildFileUrl = (dir, fileName) => buildExperimentFileUrl(dir, fileName, apiBase); + const fetchTextFileWithBase = (dir, fileName) => fetchTextFile(dir, fileName, apiBase); + + useEffect(() => { + if (isDemoRoute) { + setCurrentView('exploration'); + setIsConfigured(true); + } + }, [isDemoRoute]); + + useEffect(() => { + if (!isDemoRoute) { + return; + } + let desiredIntent = demoDefaultIntent; + if (typeof window !== 'undefined') { + try { + const stored = window.sessionStorage.getItem('demo_intent_prefill'); + if (stored) { + desiredIntent = stored; + window.sessionStorage.removeItem('demo_intent_prefill'); + } + } catch (_) { + // ignore storage errors + } + } + if (desiredIntent && analysisIntent !== desiredIntent) { + setAnalysisIntent(desiredIntent); + } + }, [isDemoRoute, analysisIntent, demoDefaultIntent]); + + const experimentFileCount = experimentFileList.length; + const hasExperimentFiles = experimentFileCount > 0; + + + const baseFileItems = experimentFileList.filter((item) => item.group === 'base'); + const runFileGroups = (() => { + const map = new Map(); + experimentFileList.forEach((item) => { + if (item.group !== 'run') { + return; + } + const key = item.runName || item.path; + if (!map.has(key)) { + map.set(key, { + runLabel: item.runLabel || getRunLabel(item.runName || '', map.size), + items: [], + }); + } + map.get(key).items.push(item); + }); + return Array.from(map.entries()); + })(); + + useEffect(() => { + const savedKey = sanitizeS2ApiKey(getStoredS2ApiKey()); + if (savedKey) { + setS2ApiKey(savedKey); + } + }, []); + // Track view changes const previousViewRef = useRef(currentView); // Initialize with current view @@ -515,9 +692,9 @@ const TreePlotVisualization = () => { }; // ============== 配置模型和API Key ============== const modelOptions = [ + { value: 'gpt-4o', label: 'GPT-4o' }, { value: 'deepseek-chat', label: 'DeepSeek Chat' }, { value: 'deepseek-reasoner', label: 'DeepSeek Reasoner' }, - { value: 'gpt-4o', label: 'GPT-4o' }, { value: 'o1-2024-12-17', label: 'GPT-o1' }, { value: 'claude-3-5-sonnet-20241022', label: 'Claude 3.5 Sonnet' }, ]; @@ -525,7 +702,7 @@ const TreePlotVisualization = () => { const fetchPrompts = async () => { if (isConfigured) { try { - const response = await fetch('/api/get-prompts', { credentials: 'include' }); + const response = await fetch(`${apiBase}/get-prompts`, { credentials: 'include' }); if (!response.ok) { throw new Error('Failed to fetch prompts'); // Corrected spelling } @@ -558,7 +735,7 @@ const TreePlotVisualization = () => { setOperationStatus('Configuring model...'); try { - const response = await fetch('/api/configure', { + const response = await fetch(`${apiBase}/configure`, { method: 'POST', headers: { 'Content-Type': 'application/json', @@ -596,7 +773,7 @@ const TreePlotVisualization = () => { setOperationStatus('Configuring model...'); try { - const response = await fetch('/api/configure', { + const response = await fetch(`${apiBase}/configure`, { method: 'POST', headers: { 'Content-Type': 'application/json', @@ -820,7 +997,7 @@ const TreePlotVisualization = () => { // ============== 自定义prompts============== const updateSystemPrompt = async (prompt) => { try { - const response = await fetch('/api/set-system-prompt', { + const response = await fetch(`${apiBase}/set-system-prompt`, { method: 'POST', headers: { 'Content-Type': 'application/json' }, credentials: 'include', @@ -837,7 +1014,7 @@ const TreePlotVisualization = () => { const updateCriteria 
= async (dimension, criteria) => { try { - const response = await fetch('/api/set-criteria', { + const response = await fetch(`${apiBase}/set-criteria`, { method: 'POST', headers: { 'Content-Type': 'application/json' }, credentials: 'include', @@ -1056,7 +1233,7 @@ const TreePlotVisualization = () => { intent: analysisIntent }; - const response = await fetch('/api/evaluate', { + const response = await fetch(`${apiBase}/evaluate`, { method: 'POST', headers: { 'Content-Type': 'application/json', @@ -1122,7 +1299,7 @@ const TreePlotVisualization = () => { try { // Call Flask backend - const response = await fetch('/api/generate-initial', { + const response = await fetch(`${apiBase}/generate-initial`, { method: 'POST', headers: { 'Content-Type': 'application/json', @@ -1211,7 +1388,7 @@ const TreePlotVisualization = () => { setShowLogs(true); // Show logs when generating child ideas try { - const response = await fetch('/api/generate-children', { + const response = await fetch(`${apiBase}/generate-children`, { method: 'POST', headers: { 'Content-Type': 'application/json', @@ -1284,7 +1461,7 @@ const TreePlotVisualization = () => { try { setIsGenerating(true); setOperationStatus('Modifying idea...'); - const response = await fetch('/api/modify', { + const response = await fetch(`${apiBase}/modify`, { method: 'POST', headers: { 'Content-Type': 'application/json', @@ -1362,7 +1539,7 @@ const TreePlotVisualization = () => { setError(null); try { /* ---------- ② 调用 Flask API ---------- */ - const response = await fetch('/api/merge', { + const response = await fetch(`${apiBase}/merge`, { method: 'POST', headers: { 'Content-Type': 'application/json', @@ -2223,46 +2400,177 @@ const TreePlotVisualization = () => { // Load generated files from public directory const loadGeneratedFiles = async (experimentDir) => { + const safeDir = normalizeExperimentDir(experimentDir); try { - const fileUrls = { - 'experiment.py': `/api/files/${experimentDir}/experiment.py`, - 'notes.txt': `/api/files/${experimentDir}/notes.txt`, - 'experiment_results.txt': `/api/files/${experimentDir}/experiment_results.txt`, - }; - const loadedFiles = {}; - for (const [fileName, url] of Object.entries(fileUrls)) { + const fileItems = []; + + const baseFiles = ['experiment.py', 'notes.txt', 'experiment_results.txt']; + for (const fileName of baseFiles) { + const content = await fetchTextFileWithBase(safeDir, fileName); + if (content !== null) { + loadedFiles[fileName] = content; + fileItems.push({ + path: fileName, + tabLabel: fileName, + sidebarLabel: fileName, + icon: getFileIcon(fileName), + group: 'base', + }); + } + } + + const resultContent = loadedFiles['experiment_results.txt'] || null; + let runNames = []; + if (resultContent) { try { - console.log(`Attempting to fetch: ${fileName} from ${url}`); - const response = await fetch(url); - console.log(`Response for ${fileName}:`, response.status, response.statusText); - - if (response.ok) { - const data = await response.json(); - if (data.content) { - loadedFiles[fileName] = data.content; - console.log(`Loaded ${fileName} successfully.`); + const parsed = JSON.parse(resultContent); + if (parsed && typeof parsed === 'object') { + runNames = Object.keys(parsed); + } + } catch (parseErr) { + console.error('Failed to parse experiment_results.txt:', parseErr); + } + } + if (runNames.length === 0) { + runNames = Array.from({ length: DEFAULT_RUN_DISCOVERY_LIMIT }, (_, i) => `run_${i + 1}`); + } + + const runMeta = new Map(); + await Promise.all( + runNames.map(async (runName, index) => { 
+          if (!runName) return;
+
+          const runLabel = getRunLabel(runName, index);
+          let meta = runMeta.get(runName);
+          if (!meta) {
+            meta = { runName, runLabel };
+            runMeta.set(runName, meta);
+          }
+
+          let fetchedContent = false;
+
+          const finalInfoPath = `${runName}/final_info.json`;
+          const finalInfo = await fetchTextFileWithBase(safeDir, finalInfoPath);
+          if (finalInfo !== null) {
+            loadedFiles[finalInfoPath] = finalInfo;
+            fetchedContent = true;
+            try {
+              meta.finalInfo = JSON.parse(finalInfo);
+            } catch (err) {
+              console.error(`Failed to parse ${finalInfoPath}:`, err);
+              meta.finalInfo = null;
+            }
+            meta.finalInfoPath = finalInfoPath;
+            fileItems.push({
+              path: finalInfoPath,
+              tabLabel: `${runLabel} • final_info.json`,
+              sidebarLabel: 'final_info.json',
+              icon: getFileIcon(finalInfoPath),
+              group: 'run',
+              runName,
+              runLabel,
+            });
+          }
+
+          const runScriptPath = `${runName}.py`;
+          const runScript = await fetchTextFileWithBase(safeDir, runScriptPath);
+          if (runScript !== null) {
+            loadedFiles[runScriptPath] = runScript;
+            fetchedContent = true;
+            meta.scriptPath = runScriptPath;
+            const scriptFileName = runScriptPath.split('/').pop() || runScriptPath;
+            fileItems.push({
+              path: runScriptPath,
+              tabLabel: `${runLabel} • ${scriptFileName}`,
+              sidebarLabel: scriptFileName,
+              icon: getFileIcon(runScriptPath),
+              group: 'run',
+              runName,
+              runLabel,
+            });
+          }
+
+          if (!fetchedContent) {
+            runMeta.delete(runName);
+          }
+        })
+      );
+
+      setExperimentFiles(loadedFiles);
+      setExperimentFileList(fileItems);
+
+      const runMetaMap = Object.fromEntries(runMeta.entries());
+
+      if (resultContent) {
+        try {
+          const parsed = JSON.parse(resultContent);
+          if (parsed && typeof parsed === 'object') {
+            const runs = Object.entries(parsed).map(([name, metrics], index) => {
+              const safeMetrics = typeof metrics === 'object' && metrics !== null ? metrics : {};
+              const meta = runMetaMap[name] || { runName: name, runLabel: getRunLabel(name, index) };
+              return {
+                name,
+                runLabel: meta.runLabel || getRunLabel(name, index),
+                metrics: safeMetrics,
+                success: Object.keys(safeMetrics).length > 0,
+              };
+            });
+            setExperimentRuns(runs);
          } else {
-            console.log(`Could not load ${fileName}, but this might be expected (e.g., no notes).`);
+            setExperimentRuns([]);
          }
-        } catch (err) {
-          console.error(`Error fetching ${fileName}:`, err);
+        } catch (parseErr) {
+          console.error('Failed to parse experiment_results.txt:', parseErr);
+          setExperimentRuns([]);
        }
+      } else if (Object.keys(runMetaMap).length > 0) {
+        const runs = Object.entries(runMetaMap).map(([name, meta], index) => {
+          return {
+            name,
+            runLabel: meta.runLabel || getRunLabel(name, index),
+            metrics: meta.finalInfo && typeof meta.finalInfo === 'object' ? meta.finalInfo : {},
+            success: true,
+          };
+        });
+        setExperimentRuns(runs);
+      } else {
+        setExperimentRuns([]);
      }
-      // Update state all at once
-      setExperimentFiles(prev => ({ ...prev, ...loadedFiles }));
-
-      // Set the code content for the editor if experiment.py was loaded
      if (loadedFiles['experiment.py']) {
-        setCodeContent(loadedFiles['experiment.py']);
        setActiveCodeTab('experiment.py');
+        setCodeFileName('experiment.py');
+        setCodeContent(loadedFiles['experiment.py']);
+      } else if (fileItems.length > 0) {
+        const firstItem = fileItems[0];
+        setActiveCodeTab(firstItem.path);
+        setCodeFileName(firstItem.tabLabel);
+        setCodeContent(loadedFiles[firstItem.path] || '');
+      } else {
+        setActiveCodeTab('');
+        setCodeFileName('');
+        setCodeContent(
+          `# Generated experiment code
+# Files are being generated in: ${safeDir}
+
+# Please check the directory for the actual code files.`
+        );
      }
+      return safeDir;
    } catch (err) {
-      console.log("Error loading generated files:", err);
-      setCodeContent(`# Generated experiment code\n# Files are being generated in: ${experimentDir}\n\n# Please check the directory for the actual code files.`);
+      console.log(`Error loading generated files from ${safeDir}:`, err);
+      setExperimentFiles({});
+      setExperimentFileList([]);
+      setCodeContent(
+        `# Generated experiment code
+# Files are being generated in: ${safeDir}
+
+# Please check the directory for the actual code files.`
+      );
+      setExperimentRuns([]);
+      return safeDir;
    }
  };
@@ -2281,11 +2589,11 @@ const TreePlotVisualization = () => {
      console.log("Manual file loading completed");
      // Set a fake successful result to satisfy the UI
-      setCodeResult({
+      setCodeResult(normalizeCodeResult({
        status: true,
        success: true,
        experiment_dir: "experiments"
-      });
+      }));
    } catch (err) {
      console.log("Manual file loading failed:", err);
@@ -2299,22 +2607,34 @@
      return;
    }
+    const retryIdeaId = typeof node.id === 'string' ? node.id : null;
+    const retryIdeaName =
+      node.originalData?.Name ||
+      node.originalData?.Title ||
+      node.originalData?.name ||
+      node.originalData?.title ||
+      node.title ||
+      null;
+
    console.log("Retrying code generation for idea:", node.originalData.Title);
    setIsGeneratingCode(true);
    setOperationStatus('Retrying code generation...');
    setCodeResult(null);
    setExperimentFiles({});
+    setExperimentFileList([]);
    setShowLogs(true); // Show logs when starting code generation
    try {
-      const codeResponse = await fetch('/api/code', {
+      const codeResponse = await fetch(`${apiBase}/code`, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
        },
        credentials: 'include',
        body: JSON.stringify({
-          idea: node.originalData
+          idea: node.originalData,
+          idea_id: retryIdeaId,
+          idea_name: retryIdeaName
        })
      });
@@ -2325,25 +2645,28 @@
      const codeData = await codeResponse.json();
      console.log("Retry code generation completed:", codeData);
-      setCodeResult(codeData);
-
      if (!codeData.success) {
        throw new Error(codeData.error || 'Code generation failed');
      }
+      const normalizedResult = normalizeCodeResult(codeData);
+      setCodeResult(normalizedResult);
+
      // Load generated files
      setOperationStatus('Loading generated files...');
-      await loadGeneratedFiles(codeData.experiment_dir);
+      if (normalizedResult.experiment_dir) {
+        await loadGeneratedFiles(normalizedResult.experiment_dir);
+      }
      setOperationStatus('Code generation retry completed successfully!');
    } catch (error) {
      console.error("Retry code generation failed:", error);
      setOperationStatus('Retry code generation failed: ' + error.message);
-      setCodeResult({
+      setCodeResult(normalizeCodeResult({
        success: false,
        error: error.message,
        error_details: error.message
-      });
+      }));
    } finally {
      setIsGeneratingCode(false);
    }
@@ -2356,6 +2679,15 @@
      return;
    }
+    const paperIdeaId = typeof node.id === 'string' ? node.id : null;
+    const paperIdeaName =
+      node.originalData?.Name ||
+      node.originalData?.Title ||
+      node.originalData?.name ||
+      node.originalData?.title ||
+      node.title ||
+      null;
+
    // Only require experiment directory for experimental ideas
    const isExperimental = node.originalData.is_experimental;
    if (isExperimental && !experimentDir) {
@@ -2369,17 +2701,15 @@
    setShowLogs(true); // Show logs when starting paper generation
    try {
-      // Get S2 API key from localStorage or prompt user
-      let s2ApiKey = localStorage.getItem('s2_api_key');
-      if (!s2ApiKey) {
-        s2ApiKey = prompt('Please enter your Semantic Scholar API Key:');
-        if (!s2ApiKey) {
-          throw new Error('Semantic Scholar API key is required for paper generation');
-        }
-        localStorage.setItem('s2_api_key', s2ApiKey);
-      }
+      const effectiveS2Key =
+        sanitizeS2ApiKey(s2ApiKey) || sanitizeS2ApiKey(getStoredS2ApiKey());
+      persistS2ApiKey(effectiveS2Key);
+      setS2ApiKey(effectiveS2Key);
+      console.log(
+        `Semantic Scholar API key ${effectiveS2Key ? 'detected' : 'not provided'} for paper generation`
+      );
-      const paperResponse = await fetch('/api/write', {
+      const paperResponse = await fetch(`${apiBase}/write`, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
@@ -2387,8 +2717,10 @@
        credentials: 'include',
        body: JSON.stringify({
          idea: node.originalData,
+          idea_id: paperIdeaId,
+          idea_name: paperIdeaName,
          experiment_dir: experimentDir,
-          s2_api_key: s2ApiKey
+          s2_api_key: effectiveS2Key || null
        })
      });
@@ -2447,12 +2779,18 @@
    // Check if the idea is experimental to determine S2 API key requirement
    const isExperimental = selectedNode.originalData.is_experimental === true;
-
-    // Validate Semantic Scholar API key only for non-experimental ideas
-    if (!isExperimental && !s2ApiKey.trim()) {
-      setProceedError('Semantic Scholar API key is required for non-experimental ideas');
-      return;
-    }
+    const ideaId = typeof selectedNode.id === 'string' ? selectedNode.id : null;
+    const ideaName =
+      selectedNode.originalData?.Name ||
+      selectedNode.originalData?.Title ||
+      selectedNode.originalData?.name ||
+      selectedNode.originalData?.title ||
+      selectedNode.title ||
+      null;
+
+    const sanitizedS2Key = sanitizeS2ApiKey(s2ApiKey);
+    persistS2ApiKey(sanitizedS2Key);
+    setS2ApiKey(sanitizedS2Key);
    setShowProceedConfirm(false);
    setProceedError(null);
@@ -2498,7 +2836,7 @@
      console.log("Checking backend configuration...");
      setOperationStatus('Checking configuration...');
-      const configCheck = await fetch('/api/get-prompts', {
+      const configCheck = await fetch(`${apiBase}/get-prompts`, {
        credentials: 'include'
      });
@@ -2511,7 +2849,7 @@
      }
      // Auto-reconfigure the backend
-      const reconfigResponse = await fetch('/api/configure', {
+      const reconfigResponse = await fetch(`${apiBase}/configure`, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
@@ -2545,14 +2883,16 @@
      const controller = new AbortController();
      const timeoutId = setTimeout(() => controller.abort(), 300000); // 5 minutes timeout
-      const codeResponse = await fetch('/api/code', {
+      const codeResponse = await fetch(`${apiBase}/code`, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
        },
        credentials: 'include',
        body: JSON.stringify({
-          idea: selectedNode.originalData
+          idea: selectedNode.originalData,
+          idea_id: ideaId,
+          idea_name: ideaName
        }),
        signal: controller.signal
      });
@@ -2566,7 +2906,6 @@
      codeData = await codeResponse.json();
      console.log("Code generation completed:", codeData);
-      setCodeResult(codeData);
      if (!codeData.success) {
        throw new Error(codeData.error || 'Code generation failed');
@@ -2580,9 +2919,9 @@
      setOperationStatus('Connection lost, checking if code generation completed...');
      // Single check for existing files (backend may have completed despite connection issue)
-      const expectedFileUrl = '/api/files/experiments/experiment.py';
+      const expectedFileUrl = buildFileUrl('experiments', 'experiment.py');
      try {
-        const fileCheck = await fetch(expectedFileUrl);
+        const fileCheck = await fetch(expectedFileUrl, { credentials: 'include' });
        if (fileCheck.ok) {
          const fileData = await fileCheck.json();
          if (fileData.content && fileData.content.length > 50) {
@@ -2608,12 +2947,17 @@
      }
      // Store results
-      setCodeResult(codeData);
-      const finalExperimentDir = codeData.experiment_dir;
+      const normalizedResult = normalizeCodeResult(codeData);
+      setCodeResult(normalizedResult);
+      const finalExperimentDir = normalizedResult?.experiment_dir;
      // Load generated files
      setOperationStatus('Loading generated files...');
-      await loadGeneratedFiles(finalExperimentDir);
+      if (finalExperimentDir) {
+        await loadGeneratedFiles(finalExperimentDir);
+      } else {
+        console.warn("Code generation succeeded but experiment directory is missing.");
+      }
      // Mark that code has been generated (this will show Code View tab)
      setHasGeneratedCode(true);
@@ -2634,7 +2978,7 @@
      setOperationStatus('Generating paper...');
      setShowLogs(true); // Show logs when starting paper generation
-      const paperResponse = await fetch('/api/write', {
+      const paperResponse = await fetch(`${apiBase}/write`, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
@@ -2642,8 +2986,10 @@
        credentials: 'include',
        body: JSON.stringify({
          idea: selectedNode.originalData,
+          idea_id: ideaId,
+          idea_name: ideaName,
          experiment_dir: null, // No experiment directory for non-experimental papers
-          s2_api_key: s2ApiKey.trim(),
+          s2_api_key: sanitizedS2Key || null,
        }),
      });
@@ -2703,10 +3049,33 @@
    setPdfComments(pdfComments.filter(c => c.id !== commentId));
  };
+  const buildRelativeApiUrl = (path) => {
+    if (!path) return path;
+    if (/^https?:\/\//i.test(path)) {
+      return path;
+    }
+    if (path.startsWith(apiBase)) {
+      return path;
+    }
+    if (path.startsWith('/demo/api/')) {
+      return path;
+    }
+    if (path.startsWith('/api/')) {
+      return `${apiBase}${path.slice(4)}`;
+    }
+    if (path.startsWith('/')) {
+      return path;
+    }
+    return `${apiBase}/${path.replace(/^\//, '')}`;
+  };
+
  const downloadPDF = async (pdfPath) => {
    try {
+      const requestUrl = buildRelativeApiUrl(pdfPath);
      // Fetch the PDF as a blob to force download
-      const response = await fetch(`http://localhost:5000${pdfPath}`);
+      const response = await fetch(requestUrl, {
+        credentials: 'include',
+      });
      if (!response.ok) {
        throw new Error(`Failed to download PDF: ${response.status}`);
@@ -2746,13 +3115,14 @@
    try {
      console.log('Starting paper review for:', pdfPath);
-      const response = await fetch('http://localhost:5000/api/review', {
+      const response = await fetch(`${apiBase}/review`, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
        },
+        credentials: 'include',
        body: JSON.stringify({
-          pdf_path: pdfPath
+          pdf_path: buildRelativeApiUrl(pdfPath)
        }),
      });
@@ -2776,7 +3146,14 @@
  // Enhanced code editing functions
  const switchCodeTab = (tabName) => {
-    // Save current content before switching
+    if (!tabName) {
+      return;
+    }
+
+    if (experimentFiles[tabName] === undefined) {
+      return;
+    }
+
    if (activeCodeTab && experimentFiles[activeCodeTab] !== undefined) {
      setExperimentFiles(prev => ({
        ...prev,
@@ -2784,9 +3161,9 @@
      }));
    }
-    // Switch to new tab
+    const fileItem = experimentFileList.find((item) => item.path === tabName);
    setActiveCodeTab(tabName);
-    setCodeFileName(tabName);
+    setCodeFileName(fileItem?.tabLabel || tabName);
    setCodeContent(experimentFiles[tabName] || '');
  };
@@ -2894,14 +3271,14 @@
          {/* Download All Button */}
-
-
-
-            {consoleOutput || 'No output yet...'}
-
-          {proceedError && (
@@ -3198,7 +3603,7 @@ const TreePlotVisualization = () => {
          )}
-          {!codeResult && !isGeneratingCode && !proceedError && Object.keys(experimentFiles).length === 0 && (
+          {!codeResult && !isGeneratingCode && !proceedError && !hasExperimentFiles && (
{