diff --git a/pi/skills/control-agent/startup-cleanup.sh b/pi/skills/control-agent/startup-cleanup.sh
index 98bd5a2..e6383e2 100755
--- a/pi/skills/control-agent/startup-cleanup.sh
+++ b/pi/skills/control-agent/startup-cleanup.sh
@@ -87,17 +87,36 @@ mkdir -p "$BRIDGE_LOG_DIR"
 # --- Kill anything holding port 7890, any existing bridge tmux session,
 # and any leftover old-style PID-file supervisor.
 echo "Cleaning up old bridge..."
+
+# Kill the tmux session first — this stops the restart loop from respawning
+# the bridge while we're trying to clean up the port.
+tmux kill-session -t "$BRIDGE_TMUX_SESSION" 2>/dev/null || true
+
+# Now gracefully stop any process on the port. SIGTERM lets the bridge close
+# the HTTP server and release the port cleanly; SIGKILL is the fallback.
+# Match only LISTEN sockets so clients connected to the port are not killed.
-PORT_PIDS=$(lsof -ti :7890 2>/dev/null || true)
+PORT_PIDS=$(lsof -ti tcp:7890 -sTCP:LISTEN 2>/dev/null || true)
 if [ -n "$PORT_PIDS" ]; then
-  echo "Killing processes on port 7890: $PORT_PIDS"
-  echo "$PORT_PIDS" | xargs kill -9 2>/dev/null || true
-  sleep 1
+  echo "Stopping processes on port 7890 (SIGTERM): $PORT_PIDS"
+  echo "$PORT_PIDS" | xargs kill 2>/dev/null || true
+  # Wait up to 3s for graceful shutdown
+  for i in 1 2 3; do
+    sleep 1
+    PORT_PIDS=$(lsof -ti tcp:7890 -sTCP:LISTEN 2>/dev/null || true)
+    [ -z "$PORT_PIDS" ] && break
+  done
+  # Force-kill anything that didn't exit
+  if [ -n "$PORT_PIDS" ]; then
+    echo "Force-killing stubborn processes: $PORT_PIDS"
+    echo "$PORT_PIDS" | xargs kill -9 2>/dev/null || true
+    sleep 1
+  fi
 fi
-tmux kill-session -t "$BRIDGE_TMUX_SESSION" 2>/dev/null || true
+
 OLD_PID_FILE="$HOME/.pi/agent/slack-bridge.pid"
 if [ -f "$OLD_PID_FILE" ]; then
   OLD_PID="$(cat "$OLD_PID_FILE" 2>/dev/null || true)"
-  [ -n "$OLD_PID" ] && kill -9 "$OLD_PID" 2>/dev/null || true
+  [ -n "$OLD_PID" ] && kill "$OLD_PID" 2>/dev/null || true
   rm -f "$OLD_PID_FILE"
 fi
 
@@ -151,6 +170,12 @@ tmux new-session -d -s "$BRIDGE_TMUX_SESSION" "\
     exit_code=\$?; \
     echo \"[\$(date -Is)] bridge: exited with code \$exit_code, restarting in 5s\" >> $BRIDGE_LOG_FILE; \
     sleep 5; \
+    tries=0; \
+    while lsof -ti tcp:7890 -sTCP:LISTEN >/dev/null 2>&1 && [ \$tries -lt 10 ]; do \
+      echo \"[\$(date -Is)] bridge: port 7890 still in use, waiting...\" >> $BRIDGE_LOG_FILE; \
+      sleep 2; \
+      tries=\$((tries + 1)); \
+    done; \
   done"
 
 echo "Bridge tmux session: $BRIDGE_TMUX_SESSION"
diff --git a/slack-bridge/broker-bridge.mjs b/slack-bridge/broker-bridge.mjs
index 9bc00a6..2b42e38 100755
--- a/slack-bridge/broker-bridge.mjs
+++ b/slack-bridge/broker-bridge.mjs
@@ -885,6 +885,40 @@ function getLogLinesForResponse(url) {
   return lines;
 }
 
+/** Reference to the HTTP server so we can close it on shutdown. */
+let apiServer = null;
+let shuttingDown = false;
+
+/**
+ * Graceful shutdown: close the HTTP server (releases the port), then exit.
+ * Called on SIGTERM/SIGINT so restarts don't fight over the port.
+ * @param {string} signal - Signal name, used only in the log line.
+ */
+function gracefulShutdown(signal) {
+  if (shuttingDown) return;
+  shuttingDown = true;
+  logInfo(`🛑 received ${signal} — shutting down gracefully`);
+  if (apiServer) {
+    apiServer.close(() => {
+      logInfo("🛑 HTTP server closed, exiting");
+      process.exit(0);
+    });
+    // close() only stops new connections; drop idle keep-alive sockets too so
+    // the callback above is not held up by them (no-op where unsupported).
+    apiServer.closeIdleConnections?.();
+    // Force exit after 5s if connections don't drain
+    setTimeout(() => {
+      logWarn("🛑 forceful exit after 5s timeout");
+      process.exit(1);
+    }, 5000).unref();
+  } else {
+    process.exit(0);
+  }
+}
+
+process.on("SIGTERM", () => gracefulShutdown("SIGTERM"));
+process.on("SIGINT", () => gracefulShutdown("SIGINT"));
+
 function startApiServer() {
   const server = createServer(async (req, res) => {
     const url = new URL(req.url, `http://localhost:${API_PORT}`);
@@ -1024,9 +1058,33 @@ function startApiServer() {
     }
   });
 
-  server.listen(API_PORT, "127.0.0.1", () => {
+  // Retry at a fixed interval if the port is still held by a dying predecessor.
+  const MAX_BIND_RETRIES = 5;
+  const BIND_RETRY_DELAY_MS = 2000;
+  let bindAttempt = 0;
+
+  function tryListen() {
+    bindAttempt++;
+    server.listen(API_PORT, "127.0.0.1");
+  }
+
+  server.on("listening", () => {
+    apiServer = server;
     logInfo(`📡 Outbound API listening on http://127.0.0.1:${API_PORT}`);
   });
+
+  server.on("error", (err) => {
+    if (err.code === "EADDRINUSE" && bindAttempt < MAX_BIND_RETRIES) {
+      logWarn(`⚠️ port ${API_PORT} in use, retrying in ${BIND_RETRY_DELAY_MS}ms (attempt ${bindAttempt}/${MAX_BIND_RETRIES})`);
+      server.close();
+      setTimeout(tryListen, BIND_RETRY_DELAY_MS);
+    } else {
+      logError(`❌ HTTP server error: ${err.message}`);
+      process.exit(1);
+    }
+  });
+
+  tryListen();
 }
 
 async function startPollLoop() {