From cccf3b6812845e10d4df918583c28a77ce361b69 Mon Sep 17 00:00:00 2001 From: Baudbot Date: Mon, 23 Feb 2026 23:18:02 -0500 Subject: [PATCH 1/2] =?UTF-8?q?fix:=20graceful=20bridge=20restarts=20?= =?UTF-8?q?=E2=80=94=20SIGTERM=20handling=20+=20EADDRINUSE=20retry?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three layers of defense against port conflicts during bridge restarts: 1. broker-bridge.mjs: SIGTERM/SIGINT handler closes the HTTP server cleanly before exiting, so the listening socket is closed and the port is released before the process exits. 2. broker-bridge.mjs: EADDRINUSE retry at a fixed interval (5 attempts, 2s apart) so if the port IS briefly held, the bridge waits instead of crashing. 3. startup-cleanup.sh: Kill the tmux restart loop FIRST (prevents respawning), then SIGTERM the port holder and wait up to 3s for graceful exit before falling back to SIGKILL. The restart loop also checks port availability before each relaunch. --- pi/skills/control-agent/startup-cleanup.sh | 34 +++++++++++-- slack-bridge/broker-bridge.mjs | 55 +++++++++++++++++++++- 2 files changed, 83 insertions(+), 6 deletions(-) diff --git a/pi/skills/control-agent/startup-cleanup.sh b/pi/skills/control-agent/startup-cleanup.sh index 98bd5a2..e6383e2 100755 --- a/pi/skills/control-agent/startup-cleanup.sh +++ b/pi/skills/control-agent/startup-cleanup.sh @@ -87,17 +87,35 @@ mkdir -p "$BRIDGE_LOG_DIR" # --- Kill anything holding port 7890, any existing bridge tmux session, # and any leftover old-style PID-file supervisor. echo "Cleaning up old bridge..." + +# Kill the tmux session first — this stops the restart loop from respawning +# the bridge while we're trying to clean up the port. +tmux kill-session -t "$BRIDGE_TMUX_SESSION" 2>/dev/null || true + +# Now gracefully stop any process on the port. SIGTERM lets the bridge close +# the HTTP server and release the port cleanly; SIGKILL is the fallback. 
PORT_PIDS=$(lsof -ti :7890 2>/dev/null || true) if [ -n "$PORT_PIDS" ]; then - echo "Killing processes on port 7890: $PORT_PIDS" - echo "$PORT_PIDS" | xargs kill -9 2>/dev/null || true - sleep 1 + echo "Stopping processes on port 7890 (SIGTERM): $PORT_PIDS" + echo "$PORT_PIDS" | xargs kill 2>/dev/null || true + # Wait up to 3s for graceful shutdown + for i in 1 2 3; do + sleep 1 + PORT_PIDS=$(lsof -ti :7890 2>/dev/null || true) + [ -z "$PORT_PIDS" ] && break + done + # Force-kill anything that didn't exit + if [ -n "$PORT_PIDS" ]; then + echo "Force-killing stubborn processes: $PORT_PIDS" + echo "$PORT_PIDS" | xargs kill -9 2>/dev/null || true + sleep 1 + fi fi -tmux kill-session -t "$BRIDGE_TMUX_SESSION" 2>/dev/null || true + OLD_PID_FILE="$HOME/.pi/agent/slack-bridge.pid" if [ -f "$OLD_PID_FILE" ]; then OLD_PID="$(cat "$OLD_PID_FILE" 2>/dev/null || true)" - [ -n "$OLD_PID" ] && kill -9 "$OLD_PID" 2>/dev/null || true + [ -n "$OLD_PID" ] && kill "$OLD_PID" 2>/dev/null || true rm -f "$OLD_PID_FILE" fi @@ -151,6 +169,12 @@ tmux new-session -d -s "$BRIDGE_TMUX_SESSION" "\ exit_code=\$?; \ echo \"[\$(date -Is)] bridge: exited with code \$exit_code, restarting in 5s\" >> $BRIDGE_LOG_FILE; \ sleep 5; \ + tries=0; \ + while lsof -ti :7890 >/dev/null 2>&1 && [ \$tries -lt 10 ]; do \ + echo \"[\$(date -Is)] bridge: port 7890 still in use, waiting...\" >> $BRIDGE_LOG_FILE; \ + sleep 2; \ + tries=\$((tries + 1)); \ + done; \ done" echo "Bridge tmux session: $BRIDGE_TMUX_SESSION" diff --git a/slack-bridge/broker-bridge.mjs b/slack-bridge/broker-bridge.mjs index 9bc00a6..bd6c893 100755 --- a/slack-bridge/broker-bridge.mjs +++ b/slack-bridge/broker-bridge.mjs @@ -885,6 +885,36 @@ function getLogLinesForResponse(url) { return lines; } +/** Reference to the HTTP server so we can close it on shutdown. */ +let apiServer = null; +let shuttingDown = false; + +/** + * Graceful shutdown: close the HTTP server (releases the port), then exit. 
+ * Called on SIGTERM/SIGINT so restarts don't fight over the port. + */ +function gracefulShutdown(signal) { + if (shuttingDown) return; + shuttingDown = true; + logInfo(`🛑 received ${signal} — shutting down gracefully`); + if (apiServer) { + apiServer.close(() => { + logInfo("🛑 HTTP server closed, exiting"); + process.exit(0); + }); + // Force exit after 5s if connections don't drain + setTimeout(() => { + logWarn("🛑 forceful exit after 5s timeout"); + process.exit(1); + }, 5000).unref(); + } else { + process.exit(0); + } +} + +process.on("SIGTERM", () => gracefulShutdown("SIGTERM")); +process.on("SIGINT", () => gracefulShutdown("SIGINT")); + function startApiServer() { const server = createServer(async (req, res) => { const url = new URL(req.url, `http://localhost:${API_PORT}`); @@ -1024,9 +1054,32 @@ function startApiServer() { } }); - server.listen(API_PORT, "127.0.0.1", () => { + // Retry with backoff if the port is still held by a dying predecessor. + const MAX_BIND_RETRIES = 5; + const BIND_RETRY_DELAY_MS = 2000; + let bindAttempt = 0; + + function tryListen() { + bindAttempt++; + server.listen(API_PORT, "127.0.0.1"); + } + + server.on("listening", () => { + apiServer = server; logInfo(`📡 Outbound API listening on http://127.0.0.1:${API_PORT}`); }); + + server.on("error", (err) => { + if (err.code === "EADDRINUSE" && bindAttempt < MAX_BIND_RETRIES) { + logWarn(`⚠️ port ${API_PORT} in use, retrying in ${BIND_RETRY_DELAY_MS}ms (attempt ${bindAttempt}/${MAX_BIND_RETRIES})`); + setTimeout(tryListen, BIND_RETRY_DELAY_MS); + } else { + logError(`❌ HTTP server error: ${err.message}`); + process.exit(1); + } + }); + + tryListen(); } async function startPollLoop() { From 50b76c385521bc81bafdc4ccdb33d3e1df15126e Mon Sep 17 00:00:00 2001 From: Baudbot Date: Tue, 24 Feb 2026 07:46:56 -0500 Subject: [PATCH 2/2] fix: call server.close() before EADDRINUSE retry per Node.js docs --- slack-bridge/broker-bridge.mjs | 1 + 1 file changed, 1 insertion(+) diff --git 
a/slack-bridge/broker-bridge.mjs b/slack-bridge/broker-bridge.mjs index bd6c893..2b42e38 100755 --- a/slack-bridge/broker-bridge.mjs +++ b/slack-bridge/broker-bridge.mjs @@ -1072,6 +1072,7 @@ function startApiServer() { server.on("error", (err) => { if (err.code === "EADDRINUSE" && bindAttempt < MAX_BIND_RETRIES) { logWarn(`⚠️ port ${API_PORT} in use, retrying in ${BIND_RETRY_DELAY_MS}ms (attempt ${bindAttempt}/${MAX_BIND_RETRIES})`); + server.close(); setTimeout(tryListen, BIND_RETRY_DELAY_MS); } else { logError(`❌ HTTP server error: ${err.message}`);